In [1]:
from sklearn.datasets import fetch_20newsgroups # newgroups dataset 
from sklearn.naive_bayes import MultinomialNB # for model 

# data processing 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# newsgroups to download 
newsgroup_names = ['comp.graphics', 'rec.sport.hockey', 'sci.electronics', 'sci.space']

# get data 
newsgroups = fetch_20newsgroups(categories=newsgroup_names, shuffle=True, random_state=265)
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# convert text into numbers that rep each word
word_vector = CountVectorizer()
word_vector_counts = word_vector.fit_transform(newsgroups.data)

# get frequency of each word 
term_freq_transformer = TfidfTransformer()
term_freq = term_freq_transformer.fit_transform(word_vector_counts)

In [4]:
model = MultinomialNB().fit(term_freq, newsgroups.target) # train model

In [5]:
# for fancy formatting, https://stackoverflow.com/questions/8924173/how-do-i-print-bold-text-in-python
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [6]:
# predict some fake test documents 
test_docs = [
    'That GPU has amazing performance with a lot of shaders',
    'The player had a wicked slap shot',
    'I spent all day yesterday soldering banks of capacitors',
    'Today I have to solder a bank of capacitors',
    'NASA has rovers on Mars']
test_counts = word_vector.transform(test_docs)
test_term_freq = term_freq_transformer.transform(test_counts)

predictions = model.predict(test_term_freq)
print(f'{color.BOLD}Predictions:{color.END}')
for doc, group in zip(test_docs, predictions):
    print(f'\t{doc} — {color.UNDERLINE}{newsgroups.target_names[group]}{color.END}')

probabilities = model.predict_proba(test_term_freq)
print(f'\n{color.BOLD}Probabilities:{color.END}')
print(''.join(['{:20}'.format(name) for name in newsgroups.target_names]))
for probs in probabilities:
    print(''.join(['{:<20.7}'.format(prob) for prob in probs]))

[1mPredictions:[0m
	That GPU has amazing performance with a lot of shaders — [4mcomp.graphics[0m
	The player had a wicked slap shot — [4mrec.sport.hockey[0m
	I spent all day yesterday soldering banks of capacitors — [4msci.space[0m
	Today I have to solder a bank of capacitors — [4msci.electronics[0m
	NASA has rovers on Mars — [4msci.space[0m

[1mProbabilities:[0m
comp.graphics       rec.sport.hockey    sci.electronics     sci.space           
0.2946615           0.2289515           0.2492634           0.2271236           
0.1294805           0.511557            0.1824871           0.1764754           
0.1860481           0.2411777           0.2754045           0.2973696           
0.2128509           0.210813            0.3486507           0.2276854           
0.07918563          0.06622591          0.1023662           0.7522222           


# Exercise Option #1 - Standard Difficulty

As seen below, words that had significant effects on the model's probabilities include GPU, player, capacitors, and NASA. 

In [7]:
new_test_docs = [
    'That had an amazing performance with a lot of shaders', # replaced 'GPU', should make a signifcant difference
    'They had a wicked slap shot', # replaced 'player', should make a significant difference
    'I spent all day yesterday soldering', # removed 'banks of capacitors', should make a significant difference
    'I had to solder a bank of capacitors', # removed 'today', shouldn't make a significant difference 
    'They have rovers on Mars'] # replaced 'NASA', should make a significant difference

new_test_counts = word_vector.transform(new_test_docs)
new_test_term_freq = term_freq_transformer.transform(new_test_counts)

new_predictions = model.predict(new_test_term_freq)
print(f'{color.BOLD}New Predictions:{color.END}')
for doc, group in zip(new_test_docs, new_predictions):
    print(f'\t{doc} — {color.UNDERLINE}{newsgroups.target_names[group]}{color.END}')
    
new_probabilities = model.predict_proba(new_test_term_freq)
print(f'\n{color.BOLD}New Probabilities:{color.END}')
print(''.join(['{:20}'.format(name) for name in newsgroups.target_names]))
for new_probs in new_probabilities:
    print(''.join(['{:<20.7}'.format(new_prob) for new_prob in new_probs]))

print(f'\n{color.BOLD}Probability Differences:{color.END}')
print(''.join(['{:20}'.format(name) for name in newsgroups.target_names]))
for i in range(len(new_probs)+1):
    diff_arr = []
    for old_prob, new_prob in zip(probabilities[i], new_probabilities[i]):
        diff_arr.append('{:<20.7}'.format(old_prob-new_prob))
    print(''.join(diff_arr))

confusing_test_doc = [
    'The hockey player was awarded the NASA rover on Mars, which is powered by several cutting-edge RTX graphics cards that have many shaders, and was soldered together by scientists'
]

confusing_test_counts = word_vector.transform(confusing_test_doc)
confusing_test_term_freq = term_freq_transformer.transform(confusing_test_counts)

confusing_test_probabilities = model.predict_proba(confusing_test_term_freq)
print(f'\n{color.BOLD}Probabilities for confusing document:{color.END}')
print(''.join(['{:20}'.format(name) for name in newsgroups.target_names]))
print(''.join(['{:<20.7}'.format(confusing_prob) for confusing_prob in confusing_test_probabilities[0]]))

[1mNew Predictions:[0m
	That had an amazing performance with a lot of shaders — [4mcomp.graphics[0m
	They had a wicked slap shot — [4mrec.sport.hockey[0m
	I spent all day yesterday soldering — [4msci.space[0m
	I had to solder a bank of capacitors — [4msci.electronics[0m
	They have rovers on Mars — [4msci.space[0m

[1mNew Probabilities:[0m
comp.graphics       rec.sport.hockey    sci.electronics     sci.space           
0.2650535           0.2396654           0.256186            0.2390952           
0.1564945           0.3906022           0.200399            0.2525043           
0.1933554           0.2764627           0.1880132           0.3421687           
0.2151679           0.2106334           0.3584022           0.2157965           
0.1024882           0.1580019           0.1478589           0.591651            

[1mProbability Differences:[0m
comp.graphics       rec.sport.hockey    sci.electronics     sci.space           
0.02960796          -0.01071386         -0.0

# Exercise Option #2 - Advanced Difficulty

The results of the word counts surprised me. Throughout all the categories, including comp.graphics, gpu was never mentioned. The counts for player and nasa aligned with what I predicted, although the count for capacitors in sci.electronics was lower than I thought it would end up being.

In [8]:
# citation: got help from Huxley 
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
category_graphics = ' '.join(fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), categories=['comp.graphics']).data)
category_hockey = ' '.join(fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), categories=['rec.sport.hockey']).data)
category_electronics = ' '.join(fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), categories=['sci.electronics']).data)
category_space = ' '.join(fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), categories=['sci.space']).data)
categories = [category_graphics, category_hockey, category_electronics, category_space]

In [9]:
print(category_graphics)

Does ANYONE out there in Net-land have any information on the Cobra 2.20 
card?  The sticker on the end of the card reads
        Model: Cobra 1-B-1
        Bios:  Cobra v2.20

I Havn't been able to find anything about it from anyone!  If you have 
any information on how to get a hold of the company which produces the 
card or know where any drivers are for it, PLEASE let me know!

As far as I can tell, it's a CGA card that is taking up 2 of my 16-bit 
ISA slots but when I enable the test patterns, it displays much more than 
the usualy 4 CGA colors... At least 16 from what I can count.. Thanks!

              .------------------------------------------.
              : Internet: jele@eis.calstate.edu          :
              :           bbs.mirage@gilligan.tsoft.net  :
              :           bbs.mirage@tsoft.sf-bay.org    :
              :           mirage@thetech.com             :
              : UUCP    : apple.com!tsoft!bbs.mirage     :
              `---------------------------

In [10]:
gpu_occurances = []
player_occurances = []
capacitors_occurances = []
nasa_occurances = []
# https://www.programiz.com/python-programming/methods/string/count
for category in categories: 
    gpu_occurances.append(category.lower().count(" gpu "))
    player_occurances.append(category.lower().count(" player "))
    capacitors_occurances.append(category.lower().count(" capacitors "))
    nasa_occurances.append(category.lower().count(" nasa "))

In [11]:
print(f'{color.BOLD}Counts:{color.END}')
print(''.join(['{:17}'.format(name) for name in ['word', *newsgroups.target_names]]))
print(''.join(['{:<17}'.format(gpu_occurance) for gpu_occurance in ['gpu', *gpu_occurances]]))
print(''.join(['{:<17}'.format(player_occurance) for player_occurance in ['player', *player_occurances]]))
print(''.join(['{:<17}'.format(capacitors_occurance) for capacitors_occurance in ['capacitors', *capacitors_occurances]]))
print(''.join(['{:<17}'.format(nasa_occurance) for nasa_occurance in ['nasa', *nasa_occurances]]))

[1mCounts:[0m
word             comp.graphics    rec.sport.hockey sci.electronics  sci.space        
gpu              0                0                0                0                
player           4                79               8                0                
capacitors       0                0                7                0                
nasa             8                0                0                188              
