# Demo 17 - Naive Bayes and Logistic Regressions

In [2]:
import nltk
import sklearn
import pandas as pd
import numpy as np

## Multiplying Probabilities

In [3]:
np.random.rand() * np.random.rand() * np.random.rand()

0.13311109186413153

In [5]:
np.random.rand()

0.9995835362899248

In [6]:
rand_number = np.random.rand() 
for i in range(10):
    rand_number *= np.random.rand() 
rand_number

0.00016670105472496005

In [7]:
rand_number = np.random.rand() 
for i in range(100):
    rand_number *= np.random.rand() 
rand_number

7.388162536859592e-46

In [8]:
rand_number = np.random.rand() 
for i in range(500):
    rand_number *= np.random.rand() 
rand_number

1.6866754839815355e-216

In [9]:
rand_number = np.random.rand() 
for i in range(600):
    rand_number *= np.random.rand() 
rand_number

8.616630815473531e-250

In [10]:
rand_number = np.random.rand() 
for i in range(1000):
    rand_number *= np.random.rand() 
rand_number

0.0

(back to slides)

#### Logs to the rescue

In [11]:
rand_number = np.log(np.random.rand())
for i in range(1000):
    rand_number += np.log(np.random.rand()) 
rand_number

-1005.6900271981366

## Sklearn

### Movie Reviews

In [12]:
moview_reviews = nltk.corpus.movie_reviews

In [17]:
len(moview_reviews.fileids())

2000

In [18]:
#neg_review, pos_review = [], []
review_files = [(file_id, file_id.startswith("pos")) for file_id in moview_reviews.fileids()]

len(review_files)

2000

In [19]:
review_files[:5]

[('neg/cv000_29416.txt', False),
 ('neg/cv001_19502.txt', False),
 ('neg/cv002_17424.txt', False),
 ('neg/cv003_12683.txt', False),
 ('neg/cv004_12641.txt', False)]

In [20]:
df = pd.DataFrame(review_files)
df

Unnamed: 0,0,1
0,neg/cv000_29416.txt,False
1,neg/cv001_19502.txt,False
2,neg/cv002_17424.txt,False
3,neg/cv003_12683.txt,False
4,neg/cv004_12641.txt,False
...,...,...
1995,pos/cv995_21821.txt,True
1996,pos/cv996_11592.txt,True
1997,pos/cv997_5046.txt,True
1998,pos/cv998_14111.txt,True


In [21]:
df = df.rename(columns={0: "file_name", 1: "gold-label"})
df

Unnamed: 0,file_name,gold-label
0,neg/cv000_29416.txt,False
1,neg/cv001_19502.txt,False
2,neg/cv002_17424.txt,False
3,neg/cv003_12683.txt,False
4,neg/cv004_12641.txt,False
...,...,...
1995,pos/cv995_21821.txt,True
1996,pos/cv996_11592.txt,True
1997,pos/cv997_5046.txt,True
1998,pos/cv998_14111.txt,True


In [22]:
def read_mov_review(f_name):
    return moview_reviews.open(f_name).read()

df['review_text'] = df['file_name'].apply(read_mov_review)

In [23]:
df['review_text']

0       plot : two teen couples go to a church party ,...
1       the happy bastard's quick movie review \ndamn ...
2       it is movies like these that make a jaded movi...
3        " quest for camelot " is warner bros . ' firs...
4       synopsis : a mentally unstable man undergoing ...
                              ...                        
1995    wow ! what a movie . \nit's everything a movie...
1996    richard gere can be a commanding actor , but h...
1997    glory--starring matthew broderick , denzel was...
1998    steven spielberg's second epic film on world w...
1999    truman ( " true-man " ) burbank is the perfect...
Name: review_text, Length: 2000, dtype: object

In [27]:
df[df['review_text'].map(lambda x: len(x) < 100)]['review_text'].iloc[0]

"this film is extraordinarily horrendous and i'm not going to waste any more words on it . \n"

In [28]:
df[df['review_text'].map(lambda x: len(x) < 100)]['gold-label'].iloc[0]

False

#### Make train/dev/test split

In [29]:
df = df.sample(df.shape[0])
df.head(5)

Unnamed: 0,file_name,gold-label,review_text
918,neg/cv918_27080.txt,False,"you know something , christmas is not about pr..."
411,neg/cv411_16799.txt,False,"confucius once said , "" governing a nation is ..."
765,neg/cv765_20429.txt,False,it seems that i've stopped enjoying movies tha...
208,neg/cv208_9475.txt,False,"one would think that david duchovny , star of ..."
1726,pos/cv726_4719.txt,True,copyright 1996 graeme huggan carry on henry is...


In [30]:
df.shape[0]

2000

Let's make 80:10:10 split of our data

In [31]:
train_max_idx = int(df.shape[0] * .8)
dev_max_idx = int((df.shape[0] * .1) + train_max_idx)


train_max_idx, dev_max_idx

(1600, 1800)

In [32]:
train_df = df.iloc[:train_max_idx]
dev_df = df.iloc[train_max_idx:dev_max_idx]
test_df = df.iloc[dev_max_idx:]

train_df.shape, dev_df.shape, test_df.shape

((1600, 3), (200, 3), (200, 3))

We will train models on `train_df`, change different hyper-parameters based on `dev_df`, and then evaluate the model on `test_df` once we are done changing our different hyperparameters.

### Naive Bayes

In [33]:
from sklearn.naive_bayes import MultinomialNB

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [34]:
nb_model = MultinomialNB()
nb_model

MultinomialNB()

In [35]:
nb_model.fit(train_df['review_text'], train_df['gold-label'])

ValueError: could not convert string to float: 'you know something , christmas is not about presents . \nit\'s about over-hyped holiday films with lots of merchandising and product tie-ins . \nat least that would seem to be the message of " the grinch , " which has been advertised since last christmas and whose logo is currently plastered all over stores . \nhollywood expects us to ignore this cynical greed as the movie scolds us about losing the true spirit of the season . \nyou know the plot : there\'s this evil furry green guy called the grinch ( jim carrey ) who lives on a mountain overlooking whoville . \ndown below all the whos are preparing for their whobilation , but the grinch is determined to steal their christmas . \nthe movie is , of course , a live-action version of the beloved children\'s book , which was previously adapted into a 1966 tv special by looney tunes animator chuck jones . \nit\'s rare that a big budget hollywood release is shamed by a thirty-year-old half-hour cartoon , but that\'s the case when jones\' version is compared to ron howard\'s . \nthe tv grinch hit all the right notes : boris karloff\'s soft , deep narration ; thurl ravenscroft singing " mr . grinch " ; max the dog weighed down by the gigantic antler tied to his head ; and the grinch\'s wide , toothless grin . \nby contrast , the movie hits one sour note after another . \nfirst , there are the numerous bad choices that jeffrey price and peter seaman made in padding out the short book into a 105-minute movie . \nyoung wide-eyed cindy lou who ( taylor momsen ) is depressed about the misplaced priorities of her parents ( bill irwin and molly shannon ) during the holiday season . \nshe begins to sympathize with the grinch , who turns out to be surprisingly sympathetic . \ncindy lou discovers that the grinch turned tearfully away from whoville in grammar school when he was publicly humiliated while expressing his love for the prettiest girl in the class , martha may whovier ( played as an adult by christine baranski ) . \nare we expected to like the grinch , hate the whos , and want him to steal christmas ? \nthe grinch isn\'t even the villain here ; that role is filled by the corrupt mayor of whoville ( jeffrey tambor ) who was the grinch\'s rival for martha\'s affection . \nnot only are the characters needlessly complex , but the once-simple plot becomes so convoluted that the actual theft of christmas seems like an afterthought . \nthe casting choices aren\'t any better than the screenwriting decisions . \njim carrey seems woefully miscast . \nwhile his face is so supple that the rubber make-up seems superfluous , carrey brings nothing else to the role . \nhis accent keeps changing ; i assume he was shooting for karloff , but he ends up sounding like a weird slurry of richard nixon , sean connery , and cartman from " south park . " \nnot knowing what else to do during his many scenes alone in the grinch\'s home , carrey falls back on his stand-up comedy and clowns around ace ventura-style . \nneedless to say , his wise-cracking antics don\'t quite fit the character of the grinch , who is an embittered loner filled with hate . \nlittle taylor momsen brings little to role of cindy lou except big eyes and a cute smile . \nshe would have been fine if cindy lou was limited to her original purpose in the seuss story ( finding " santa " in her living room stealing the tree ) . \nhowever , the expanded script makes cindy as important as the grinch , and momsen is not up to the challenge . \nbottom line : when you\'re stealing christmas ( movies ) , leave this one behind . \n'

**Question:** What happened?

Solution: We need to convert the text to numbers, actually features

(back to slides)

#### Feature Engineering

We need to convert each document into features. 

**Question:** What should our features be?

*Hint*: What's the most basic approach we've taken so far in this class?

<details>
<summary>Solution</summary>
    Bag of words
</details>

In [None]:
# skip next bunch of empty cells, answer below

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [54]:
vectorizer.fit_transform(train_df['review_text'])

<1600x36174 sparse matrix of type '<class 'numpy.int64'>'
	with 531505 stored elements in Compressed Sparse Row format>

**Question:** What do these numbers that were printed out mean?

<details>
<summary>Solution</summary>
Number of examples, size of vocabulary
</details>

In [55]:
X = vectorizer.fit_transform(train_df['review_text'])
X

<1600x36174 sparse matrix of type '<class 'numpy.int64'>'
	with 531505 stored elements in Compressed Sparse Row format>

Let's look at the contenxtual help to print out what X is.

**Question:** What do we think *(0, 8284) <tab>  2* in *X* means?
    
<details>
<summary>Solution</summary>
    The value of the 8284 feature for the 0-th example. In our setting this means how many times that word appeared in the first document

        
</details>

In [40]:
train_df['review_text'].iloc[0]

'you know something , christmas is not about presents . \nit\'s about over-hyped holiday films with lots of merchandising and product tie-ins . \nat least that would seem to be the message of " the grinch , " which has been advertised since last christmas and whose logo is currently plastered all over stores . \nhollywood expects us to ignore this cynical greed as the movie scolds us about losing the true spirit of the season . \nyou know the plot : there\'s this evil furry green guy called the grinch ( jim carrey ) who lives on a mountain overlooking whoville . \ndown below all the whos are preparing for their whobilation , but the grinch is determined to steal their christmas . \nthe movie is , of course , a live-action version of the beloved children\'s book , which was previously adapted into a 1966 tv special by looney tunes animator chuck jones . \nit\'s rare that a big budget hollywood release is shamed by a thirty-year-old half-hour cartoon , but that\'s the case when jones\' v

(0, 13536)	3

In [56]:
vectorizer.get_feature_names()[36008]

'you'

In [57]:
train_df['review_text'].iloc[0].count('you')

4

In [50]:
X

<1600x13578 sparse matrix of type '<class 'numpy.int64'>'
	with 498014 stored elements in Compressed Sparse Row format>

In [51]:
vectorizer.get_feature_names()[6826]

'know'

In [52]:
train_df['review_text'].iloc[0].count('know')

3

In [None]:
vectorizer.get_feature_names()[8284]

In [None]:
train_df['review_text'].iloc[0].count('notoriety')

#### Train the model

In [58]:
nb_model.fit(X, train_df['gold-label'])

MultinomialNB()

#### How well does the model perform on the training set?

In [59]:
nb_model.score(X, train_df['gold-label'])

0.975625

If the model just guessed True or False:

In [60]:
train_df['gold-label'].value_counts(normalize=True)

False    0.501875
True     0.498125
Name: gold-label, dtype: float64

#### Evaluate the model on dev

In [61]:
nb_model.predict(dev_df['review_text'])

ValueError: could not convert string to float: 'after seeing blaze and driving miss daisy , i was ready for some mindless fun--oh , maybe something like tango & cash . \nmaybe not ! \nmindless fun is one thing , but brain-dead slop is another matter altogether . \ntango & cash has " lowest common denominator " written all over it . \nthe movie stars sylvester stallone and kurt russell as rival undercover cops in l . a . russell plays cash , a reckless slob who dresses in jeans and tee shirts . \nstallone plays tango , a wealthy investor who works on the force strictly for the thrill of it ; he doesn\'t need the money . \nin an effort to change his image , stallone goes for a yuppie , gq look in the film , wearing spectacles and three piece suits . \nthe two detectives reluctantly become partners after they are framed for murder and have to break out of prison to clear their names . \ntango & cash is unbearably noisy . \nfor starters , there\'s harold faltermeyer\'s annoying synthesized score , which gets old after four notes . \nto make matters worse , the filmmakers seem to think that when it comes to loud explosions and screeching cars , the more the merrier . \nin fact , the movie begins with not one but two car chases . \nthere\'s nothing like a good old car chase to introduce the characters in a movie . \nscreenwriter randy feldman\'s brain must have gone to mush from watching too many cop shows on tv . \nhis shockingly stupid screenplay undermines anything and everything the movie has going for it , such as stallone\'s efforts to convince you that tango is an intellectual . \nin a movie with the iq level of an amoeba , even a great actor is going to have trouble looking intelligent--and stallone is no lawrence olivier . \nit\'s hard to imagine anyone reading feldman\'s script and thinking , " i want to be in this movie . " \nthe film\'s plot doesn\'t have one original bone in its body , and--again--you have to point your finger at the screenwriting . \nfeldman\'s story line succumbs to every crime thriller cliche in the book , making tango & cash altogether generic and predictable . \nthey simply could have called it " action movie . " \nevery character , every twist and turn , is stolen from television or from other movies . \nadding insult to injury , tango & cash is about as believable as a " road runner " cartoon . \naction movies don\'t have to be realistic , but they should absorb you enough so that you\'re not thinking about the lack of realism . \nthe only artistic aspect of tango & cash is the cinematography . \nthere are some spectacular shots , especially during the rainy nighttime prison break in which tango and cash slide to safety on electrical wires . \nthe film\'s main draw is the chemistry between stallone and russell . \nunfortunately , their relationship rarely progresses past macho competition as they endlessly bicker about who packs more meat in his pants . \nthe dialogue consists of nothing but one-liners , and consequently the attempts at character development are embarrassing . \nit\'s a shame because with a workable screenplay , russell and stallone could have turned tango & cash into a charming " lethal weaponesque " adventure . \ntango & cash tries to maintain a light tone , and you do laugh about once every five minutes . \nseeing russell in drag is the movie\'s funniest moment , but you probably already have seen it in the commercials . \nfurthermore , the light tone does not sit well against the relentlessly brutal violence . \nmachine guns and torture generally don\'t mix well with comedy . \njack palance appears in tango & cash doing what he does best : playing a sleazy , conniving villain . \nhis character , however , is run-of-the-mill , except for his strange obsession with mice . \nlike many movie villains , palance likes to play games . \nin fact , he sets a ridiculously elaborate trap for tango and cash , a trap which sends the two detectives to prison so they can be beaten and electrocuted by some meanies in the boiler room . \nyou have to wonder why palance doesn\'t just shoot the detectives in the head ! \nthe story would crumble if any of the characters were to do anything intelligent . \nteri hatcher plays stallone\'s sister and russell\'s love-interest , and she is just as pretty as can be . \nbut regrettably , hatcher\'s acting is not on par with her exceptional beauty . \nevery time she opens her mouth , you cringe ; corny dialogue and atrocious acting are always a fatal combination . \nthere\'s really very little , if anything , to recommend in the film . \nand , more to the point , tango definitely isn\'t worth a penny of your cash--so don\'t bother . \n'

In [None]:
vectorizer.fit_transform

In [62]:
X_dev = vectorizer.transform(dev_df['review_text'])
X_dev

<200x36174 sparse matrix of type '<class 'numpy.int64'>'
	with 65181 stored elements in Compressed Sparse Row format>

In [65]:
sum(X_dev.toarray()[0])

708

In [70]:
nb_model.predict(X_dev)

array([False,  True, False,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True, False,  True, False, False,
       False, False,  True,  True, False,  True,  True,  True,  True,
       False, False, False, False, False, False,  True, False, False,
        True,  True, False,  True,  True,  True,  True, False, False,
       False, False,  True, False, False, False, False,  True, False,
        True,  True, False,  True, False, False,  True, False,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
        True, False, False,  True, False,  True, False, False,  True,
        True, False, False, False,  True,  True, False,  True,  True,
       False, False,  True, False,  True,  True,  True,  True, False,
       False, False,  True, False,  True,  True, False,  True,  True,
        True, False,  True, False, False,  True, False, False, False,
        True, False, False, False, False, False, False, False,  True,
        True, False,

**Let's store the predictions**

In [71]:
dev_df = dev_df.assign(nb_predictions = nb_model.predict(X_dev))
dev_df.keys()

Index(['file_name', 'gold-label', 'review_text', 'nb_predictions'], dtype='object')

**Let's compute accuracy**

In [72]:
dev_df['nb_predictions'] == dev_df['gold-label']

781      True
1457     True
180      True
1262     True
1249     True
        ...  
105      True
759      True
1070     True
279      True
223     False
Length: 200, dtype: bool

In [73]:
sum(dev_df['nb_predictions'] == dev_df['gold-label']) / dev_df.shape[0]

0.805

In [74]:
nb_model.score(X_dev, dev_df['gold-label'])

0.805

#### Interpreting the model

In computational text analysis, we don't necessarily care just about building a classifier that does well. We want to use the classifier to gain insight about our text.

**Question:** From the Naive Bayes equation, what do you think can give us the most insight about our text?

In [None]:
nb_model.feature_log_prob_.shape

In [None]:
nb_model.feature_log_prob_[0]

In [None]:
nb_model.feature_log_prob_[0].argmax()

In [None]:
vectorizer.get_feature_names()[nb_model.feature_log_prob_[0].argmax()]

In [None]:
nb_model.feature_log_prob_[0].max(), nb_model.feature_log_prob_[0][nb_model.feature_log_prob_[0].argmax()]

In [None]:
# code from https://stackoverflow.com/questions/50526898/how-to-get-feature-importance-in-naive-bayes

neg_class_prob_sorted = nb_model.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_model.feature_log_prob_[1, :].argsort()[::-1]

print(np.take(vectorizer.get_feature_names(), neg_class_prob_sorted[:10]))
print(np.take(vectorizer.get_feature_names(), pos_class_prob_sorted[:10]))

The above is telling us what is the probability of a word given a label.

In [None]:
neg_class_prob_sorted = nb_model.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_model.feature_log_prob_[1, :].argsort()[::-1]

top_100_negative = set(np.take(vectorizer.get_feature_names(), neg_class_prob_sorted[:200]))
top_100_positive = set(np.take(vectorizer.get_feature_names(), pos_class_prob_sorted[:200]))

In [None]:
top_100_negative - top_100_positive

In [None]:
top_100_positive - top_100_negative

#### Fine-tuning hyper-parameters

In [None]:
nb_model.alpha = 5
nb_model.alpha

In [None]:
nb_model = MultinomialNB(alpha=5)

In [None]:
X_train

In [None]:
nb_model.fit(X, train_df['gold-label'])

In [None]:
nb_model.score(X, train_df['gold-label'])

In [None]:
nb_model.predict(X_dev)

In [None]:
nb_model.score(X_dev, dev_df['gold-label'])

**How does this compare to the result on dev before?**

This was an example of hyper-parameter tuning

(back to slides)
### Logistic Regression

In [75]:
lr_model = sklearn.linear_model.LogisticRegression(max_iter = 1e4)
lr_model

LogisticRegression(max_iter=10000.0)

In [None]:
lr_model.fit()

In [76]:
lr_model.fit(df['review_text'], df['gold-label'])

ValueError: could not convert string to float: 'you know something , christmas is not about presents . \nit\'s about over-hyped holiday films with lots of merchandising and product tie-ins . \nat least that would seem to be the message of " the grinch , " which has been advertised since last christmas and whose logo is currently plastered all over stores . \nhollywood expects us to ignore this cynical greed as the movie scolds us about losing the true spirit of the season . \nyou know the plot : there\'s this evil furry green guy called the grinch ( jim carrey ) who lives on a mountain overlooking whoville . \ndown below all the whos are preparing for their whobilation , but the grinch is determined to steal their christmas . \nthe movie is , of course , a live-action version of the beloved children\'s book , which was previously adapted into a 1966 tv special by looney tunes animator chuck jones . \nit\'s rare that a big budget hollywood release is shamed by a thirty-year-old half-hour cartoon , but that\'s the case when jones\' version is compared to ron howard\'s . \nthe tv grinch hit all the right notes : boris karloff\'s soft , deep narration ; thurl ravenscroft singing " mr . grinch " ; max the dog weighed down by the gigantic antler tied to his head ; and the grinch\'s wide , toothless grin . \nby contrast , the movie hits one sour note after another . \nfirst , there are the numerous bad choices that jeffrey price and peter seaman made in padding out the short book into a 105-minute movie . \nyoung wide-eyed cindy lou who ( taylor momsen ) is depressed about the misplaced priorities of her parents ( bill irwin and molly shannon ) during the holiday season . \nshe begins to sympathize with the grinch , who turns out to be surprisingly sympathetic . \ncindy lou discovers that the grinch turned tearfully away from whoville in grammar school when he was publicly humiliated while expressing his love for the prettiest girl in the class , martha may whovier ( played as an adult by christine baranski ) . \nare we expected to like the grinch , hate the whos , and want him to steal christmas ? \nthe grinch isn\'t even the villain here ; that role is filled by the corrupt mayor of whoville ( jeffrey tambor ) who was the grinch\'s rival for martha\'s affection . \nnot only are the characters needlessly complex , but the once-simple plot becomes so convoluted that the actual theft of christmas seems like an afterthought . \nthe casting choices aren\'t any better than the screenwriting decisions . \njim carrey seems woefully miscast . \nwhile his face is so supple that the rubber make-up seems superfluous , carrey brings nothing else to the role . \nhis accent keeps changing ; i assume he was shooting for karloff , but he ends up sounding like a weird slurry of richard nixon , sean connery , and cartman from " south park . " \nnot knowing what else to do during his many scenes alone in the grinch\'s home , carrey falls back on his stand-up comedy and clowns around ace ventura-style . \nneedless to say , his wise-cracking antics don\'t quite fit the character of the grinch , who is an embittered loner filled with hate . \nlittle taylor momsen brings little to role of cindy lou except big eyes and a cute smile . \nshe would have been fine if cindy lou was limited to her original purpose in the seuss story ( finding " santa " in her living room stealing the tree ) . \nhowever , the expanded script makes cindy as important as the grinch , and momsen is not up to the challenge . \nbottom line : when you\'re stealing christmas ( movies ) , leave this one behind . \n'

**Question:** What happened?


<details>
<summary>Solution</summary>
We need to convert the text to features
</details>


In [None]:
df['review_text']

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=4)

In [78]:
X = vectorizer.fit_transform(df['review_text'])
X

<2000x15452 sparse matrix of type '<class 'numpy.int64'>'
	with 630855 stored elements in Compressed Sparse Row format>

**Question:** What is 2000 and what is 15452?

In [79]:
lr_model.fit(X, df['gold-label'])

LogisticRegression(max_iter=10000.0)

In [80]:
df['prediction'] = lr_model.predict(X)
df['prediction']

918     False
411     False
765     False
208     False
1726     True
        ...  
1216     True
209     False
38      False
184     False
648     False
Name: prediction, Length: 2000, dtype: bool

In [81]:
sum(df['prediction'] == df['gold-label'])

2000

#### Finding coefficients

In [82]:
lr_model.coef_

array([[ 0.0146596 , -0.02227432,  0.03103228, ...,  0.16988816,
        -0.0455234 , -0.032439  ]])

In [83]:
lr_model.coef_.shape

(1, 15452)

In [None]:
len(vectorizer.get_feature_names())

In [84]:
lr_model.coef_.argmax()

5673

In [86]:
lr_model.coef_[0][lr_model.coef_.argmax()]

0.5842190320719329

In [88]:
vectorizer.get_feature_names()[lr_model.coef_.argmax()]

'fun'

In [89]:
lr_model.coef_.argmin()

1134

In [90]:
lr_model.coef_[0][lr_model.coef_.argmin()]

-0.8156166518651667

In [91]:
vectorizer.get_feature_names()[lr_model.coef_.argmin()]

'bad'

#### Prediction

In [92]:
df

Unnamed: 0,file_name,gold-label,review_text,prediction
918,neg/cv918_27080.txt,False,"you know something , christmas is not about pr...",False
411,neg/cv411_16799.txt,False,"confucius once said , "" governing a nation is ...",False
765,neg/cv765_20429.txt,False,it seems that i've stopped enjoying movies tha...,False
208,neg/cv208_9475.txt,False,"one would think that david duchovny , star of ...",False
1726,pos/cv726_4719.txt,True,copyright 1996 graeme huggan carry on henry is...,True
...,...,...,...,...
1216,pos/cv216_18738.txt,True,bruce barth's mellow piano plays in the backgr...,True
209,neg/cv209_28973.txt,False,""" i would appreciate it if you didn't do that...",False
38,neg/cv038_9781.txt,False,"capsule : where are you tonight , leni rienfen...",False
184,neg/cv184_26935.txt,False,it's always a bad sign when the core audience ...,False


In [93]:
df.sample(df.shape[0])

Unnamed: 0,file_name,gold-label,review_text,prediction
164,neg/cv164_23451.txt,False,ever watch a very young child try and tell a j...,False
1760,pos/cv760_8597.txt,True,"the "" italian hitchcock "" and acknowledged mas...",True
439,neg/cv439_17633.txt,False,you know the plot : a dimwit with a shady past...,False
1816,pos/cv816_13655.txt,True,"in my review of there's something about mary ,...",True
1914,pos/cv914_28742.txt,True,"susan granger's review of "" legally blonde "" (...",True
...,...,...,...,...
301,neg/cv301_13010.txt,False,carry on at your convenience is all about the ...,False
1938,pos/cv938_10220.txt,True,the idea at the center of the devil's advocate...,True
1154,pos/cv154_9328.txt,True,""" i've been told by several people , 'you're ...",True
1056,pos/cv056_13133.txt,True,the disney studios has its formula for annual ...,True


In [94]:
df = df.sample(df.shape[0])
train_df = df.head(1800)
test_df = df.tail(200)

train_df.shape, test_df.shape

((1800, 4), (200, 4))

In [102]:
vectorizer = CountVectorizer()
vectorizer.fit_transform(train_df['review_text'])

<1800x37980 sparse matrix of type '<class 'numpy.int64'>'
	with 600830 stored elements in Compressed Sparse Row format>

*In class there was a bug here because I had `vectorizer.fit_transform(train_df)`*

In [103]:
X_train = vectorizer.fit_transform(train_df['review_text'])
X_train

<1800x37980 sparse matrix of type '<class 'numpy.int64'>'
	with 600830 stored elements in Compressed Sparse Row format>

In [104]:
lr_model = sklearn.linear_model.LogisticRegression(max_iter = 1e4)
lr_model

LogisticRegression(max_iter=10000.0)

In [105]:
lr_model.fit(X_train, train_df['gold-label'])

LogisticRegression(max_iter=10000.0)

Let's see how well the model performed by saving the predictions

In [107]:
lr_model.score(X_train, train_df['gold-label'])

1.0

The model perfectly fit the training data. Now let's see how well the model performs on the test set.

In [109]:
lr_model.score(vectorizer.transform(test_df['review_text']), test_df['gold-label'])

0.855

We can see the model achieved an 85.5% accuracy on the held out test examples.
When a model performs much better on the training set compared to the held out set, we refer to this as ***overfitting***. Here is a good, [brief post from Investopia](https://www.investopedia.com/terms/o/overfitting.asp) that explained overfitting.

#### n-grams

Look at ngram_range in https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

(back to slides)
### More classifiers

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)
clf = SVC(gamma=2, C=1)
clf = DecisionTreeClassifier(max_depth=5)

In [None]:
clf.fit()
clf.predict()