# Demo 17 - Naive Bayes and Logistic Regressions

In [None]:
import nltk
import sklearn
import pandas as pd
import numpy as np

## Multiplying Probabilities

In [None]:
# Product of three independent draws from np.random.rand(); the value changes on every run.
np.random.rand() * np.random.rand() * np.random.rand()

In [None]:
# Start from one uniform draw and fold in ten more draws by multiplication.
rand_number = np.random.rand()
for _step in range(10):
    rand_number = rand_number * np.random.rand()
rand_number

In [None]:
# Same experiment with 100 extra factors: the product shrinks towards zero.
rand_number = np.random.rand()
for _step in range(100):
    rand_number = rand_number * np.random.rand()
rand_number

In [None]:
# Same experiment with 500 extra factors.
rand_number = np.random.rand()
for _step in range(500):
    rand_number = rand_number * np.random.rand()
rand_number

In [None]:
# Same experiment with 600 extra factors.
rand_number = np.random.rand()
for _step in range(600):
    rand_number = rand_number * np.random.rand()
rand_number

In [None]:
# Same experiment with 1000 extra factors.
rand_number = np.random.rand()
for _step in range(1000):
    rand_number = rand_number * np.random.rand()
rand_number

(back to slides)

#### Logs to the rescue

In [None]:
# Work in log space: add the logs of the draws instead of multiplying the draws themselves.
rand_number = np.log(np.random.rand())
for _step in range(1000):
    rand_number = rand_number + np.log(np.random.rand())
rand_number

## Sklearn

### Movie Reviews

In [None]:
# Handle to NLTK's movie_reviews corpus.
# NOTE: the variable is spelled `moview_reviews`; later cells rely on this exact spelling.
# NOTE(review): presumably the corpus must already be available locally (e.g. via nltk.download('movie_reviews')) — confirm.
moview_reviews = nltk.corpus.movie_reviews

In [None]:
# Pair every corpus file id with its gold label: True when the id starts with "pos".
review_files = [(file_id, file_id.startswith("pos")) for file_id in moview_reviews.fileids()]

# Report the class sizes as (negative, positive). They are derived from
# review_files, because no neg_review / pos_review variables exist in the notebook.
num_pos_reviews = sum(is_positive for _, is_positive in review_files)
num_neg_reviews = len(review_files) - num_pos_reviews
num_neg_reviews, num_pos_reviews

In [None]:
# Build a two-column DataFrame from the (file_id, is_positive) tuples.
df = pd.DataFrame(review_files)
df

In [None]:
# Give the two positional columns (0 and 1) descriptive names.
df = df.rename(columns={0: "file_name", 1: "gold-label"})
df

In [None]:
def read_mov_review(f_name):
    """Return the full text of the movie review stored under ``f_name``.

    Args:
        f_name: A file id of the ``moview_reviews`` corpus (the notebook
            passes the values of ``moview_reviews.fileids()``).

    Returns:
        Whatever ``read()`` on the opened corpus stream returns, i.e. the
        content of that review file.
    """
    # Use a context manager so the corpus stream is closed once it has been
    # read; the notebook calls this for every review in the corpus.
    with moview_reviews.open(f_name) as review_stream:
        return review_stream.read()

# Load each review's text into a new 'review_text' column by applying read_mov_review to every file name.
df['review_text'] = df['file_name'].apply(read_mov_review)

In [None]:
df['review_text']

#### Make train/dev/test split

In [None]:
# Shuffle the rows by sampling as many rows as the frame contains.
# No random_state is given, so the order differs between runs.
df = df.sample(n=len(df))
df.head()

In [None]:
df.shape[0]

Let's make an 80:10:10 train/dev/test split of our data.

In [None]:
# Positional cut-offs: rows before train_max_idx are train (80%),
# rows up to dev_max_idx are dev (next 10%), the remainder is test.
train_max_idx = int(.8 * len(df))
dev_max_idx = int(train_max_idx + (.1 * len(df)))

(train_max_idx, dev_max_idx)

In [None]:
# Slice the shuffled frame at the two cut-offs into train / dev / test.
train_df, dev_df, test_df = (
    df.iloc[:train_max_idx],
    df.iloc[train_max_idx:dev_max_idx],
    df.iloc[dev_max_idx:],
)

# Show the (rows, columns) shape of each part.
tuple(part.shape for part in (train_df, dev_df, test_df))

We will train models on `train_df`, tune hyper-parameters based on performance on `dev_df`, and evaluate on `test_df` only once, after we are done tuning the hyper-parameters.

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes classifier created with no arguments, i.e. scikit-learn's defaults.
nb_model = MultinomialNB()
nb_model

In [None]:
# NOTE(review): the raw review strings are passed straight to fit(); per the
# question that follows, this call is expected to fail because the text has
# not been converted to numeric features yet.
nb_model.fit(train_df['review_text'], train_df['gold-label'])

**Question:** What happened?

Solution: We need to convert the text to numbers — more precisely, to numeric features.

(back to slides)

#### Feature Engineering

We need to convert each document into features. 

**Question:** What should our features be?

*Hint*: What's the most basic approach we've taken so far in this class?

<details>
<summary>Solution</summary>
    Bag of words
</details>

In [None]:
# skip next bunch of empty cells, answer below

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; min_df=4 drops terms that occur in fewer than 4 documents.
vectorizer = CountVectorizer(min_df=4)

In [None]:
vectorizer.fit_transform(train_df['review_text'])

**Question:** What do these numbers that were printed out mean?

<details>
<summary>Solution</summary>
Number of examples, size of vocabulary
</details>

In [None]:
# Learn the vocabulary from the training reviews and build the document-term count matrix X.
X = vectorizer.fit_transform(train_df['review_text'])
X

Let's look at the contextual help to see what X is.

**Question:** What do we think *(0, 8284) <tab>  2* in *X* means?
    
<details>
<summary>Solution</summary>
    The value of feature 8284 for the 0-th example. In our setting, this is the number of times that word appeared in the first document.

        
</details>

In [None]:
# Look up which vocabulary term sits at column 8284 of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[8284]

In [None]:
# Sanity check: count occurrences of the substring 'notoriety' in the first training review.
# NOTE(review): the word is hard-coded, presumably copied from the feature-8284 lookup of one
# particular run; the shuffle is unseeded, so it may not match on a re-run — confirm.
train_df['review_text'].iloc[0].count('notoriety')

#### Train the model

In [None]:
nb_model.fit(X, train_df['gold-label'])

#### How well does the model perform on the training set?

In [None]:
nb_model.score(X, train_df['gold-label'])

If the model just guessed True or False:

In [None]:
# Share of each label in the training set (normalize=True gives fractions, not counts);
# the larger share is the accuracy a constant guess would reach on the training data.
train_df['gold-label'].value_counts(normalize=True)

#### Evaluate the model on dev

In [None]:
# NOTE(review): predict() receives the raw review text here rather than vectorized
# features (the following cells transform dev_df with the fitted vectorizer first) —
# presumably a deliberate demonstration of the failure; confirm.
nb_model.predict(dev_df['review_text'])

In [None]:
# Reuse the vocabulary already fitted on the training reviews: transform() only, no refitting on dev.
X_dev = vectorizer.transform(dev_df['review_text'])
X_dev

In [None]:
nb_model.predict(X_dev)

**Let's store the predictions**

In [None]:
# Attach the dev-set predictions as a new column, then list the resulting column labels.
dev_df = dev_df.assign(**{"nb_predictions": nb_model.predict(X_dev)})
dev_df.columns

**Let's compute accuracy**

In [None]:
dev_df['nb_predictions'] == dev_df['gold-label']

In [None]:
# Dev accuracy: number of rows where prediction and gold label agree, divided by the number of dev rows.
sum(dev_df['gold-label'] == dev_df['nb_predictions']) / len(dev_df)

#### Interpreting the model

In computational text analysis, we don't necessarily care just about building a classifier that does well. We want to use the classifier to gain insight about our text.

**Question:** From the Naive Bayes equation, what do you think can give us the most insight about our text?

In [None]:
nb_model.feature_log_prob_.shape

In [None]:
nb_model.feature_log_prob_[0]

In [None]:
nb_model.feature_log_prob_[0].argmax()

In [None]:
# Word with the highest log-probability in row 0 of feature_log_prob_.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[nb_model.feature_log_prob_[0].argmax()]

In [None]:
# Two ways of reading the same value: the maximum of row 0, and the entry of row 0 at its argmax position.
nb_model.feature_log_prob_[0].max(), nb_model.feature_log_prob_[0][nb_model.feature_log_prob_[0].argmax()]

In [None]:
# code from https://stackoverflow.com/questions/50526898/how-to-get-feature-importance-in-naive-bayes

# Feature indices ordered from highest to lowest log-probability, one ordering
# per row of feature_log_prob_.
# NOTE(review): row 0 / row 1 are taken to be the negative / positive class,
# which matches boolean labels sorted as [False, True] — confirm via nb_model.classes_.
neg_class_prob_sorted = nb_model.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_model.feature_log_prob_[1, :].argsort()[::-1]

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. Fetch the vocabulary once and reuse it for both classes.
feature_names = vectorizer.get_feature_names_out()
print(np.take(feature_names, neg_class_prob_sorted[:10]))
print(np.take(feature_names, pos_class_prob_sorted[:10]))

The above shows, for each label, the words with the highest probability given that label.

In [None]:
# Feature indices ordered from highest to lowest log-probability for each class row.
neg_class_prob_sorted = nb_model.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = nb_model.feature_log_prob_[1, :].argsort()[::-1]

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. Fetch the vocabulary once and reuse it for both classes.
feature_names = vectorizer.get_feature_names_out()

# NOTE: despite the "top_100" names, each set holds the 200 most probable words
# of its class. The names are kept because later cells refer to them.
top_100_negative = set(np.take(feature_names, neg_class_prob_sorted[:200]))
top_100_positive = set(np.take(feature_names, pos_class_prob_sorted[:200]))

In [None]:
top_100_negative - top_100_positive

In [None]:
top_100_positive - top_100_negative

#### Fine-tuning hyper-parameters

In [None]:
# Overwrite the alpha (smoothing) hyper-parameter on the existing model object.
# NOTE(review): presumably this does not change the already-fitted probabilities until fit() is called again — confirm.
nb_model.alpha = 5
nb_model.alpha

In [None]:
nb_model = MultinomialNB(alpha=5)

In [None]:
# Inspect the training feature matrix. In this part of the notebook it is named
# `X` (`X_train` is only defined later, in the logistic-regression section), and
# `X` is what the following cells fit and score on.
X

In [None]:
nb_model.fit(X, train_df['gold-label'])

In [None]:
nb_model.score(X, train_df['gold-label'])

In [None]:
nb_model.predict(X_dev)

In [None]:
nb_model.score(X_dev, dev_df['gold-label'])

**How does this compare to the result on dev before?**

This was an example of hyper-parameter tuning

(back to slides)
### Logistic Regression

In [None]:
# Import the class explicitly so the cell does not depend on the
# sklearn.linear_model submodule having been loaded as a side effect.
from sklearn.linear_model import LogisticRegression

# max_iter must be an integer: recent scikit-learn releases validate parameter
# types and reject the float 1e4.
lr_model = LogisticRegression(max_iter=10000)
lr_model

In [None]:
# NOTE(review): fit() is called without any arguments here, whereas later cells pass a
# feature matrix and labels — presumably this is meant to show the resulting error; confirm.
lr_model.fit()

In [None]:
# NOTE(review): the raw review strings are passed as the feature argument; per the question
# that follows, this call is expected to fail because the text has not been converted to features.
lr_model.fit(df['review_text'], df['gold-label'])

**Question:** What happened?


<details>
<summary>Solution</summary>
We need to convert the text to features
</details>


In [None]:
df['review_text']

In [None]:
# Same import as in the Naive Bayes section, repeated so this section runs on its own.
from sklearn.feature_extraction.text import CountVectorizer
# Fresh bag-of-words vectorizer; min_df=4 drops terms that occur in fewer than 4 documents.
vectorizer = CountVectorizer(min_df=4)

In [None]:
# Fit the vocabulary on ALL reviews in df (no train/test split here) and build the count matrix.
X = vectorizer.fit_transform(df['review_text'])
X

**Question:** What is 2000 and what is 15452?

In [None]:
lr_model.fit(X, df['gold-label'])

In [None]:
# Predict on the same matrix X the model was fitted on and store the result as a new column.
df['prediction'] = lr_model.predict(X)
df['prediction']

In [None]:
# Number of rows (a count, not a fraction) whose prediction equals the gold label.
sum(df['prediction'] == df['gold-label'])

#### Finding coefficients

In [None]:
lr_model.coef_

In [None]:
lr_model.coef_.shape

In [None]:
# Vocabulary size, for comparison with the second dimension of lr_model.coef_.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
len(vectorizer.get_feature_names_out())

In [None]:
lr_model.coef_.argmax()

In [None]:
lr_model.coef_[0][lr_model.coef_.argmax()]

In [None]:
# Word whose coefficient is the largest.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[lr_model.coef_.argmax()]

In [None]:
lr_model.coef_.argmin()

In [None]:
lr_model.coef_[0][lr_model.coef_.argmin()]

In [None]:
# Word whose coefficient is the smallest (most negative).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[lr_model.coef_.argmin()]

#### Prediction

In [None]:
df

In [None]:
df.sample(df.shape[0])

In [None]:
# Shuffle all rows, then hold out the last 10% as the test set.
# The sizes are derived from the row count instead of hard-coding 1800/200,
# so the two parts never overlap; for the 2000-review corpus the result is
# the same 1800/200 split as before.
df = df.sample(df.shape[0])
n_test = df.shape[0] // 10
n_train = df.shape[0] - n_test
train_df = df.iloc[:n_train]
test_df = df.iloc[n_train:]

train_df.shape, test_df.shape

In [None]:
# New vectorizer whose vocabulary is fitted on the training reviews only.
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(train_df['review_text'])

In [None]:
# Vectorize the review text column. Passing the whole DataFrame would iterate
# over its column labels instead of the reviews, giving one row per column
# rather than one row per training review.
X_train = vectorizer.transform(train_df['review_text'])
X_train

In [None]:
# Import the class explicitly so the cell does not depend on the
# sklearn.linear_model submodule having been loaded as a side effect.
from sklearn.linear_model import LogisticRegression

# Fresh, unfitted model for the train/test experiment. max_iter must be an
# integer: recent scikit-learn releases validate parameter types and reject
# the float 1e4.
lr_model = LogisticRegression(max_iter=10000)
lr_model

In [None]:
lr_model.fit(X_train, train_df['gold-label'])

#### n-grams

Look at ngram_range in https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

(back to slides)
### More classifiers

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Three alternative scikit-learn classifiers. Each assignment rebinds `clf`,
# so after this cell runs `clf` refers to the DecisionTreeClassifier only.
# Multi-layer perceptron with hidden layers of 5 and 2 units.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)
# Support vector classifier.
clf = SVC(gamma=2, C=1)
# Decision tree limited to a depth of 5.
clf = DecisionTreeClassifier(max_depth=5)