In [21]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import csv

# Multinomial Naive Bayes and Logistic Regression using Scikit-learn
- MultinomialNB works with occurrence counts and is useful for discrete data.
https://medium.com/@awantikdas/a-comprehensive-naive-bayes-tutorial-using-scikit-learn-f6b71ae84431

- Logistic regression is a linear model for classification (rather than regression). It's also known as maximum-entropy classification (MaxEnt) or log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.
http://www.dataschool.io/guide-to-logistic-regression/

- Compare https://github.com/justmarkham/DAT8/blob/master/other/model_comparison.md

# Load your data

In [4]:
PROC_DIR = "../data/Processed/"
TRAIN = "train.csv"
DEV = "dev.csv"

In [5]:
fields = ['label','text']
tweets_train = pd.read_table(PROC_DIR + TRAIN, sep=",", usecols=fields)
tweets_train.head()
tweets_train.shape

(6000, 2)

In [6]:

tweets_dev = pd.read_table(PROC_DIR + DEV, sep=",", usecols=fields)
tweets_dev.head()
tweets_dev.shape

(1999, 2)

# Organize your data

Lest you not forget... you can add features that are not in your data set. 

In [7]:
# convert label to a numerical variable

#tweets_train.label.value_counts()

tweets_train['label'] = tweets_train['label'].map({'positive':0, 'neutral':1, 'negative':2})
tweets_train.head()
X_train = tweets_train['text']
y_train = tweets_train['label']

tweets_dev['label'] = tweets_dev['label'].map({'positive':0, 'neutral':1, 'negative':2})
tweets_dev.head()
X_dev = tweets_dev['text']
y_dev = tweets_dev['label']

print(X_train.shape) # X is one dimensional
print(y_train.shape)
print(X_dev.shape)
print(y_dev.shape)

(6000,)
(6000,)
(1999,)
(1999,)


# Vectorize training data

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

CountVectorizer converts a collection of text documents to a matrix of token counts

You can also set your own analyzer, tokenizer, and pre-processor
https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af
For example,
> CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)

CountVector creates a matrix of token counts.
It will turn the 1-dim X_train into a 2-dim matrix

In [8]:
# Instantiate
vectorizer = CountVectorizer(lowercase=False)


You need to make sure you have split before vectorizing
In this notebook, we'll assume the split is train/dev, though you
may want to combine and make an 80-20 split

In [9]:
# learn training data vocabulary, then use it to create a document-term matrix

# fit
vectorizer.fit(X_train)

# transform training data
X_train_dtm = vectorizer.transform(X_train)

# FYI - you can do this in one step!
# X_train_dtm = vect.fit_transform(X_train)

X_train_dtm

<6000x13070 sparse matrix of type '<class 'numpy.int64'>'
	with 71501 stored elements in Compressed Sparse Row format>

In [10]:
# woo-hoo! A bunch of words!
vectorizer.get_feature_names()
print(vectorizer.vocabulary_)



# Vectorize Test Data

In [11]:
# Transform testing data (using fitted vocabulary) into a document-term matrix
# Let's call a spade a spade -- we're using our dev set as test data here
X_test_dtm = vectorizer.transform(X_dev)
X_test_dtm

<1999x13070 sparse matrix of type '<class 'numpy.int64'>'
	with 20001 stored elements in Compressed Sparse Row format>

# Train your model
Model Building!
The multinomial Naive Bayes classifier is suitable for classification with
discrete features (e.g., word counts for text classification).
The multinomial distribution normally requires integer feature counts.

In [12]:
# Given the model imbalance, you could set priors. I'd be inclined to try this.
nb = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Test your model
Now let's test our model on the dev data. Remember you can always combine these data sets and make your own split.

In [13]:
y_prediction_class = nb.predict(X_test_dtm)

# Examine your results

https://khartig.wordpress.com/tag/confusion-matrix-example/

Confusion matrix

[Pos Neu Neg

Neu

Neg]

In [14]:
# Print message text for the false positives (ham incorrectly classified as spam)

X_dev[y_prediction_class > y_dev]

4       18 th anniv princess diana's death still want ...
68      i'm trying decide want son 4 halloween 1 donal...
81      actress died aged 62 day michael jackson june ...
95      icc cud jst sing along 1 st time heard dumb du...
103     mom heard someone say michelle obama tv starte...
                              ...                        
1956    monday funday thing higher snoop dogg nj gover...
1959       fyi blank headline spot dotd sunday snoop dogg
1962    started zion turned stated snoop dogg may smoo...
1965                        may may added snoop dogg xbox
1982    i'm height snoop dogg wowie smoke marijune err...
Name: text, Length: 234, dtype: object

In [15]:
# Print message text for false negatives
X_dev[4]

"18 th anniv princess diana's death still want believe living private island away public michael jackson"

In [16]:
confusion_matrix(y_true=y_dev, y_pred=y_prediction_class)

array([[667, 151,  25],
       [392, 315,  58],
       [101, 222,  68]])

# Generate scores
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
sklearn.metrics.precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1, average=None, warn_for=(‘precision’, ’recall’, ’f-score’)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

In [17]:
target_names = ['positive', 'neutral', 'negative']
score = metrics.precision_recall_fscore_support(y_true=y_dev, y_pred=y_prediction_class)
print(classification_report(y_true=y_dev, y_pred=y_prediction_class, target_names=target_names))

              precision    recall  f1-score   support

    positive       0.57      0.79      0.67       843
     neutral       0.46      0.41      0.43       765
    negative       0.45      0.17      0.25       391

    accuracy                           0.53      1999
   macro avg       0.49      0.46      0.45      1999
weighted avg       0.51      0.53      0.50      1999



# Logistic Regression

In [22]:
logreg = LogisticRegression()

In [23]:
logreg.fit(X_train_dtm, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
y_prediction_class = logreg.predict(X_test_dtm)

In [26]:
target_names = ['positive', 'neutral', 'negative']
score = metrics.precision_recall_fscore_support(y_true=y_dev, y_pred=y_prediction_class)
print(classification_report(y_true=y_dev, y_pred=y_prediction_class, target_names=target_names))

              precision    recall  f1-score   support

    positive       0.54      0.74      0.63       843
     neutral       0.45      0.38      0.41       765
    negative       0.36      0.17      0.23       391

    accuracy                           0.49      1999
   macro avg       0.45      0.43      0.42      1999
weighted avg       0.47      0.49      0.47      1999

