In [21]:
import nltk
import numpy as np
import sklearn
import matplotlib as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Introduction to NLP
The “ratings.tsv” dataset is a subset of 8000 movie reviews from a larger dataset of 25,000 movie reviews (http://ai.stanford.edu/ amaas/data/sentiment/). The goal is to build a classification model that predicts whether or not the review is positive or negative based on the words in the review.

1. Import the data. Note that the dataset is tab delimited.

In [7]:
ratings = pd.read_csv("ratings.tsv", sep="\t")

In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   8000 non-null   object
 1   review  7945 non-null   object
dtypes: object(2)
memory usage: 125.1+ KB


In [9]:
ratings.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


2. Check for and remove missing values and blank strings

In [11]:
ratings = ratings.dropna().reset_index()

3. Split the data into a training set and a test set. Use test size=0.33, stratify=y, and random state=801 (where y is the label positive or negative)

In [22]:
x = ratings.review
y = ratings.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify = ratings.label, random_state = 801)

4. Vectorize the data using TF-IDF. Be sure that all model development is with the training data (fit the TF-IDF transformer on the training data, then transform to both training and test data).

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(x_train)
X_train = tfidf.transform(x_train)
X_test = tfidf.transform(x_test)

5. Build a machine learning classifier. Try out various models including: 
    • Support Vector Classifier 
    • Multilayer Perceptron 
    • Multinomial Naive Bayes 
    • (You are welcome to try other models if you want)

### Multinomial Naive Bayes 

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve
import warnings

nb = MultinomialNB()
nb.fit(X_train, y_train)
yhat = nb.predict(X_test)

print(accuracy_score(y_test, yhat))
#print(f1_score(y_test, yhat))
#print(precision_score(y_test, yhat))
#print(recall_score(y_test, yhat))

0.8954996186117468


In [38]:
confusion_matrix(y_test, yhat)

array([[1229,   82],
       [ 192, 1119]])

In [40]:
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

         neg       0.86      0.94      0.90      1311
         pos       0.93      0.85      0.89      1311

    accuracy                           0.90      2622
   macro avg       0.90      0.90      0.90      2622
weighted avg       0.90      0.90      0.90      2622



### Support Vector Classifier

In [46]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn import svm
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel
#Train the model using the training sets
clf.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.92      0.90      0.91      1311
         pos       0.90      0.92      0.91      1311

    accuracy                           0.91      2622
   macro avg       0.91      0.91      0.91      2622
weighted avg       0.91      0.91      0.91      2622



### Multilayer Perceptron

In [49]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(4,32,32),
                    max_iter=10000,
                    activation='tanh',
                    verbose=False)
mlp.fit(X_train,y_train)

MLPClassifier(activation='tanh', hidden_layer_sizes=(4, 32, 32), max_iter=10000)

In [50]:
y_hat = clf.predict(X_test)

In [51]:
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

         neg       0.92      0.90      0.91      1311
         pos       0.90      0.92      0.91      1311

    accuracy                           0.91      2622
   macro avg       0.91      0.91      0.91      2622
weighted avg       0.91      0.91      0.91      2622



### (You are welcome to try other models if you want)

6. What model performs the best? Experiment with changing the hyper-parameters, changing the vectorizer, adding bi-grams and/or using a voting classifier to increase model accuracy.

7. If possible, report what words are most important for distinguishing between positive and negative reviews?

8. Using Vader sentiment analysis, predict whether or not the movie review is positive or negative. (Use a positive compound score for “positive” and a negative compound score for “negative”).

9. How does the accuracy of the sentiment analysis compare with that of the predictive model?

10. Try doing sentiment analysis with the TextBlob library. How does the accuracy of TextBlob sentiments compare with Vader and the predictive model?

11. Run LDA topic modeling using gensim on the movie reviews. How many topics are there? What are the most common words in each topic?