# Project 3

## In this project, reviews of two types of products — movies and books — are classified. The reviews were downloaded from Reddit.com, and the two groups of reviews were combined into one DataFrame.

In [124]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import regex as re
from nltk.corpus import stopwords # Import the stop word list
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

## Datasets were downloaded in "get_data.ipynb" and were saved in "my_books.csv" and "my_movies.csv"

In [125]:
# Load the books data that was saved to CSV by "get_data.ipynb".
df_books = pd.read_csv('my_books.csv')

In [126]:
# Load the movies data that was saved to CSV by "get_data.ipynb".
df_movies = pd.read_csv('my_movies.csv')

In [127]:
# Preview the first 20 rows of the movies data.
df_movies.head(20)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,I cant seem to remeber this movie,theres this movie I watched as a child but I c...,movies,1568141984,Trystan_03,0,1,True,9/10/19
1,Dunkin' Donuts in 80s movies,"Random question I know, but does anybody every...",movies,1568142543,DoctorWernerKlopek,10,2,True,9/10/19
2,IT Chapter 2 - Ending Question [Spoilers],I recently saw IT Chp 2 and I was confused by ...,movies,1568143828,GregHauser,33,20,True,9/10/19
3,It's PG now and he wouldn't have to lose all t...,Imagine if Joaquin Phoenix were in a movie cal...,movies,1568143837,BlkSunshineRdriguez,0,1,True,9/10/19
4,"Is ""The Man Who Killed Hitler and Then the Big...","Hi Everyone, I just watched ""The Man Who Kille...",movies,1568144143,TheFirstHunter,18,3,True,9/10/19
5,He wouldn't have to lose all that weight,[removed],movies,1568144475,BlkSunshineRdriguez,0,1,True,9/10/19
6,Todd Phillips Shuts Down Rumor ‘Joker’ Sequel ...,[removed],movies,1568144488,Goanimest,1,1,True,9/10/19
7,How scary is pet semetary 2019?,,movies,1568144581,Legit_king_yolo,0,1,True,9/10/19
8,What is your favorite on screen kiss?,On screen chemistry can make or break a movie....,movies,1568145491,bothanspied,73,29,True,9/10/19
9,Sites to follow for indie movie news?,What sites/twitters/pages/whatever do y’all fo...,movies,1568145845,NoTakaru,1,4,True,9/10/19


In [5]:
# Stack the books and movies frames row-wise into a single frame.
# The row index is rebuilt further down with reset_index.
df = pd.concat([df_books, df_movies])

In [6]:
# (rows, columns) of the combined frame.
df.shape

(32270, 9)

## Performing EDA

In [7]:
## Check for missing data
# Number of missing values in each column.

df.isna().sum()

title             11
selftext        3770
subreddit         18
created_utc       21
author            22
num_comments      22
score             22
is_self           23
timestamp         26
dtype: int64

In [8]:
## Keep only the rows that have a value in every column

df = df.dropna()

In [9]:
# Confirm that no missing values remain after dropping incomplete rows.
df.isna().sum()

title           0
selftext        0
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [10]:
## Discard posts whose body is only a '[removed]' or '[deleted]' placeholder

placeholder_mask = df['selftext'].isin(['[removed]', '[deleted]'])
df = df[~placeholder_mask]

In [11]:
# Encode the target as string labels: 'books' -> '0', 'movies' -> '1'.
# The column is reassigned as a whole instead of calling
# `df['subreddit'].replace(..., inplace=True)`: that form mutates a Series
# selected out of `df`, which pandas warns about as chained assignment and
# which leaves `df` unchanged when Copy-on-Write is active.
df = df.assign(subreddit=df['subreddit'].replace({'books': '0', 'movies': '1'}))

# Number of posts per class.
df['subreddit'].value_counts()

1    9227
0    7268
Name: subreddit, dtype: int64

In [12]:
# Renumber the rows; the previous row labels are kept as an extra column
# (the frame has one more column after this step).
df = df.reset_index()

In [13]:
# Order the rows by post text, then discard every post whose text occurs
# more than once (keep=False removes all copies, not only the repeats).
df = df.sort_values("selftext").drop_duplicates(subset="selftext", keep=False)
df.shape

(16261, 10)

In [14]:
# Split into train and test sets: the features are a one-column DataFrame
# holding the post text, the target is the encoded subreddit label.
# A quarter of the rows are held out for testing; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    df[['selftext']],
    df['subreddit'],
    random_state=42,
    test_size=0.25,
)
X_train.shape

(12195, 1)

In [96]:
## Create a function to transform all reviews to string of words

# Cache for the English stop words: filled on first use and shared by every
# later call, so the stop-word list is not rebuilt for each review.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def review_to_words(raw_review):
    """Convert one raw review into a cleaned, space-separated string of words.

    Parameters
    ----------
    raw_review : str
        Text of a single review; may contain HTML markup.

    Returns
    -------
    str
        The review's words in lower case, with markup, non-letter characters
        and English stop words removed, joined by single spaces. The result
        is an empty string when no words remain.
    """
    # 1. Remove HTML. The parser is named explicitly so the extracted text
    #    does not depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. Remove stop words (set membership is much faster than a list scan).
    stops = _english_stop_words()
    meaningful_words = [w for w in words if w not in stops]

    # 5. Join the words back into one string separated by spaces.
    return " ".join(meaningful_words)

In [97]:
# Report how many reviews the dataframe currently holds.
total_reviews = len(df)
print(f'There are {total_reviews} reviews.')

# Empty containers for the cleaned training and testing reviews.
clean_train, clean_test = [], []

There are 16261 reviews.


In [98]:
print("Cleaning and parsing the training set movie & book reviews...")

# Build each list in one step rather than appending to a list created in
# another cell: re-running this cell then always yields exactly one cleaned
# review per row, so the lists stay aligned with y_train / y_test.
clean_train = [review_to_words(review) for review in X_train['selftext']]

# Let's do the same for our testing set.

print("Cleaning and parsing the testing set movie & book reviews...")

clean_test = [review_to_words(review) for review in X_test['selftext']]


Cleaning and parsing the training set movie & book reviews...
Cleaning and parsing the testing set movie & book reviews...


## CountVectorizing

In [99]:
from nltk.tokenize import RegexpTokenizer
import regex as re
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [100]:
# CountVectorizer is scikit-learn's bag-of-words tool.
from sklearn.feature_extraction.text import CountVectorizer

# Word-level counts, limited to a vocabulary of 100 features.
# No tokenizer, preprocessor or stop-word list is supplied here; the input
# text has already been cleaned by review_to_words.
vectorizer = CountVectorizer(
    max_features=100,
    analyzer="word",
    stop_words=None,
    preprocessor=None,
    tokenizer=None,
)

In [101]:
# Learn the vocabulary from the cleaned training reviews and turn them into
# a dense count array.
train_data_features = vectorizer.fit_transform(clean_train).toarray()

# Apply the vocabulary learned above to the cleaned test reviews.
test_data_features = vectorizer.transform(clean_test).toarray()

train_data_features.shape

(12195, 100)

### Start classification!

In [102]:
# Import logistic regression.

from sklearn.linear_model import LogisticRegression

In [103]:
# Instantiate logistic regression model.
# No arguments are passed, so the library's default hyperparameters are used.

lr = LogisticRegression()

In [104]:
# Re-check the shape of the training feature array before fitting.
train_data_features.shape

(12195, 100)

In [105]:
# Fit the logistic regression on the bag-of-words training features.
lr.fit(train_data_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [106]:
# Evaluate model on training data.
# Compare with the test score in the next cell to gauge overfitting.
lr.score(train_data_features, y_train)

0.9425174251742517

In [107]:
# Evaluate model on testing data.
# These rows were held out by train_test_split and not used for fitting.

lr.score(test_data_features, y_test)

0.9313821938022626

In [108]:
# Predict the test set and show the share of each predicted class.
pred_LR = pd.DataFrame(lr.predict(test_data_features))
pred_LR[0].value_counts(normalize=True)

1    0.590507
0    0.409493
Name: 0, dtype: float64

## Perform GridSearch to determine the best hyperparameters

In [165]:
# Mean of the coefficients of the logistic regression fitted earlier.
# NOTE(review): this inspects `lr`, not the grid search set up below.
lr.coef_.mean()

0.0003640058615373581

In [109]:
# Raw (uncleaned) training text; this Series is what the grid search is fit on.
X_train['selftext']

13755                                      I'm listening. 
9317     I was watching Beauty and The Beast the other ...
13247    If so, can you tell me if it's worth seeing to...
4345     I started reading War and Peace (Maude transla...
3827     I got myself some page markers since I want to...
                               ...                        
30       The Far Pavillions was a bit too expensive for...
1944     I have to read this for my AP Language and Com...
9762     After the final scene, if the movie were to co...
4600     for procedural single book, I guess it works o...
15713    I think I saw it on IFC or the Sundance channe...
Name: selftext, Length: 12195, dtype: object

In [110]:
# Let's set it up with two stages:
# 1. 'feats': a FeatureUnion that combines the features produced by a
#    TfidfVectorizer ('tfidf') and a CountVectorizer ('cvec') (transformers)
# 2. 'nb': a MultinomialNB instance (estimator)
# The step names are the prefixes used in the hyperparameter grid keys.

pipe = Pipeline([
    ('feats', FeatureUnion([
         ('tfidf', TfidfVectorizer()),
         ('cvec', CountVectorizer()),
    ])),
    ('nb', MultinomialNB())
])

In [111]:
# Hyperparameter grid for the search. Each key is
# <pipeline step>__<union member>__<parameter>; each list holds the values
# to try. Four parameters have two candidate values, giving 16 combinations.

pipe_params = {
    # CountVectorizer branch of the feature union.
    'feats__cvec__max_features': [5000],
    'feats__cvec__max_df': [0.6, 0.8],
    'feats__cvec__ngram_range': [(1, 1)],
    'feats__cvec__stop_words': ['english', None],
    # TfidfVectorizer branch of the feature union.
    'feats__tfidf__max_features': [100],
    'feats__tfidf__max_df': [0.8, 0.9],
    'feats__tfidf__ngram_range': [(1, 2)],
    'feats__tfidf__stop_words': ['english', None],
}

In [112]:
# Set up the grid search over the pipeline.

gs = GridSearchCV(
    estimator=pipe,          # what object are we optimizing?
    param_grid=pipe_params,  # what parameter values are we searching?
    cv=3,                    # 3-fold cross-validation
    verbose=2,               # report progress while fitting
    n_jobs=-1,               # run the fits in parallel
)

In [113]:
# Fit GridSearch to training data.
# Note: the raw `selftext` strings are passed here, not the `clean_train`
# output of review_to_words; text handling is left to the vectorizers.
gs.fit(X_train['selftext'], y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feats',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('tfidf',
                                                                        TfidfVectorizer(analyzer='word',
                                                                                        binary=False,
                                                                                        decode_error='strict',
                                                                                        dtype=<class 'numpy.float64'>,
                                                                                        encoding='utf-8',
                                                                                        input='content',
                                                           

In [114]:
# Hyperparameter combination selected by the grid search.
gs.best_params_

{'feats__cvec__max_df': 0.6,
 'feats__cvec__max_features': 5000,
 'feats__cvec__ngram_range': (1, 1),
 'feats__cvec__stop_words': 'english',
 'feats__tfidf__max_df': 0.9,
 'feats__tfidf__max_features': 100,
 'feats__tfidf__ngram_range': (1, 2),
 'feats__tfidf__stop_words': None}

In [115]:
# What's the best score?
# This is the cross-validated score of the selected hyperparameter combination.
print(gs.best_score_)

0.9553915539155392


In [116]:
# Save best model as gs_model.
# Bind the best pipeline found by the search to a name so later cells can
# reuse it, then display it.

gs_model = gs.best_estimator_
gs_model

Pipeline(memory=None,
         steps=[('feats',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('tfidf',
                                                 TfidfVectorizer(analyzer='word',
                                                                 binary=False,
                                                                 decode_error='strict',
                                                                 dtype=<class 'numpy.float64'>,
                                                                 encoding='utf-8',
                                                                 input='content',
                                                                 lowercase=True,
                                                                 max_df=0.9,
                                                                 max_features=100,
                                                                 min_df=1,
                         

In [117]:
# Evaluate model on training data.
# Scored on the raw training text, the same input the search was fit on.
gs.score(X_train['selftext'], y_train)

0.9624436244362443

In [118]:
# Evaluate model on testing data.
# Scored on the raw text of the held-out test rows.
gs.score(X_test['selftext'], y_test)

0.9545007378258731

In [119]:
# Predict the test set with the tuned pipeline and show the share of each
# predicted class.
pred = pd.DataFrame(gs.predict(X_test['selftext']))
pred[0].value_counts(normalize=True)

1    0.550664
0    0.449336
Name: 0, dtype: float64

In [120]:
### Baseline score
# Share of each class among the test labels. The larger share is the accuracy
# obtained by always predicting the majority class, so the model scores
# above should be compared against it.

y_test.value_counts(normalize = True)

1    0.557304
0    0.442696
Name: subreddit, dtype: float64