## Predicting movie reviews: Using Natural Language Processing to build out a predictive model

Goal: to collect review text from good and bad movies in the past and see if it can be used to create an effective classifier using ensemble methods. Use 2016 movie reviews as the test data.

In [3]:
import pandas as pd
import numpy as np
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from bs4 import BeautifulSoup
import requests
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
from imdbpie import Imdb
imdb = Imdb(anonymize=True) # to proxy requests

## Create DataFrame and prep it

In [5]:
## calling the top 250 with this handy IMDBpie method top_250

In [6]:
top = imdb.top_250()
df = pd.DataFrame(top)
tTitle_list = [i.encode("ascii") for i in df["tconst"]]
tTitle_list 
df['year'] = [i.encode("ascii") for i in df['year']]
df.head(2)

Unnamed: 0,can_rate,image,num_votes,rating,tconst,title,type,year
0,True,{u'url': u'https://images-na.ssl-images-amazon...,1715732,9.3,tt0111161,The Shawshank Redemption,feature,1994
1,True,{u'url': u'https://images-na.ssl-images-amazon...,1172712,9.2,tt0068646,The Godfather,feature,1972


In [447]:
## Next scraping the bottom 100 from IMDB itself with xpath and cleansing the results with regex

In [9]:
r = requests.get("http://www.imdb.com/chart/bottom").text
aa = Selector(text=r).xpath("//*[contains(@class, 'titleColumn')]//@href").extract()

In [10]:
import re

btitle_list = []
for i in aa:   
    test = re.search('(?<=title/)\w+', i )
    btitle_list.append(test.group(0).encode('ascii'))
    

In [11]:
len(btitle_list)

100

In [20]:
# reviews_bottom = []
# for i in btitle_list:
#     rev =imdb.get_title_reviews(i, max_results=5)
#     reviews_bottom.append(rev)


In [18]:
##here's a function to make this imdb fetch a bit less painful
def get_imdb(titles_list, num_reviews):
    output = []
    for i in titles_list:
        rev =imdb.get_title_reviews(i, max_results=num_reviews)
        output.append(rev)
    return output
    
        

In [15]:
def get_imdb_rating(titles_list):
    output = []
    for i in titles_list:
        output.append(imdb.get_title_by_id(i).rating)
    return output
 

In [21]:
#reviews_top = get_imdb(tTitle_list,5)

In [None]:
#reviews_bottom = get_imdb(btitle_list,5)

In [108]:
##these lists of titles are also in the same format:
print tTitle_list[:3]
print btitle_list[:3]

['tt4458206', 'tt4009460', 'tt0270846']

In [23]:
##cool, so now these are both in the same format: lists of lists of data objects 

reviews_top
reviews_bottom

These are commented out below so i don't run them again.

In [418]:
titles_and_ranks = zip(btitle_list,bottom_ratings)

In [420]:
df = df[['tconst','rating']]

In [421]:
df_bot = pd.DataFrame(titles_and_ranks, columns =('tconst','rating') )

In [422]:
ndf = pd.concat([df,df_bot])

In [423]:
ndf.head()

Unnamed: 0,tconst,rating
0,tt0111161,9.3
1,tt0068646,9.2
2,tt0071562,9.0
3,tt0468569,9.0
4,tt0050083,8.9


In [424]:
ndf['all_reviews'] = reviews_top + reviews_bottom
ndf.head()

Unnamed: 0,tconst,rating,all_reviews
0,tt0111161,9.3,"[<Review: Why do I want to wri>, <Review: \nCa..."
1,tt0068646,9.2,"[<Review: Rather than concentr>, <Review: \nTh..."
2,tt0071562,9.0,"[<Review: \nThis movie is way t>, <Review: \nT..."
3,tt0468569,9.0,"[<Review: We've been subjected>, <Review: Chri..."
4,tt0050083,8.9,"[<Review: \nAn excellent courtr>, <Review: \nT..."


#### let's get this data into a format that vectorizer is down with...


In [425]:
def unravel(col):
    review_str = ''
    for a in col:
        review_str+= a.text
        return review_str
ndf['flat_reviews'] = ndf['all_reviews'].apply(unravel)

In [432]:
for i in ndf['flat_reviews'][:1]:
    print i.split()

[u'Why', u'do', u'I', u'want', u'to', u'write', u'the', u'234th', u'comment', u'on', u'The', u'Shawshank', u'Redemption?', u'I', u'am', u'not', u'sure', u'-', u'almost', u'everything', u'that', u'could', u'be', u'possibly', u'said', u'about', u'it', u'has', u'been', u'said.', u'But', u'like', u'so', u'many', u'other', u'people', u'who', u'wrote', u'comments,', u'I', u'was', u'and', u'am', u'profoundly', u'moved', u'by', u'this', u'simple', u'and', u'eloquent', u'depiction', u'of', u'hope', u'and', u'friendship', u'and', u'redemption.', u'The', u'only', u'other', u'movie', u'I', u'have', u'ever', u'seen', u'that', u'effects', u'me', u'as', u'strongly', u'is', u'To', u'Kill', u'a', u'Mockingbird.', u'Both', u'movies', u'leave', u'me', u'feeling', u'cleaner', u'for', u'having', u'watched', u'them.', u'I', u"didn't", u'intend', u'to', u'see', u'this', u'movie', u'at', u'all:', u'I', u'do', u'not', u'like', u'prison', u'movies', u'and', u'I', u"don't", u'normally', u'watch', u'them.', u'I',

#### ...and stem the base words before we vectorize here's a function to run the porter stemmer on a list containing one string, which our df  column now is let's ascii these up while we're at it to get rid of weird nonsense words


In [479]:

def make_stem(line):
    return_stemmed = ''
    for i in line.split():
        stemmer = PorterStemmer()
        try :
            i.encode("ascii")
            return_stemmed += ' '+ stemmer.stem(i).encode('ascii')
        except UnicodeDecodeError: 
            return_stemmed += ''
        except UnicodeEncodeError:
            return_stemmed += ''
    return return_stemmed



tester = "a listing u'\xe9' of a strings of words"
make_stem(tester)  

' a list of a string of word'

In [480]:
## go ahead and replace flat reviews with stemmed version of self

ndf['flat_reviews'] = ndf['flat_reviews'].apply(make_stem)
ndf['flat_reviews'][:2]

0     Whi do I want to write the 234th comment on T...
1     Rather than concentr on everyth that is great...
Name: flat_reviews, dtype: object

In [562]:
X = ndf['flat_reviews']
y = ndf['rating'].apply(int).values

In [481]:
exp_ndf = ndf[['tconst', 'rating','flat_reviews' ]]

In [500]:
exp_ndf.to_csv("Checkpoint_model_df.csv", index=False)

## Aight let's make a model

In [536]:
##from there, vectorize these words to translate them into useable features with mathematical values on 
## which to regress, here I'm also casting an "all X feats" variable for cross validation, I'll later fit a new one
## for just X_train
from sklearn.feature_extraction.text import TfidfVectorizer
make_featuresCV = TfidfVectorizer(stop_words='english', strip_accents = 'ascii')
make_featuresCV.fit(X)
all_X_feats = make_featuresCV.transform(X)


In [527]:
## let's start by splitting this up

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)


In [585]:
##now i'll refit my make_features instance of tfid
from sklearn.feature_extraction.text import TfidfVectorizer
make_features = TfidfVectorizer(stop_words='english', strip_accents = 'ascii')
make_features.fit(X_train)
X_train_features = make_features.transform(X_train)
X_test_features = make_features.transform(X_test)

In [529]:
##looks good
X_train_features.todense()[0]


matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

To calibrate expectations, let's run a super simple decision tree with some cross val

In [574]:
TreeClf = DecisionTreeClassifier()
s = cross_val_score(TreeClf, all_X_feats, y, cv = 5).mean()
print "Decision Tree Classifier 5xCV score mean ", s

Decision Tree Classifier 5xCV score mean  0.720207806768


In [593]:
TreeClf_a = DecisionTreeClassifier()
TreeClf_a.fit(X_train_features.todense(), y_train)

ypred_tclf = TreeClf_a.predict(X_test_features)
print "DECISION TREEE ACCURACY SCORE", accuracy_score(y_test, ypred_tclf)

DECISION TREEE ACCURACY SCORE 0.671428571429


Hmm, this is a pretty low score. Let's fit out some other models and compare.


In [539]:
##This is a decision tree with bagging, tested across a number of differenct tree schemas

for n_est in [5, 10,20,30,40,50,60,70, 80]:
    BAgg_clf = BaggingClassifier(base_estimator = DTree_clf, n_estimators = n_est)
    print "num of estimators", n_est, "5xCV Score: ", cross_val_score(BAgg_clf, all_X_feats, y, cv = 5).mean()

num of estimators 5 5xCV Score:  0.737610789402
num of estimators 10 5xCV Score:  0.745820724955
num of estimators 20 5xCV Score:  0.765462885571
num of estimators 30 5xCV Score:  0.759278932538
num of estimators 40 5xCV Score:  0.754520030764
num of estimators 50 5xCV Score:  0.760434549043
num of estimators 60 5xCV Score:  0.76293141676
num of estimators 70 5xCV Score:  0.760510558005
num of estimators 80 5xCV Score:  0.765712550655


Thirty looks pretty good, let's test accuracy with 20 trees compared to our test set.  

In [575]:
BAgg_clf_30 = BaggingClassifier(base_estimator = DTree_clf, n_estimators = 20)
BAgg_clf_30.fit(X_train_features, y_train)
y_pred_bclf3 = BAgg_clf_30.predict(X_test_features)
print "BAGGED TREE ACCURACY SCORE", accuracy_score(y_test, y_pred_bclf3)

BAGGED TREE ACCURACY SCORE 0.785714285714


Woah, that seems pretty good...too good to be true? Here's a slightly more sophisticated model with bootstrapped features to compare.

In [542]:
for n_est in [5,10,20,30,40,50,60,70,80]:
    BAgg_clf = BaggingClassifier(base_estimator=DTree_clf, 
                                 n_estimators=n_est, 
                                bootstrap_features=True)
    print 'number of estimators: ',n_est, '5x CV score: ', cross_val_score(BAgg_clf,all_X_feats ,y
                                                                           , cv=5).mean()


number of estimators:  5 5x CV score:  0.754401396056
number of estimators:  10 5x CV score:  0.757385198272
number of estimators:  20 5x CV score:  0.751897721566
number of estimators:  30 5x CV score:  0.748761901425
number of estimators:  40 5x CV score:  0.748729736087
number of estimators:  50 5x CV score:  0.760191690732
number of estimators:  60 5x CV score:  0.757375955743
number of estimators:  70 5x CV score:  0.759956857073
number of estimators:  80 5x CV score:  0.754594821848


In [543]:
BAgg_clf_50 = BaggingClassifier(base_estimator=DTree_clf,  n_estimators=50, bootstrap_features=True)
BAgg_clf_50.fit(X_train_features, y_train) 
y_pred_bclf5 = BAgg_clf_50.predict(X_test_features)
print "BAGGED TREE W/ FEAT BAGGING ACCURACY SCORE", accuracy_score(y_test, y_pred_bclf5)

BAGGED TREE W/ FEAT BAGGING ACCURACY SCORE 0.814285714286


OK, these seem to keep getting better, perhaps a random forest will rule them all

In [544]:
for n_est in [5,10,20,30,40,50,60,70,80]:
    rfClf = RandomForestClassifier(n_estimators=n_est)
    print 'number of estimators: ',n_est, '5x CV score: ', cross_val_score(rfClf,all_X_feats ,y
                                                                           , cv=5).mean()


number of estimators:  5 5x CV score:  0.708611168112
number of estimators:  10 5x CV score:  0.737517513259
number of estimators:  20 5x CV score:  0.73160299498
number of estimators:  30 5x CV score:  0.717466862862
number of estimators:  40 5x CV score:  0.714728354713
number of estimators:  50 5x CV score:  0.711628353545
number of estimators:  60 5x CV score:  0.703216967549
number of estimators:  70 5x CV score:  0.706074110407
number of estimators:  80 5x CV score:  0.708888627518


Meh, not great.

In [547]:
rfClf_10 = RandomForestClassifier(n_estimators=10)

rfClf_10.fit(X_train_features, y_train)
ypred_RF = rfClf_10.predict(X_test_features)

print "RANDOM FOREST ACCURACY SCORE", accuracy_score(y_test, ypred_RF)

RANDOM FOREST ACCURACY SCORE 0.771428571429


Lastly, let's see where an old school linear regression gets us on this one, using pipeline, just for kicks. 

In [25]:

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0]}
              ]

pl = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', LogisticRegression(random_state=0))])

gs_pl = GridSearchCV(pl, param_grid,
                           scoring='accuracy',
                           cv=2,
                           verbose=1,
                           n_jobs=-1)


report = cross_val_score(gs_pl, X, y, cv = 5)


In [581]:
print "CVx5 report" , report


 CVx5 report [ 0.76712329  0.8         0.74285714  0.75362319  0.75      ]


In [577]:
gs_pl.fit(X_train, y_train)
y_pred_GSPL = gs_pl.predict(X_test)
print "LR PIPELINE ACCURACY SCORE", accuracy_score(y_test, y_pred_GSPL)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.4s finished


LR PIPELINE ACCURACY SCORE 0.842857142857


### ALRIGHT, this all seems well and good, but we're data scientists and want to predict on data from the wild. So let's grab some 2016 movie data and get crazy. 

In [311]:
good_movies_url = "http://www.imdb.com/search/title?year=2016,2016&title_type=feature&sort=user_rating,desc"
good_movies_html = requests.get(good_movies_url)

In [312]:
good_movies_soup = BeautifulSoup(good_movies_html.text, "html.parser")

In [328]:
goodm_divs = good_movies_soup.findAll("div",{"class":"ribbonize"})
goodm_divs[0]

<div class="ribbonize" data-caller="filmosearch" data-tconst="tt5825058"></div>

In [330]:
#dir(goodm_divs[0].text)
goodm_divs[0]["data-tconst"]

u'tt5825058'

In [332]:
m2016_ids = []
for div_elm in goodm_divs:
    m2016_ids.append(div_elm['data-tconst'])

In [1]:
reviews_2016_as_text = []
review_2016_rating_as_int = []

for tconst in m2016_ids:
    try:
        reviews_for_movie_i = imdb.get_title_reviews(tconst, max_results=2)
        if reviews_for_movie_i is None:
            continue
        for movie in reviews_for_movie_i:
            reviews_2016_as_text.append(movie.text)
            review_2016_rating_as_int.append(movie.rating)
    except ConnectionError:
        print 'werent able to get movie', tconst

In [334]:
bad_movies_url = "http://www.imdb.com/search/title?year=2016,2016&title_type=feature&sort=user_rating,asc"
bad_movies_html = requests.get(bad_movies_url)
bad_movies_soup = BeautifulSoup(bad_movies_html.text, "html.parser")

In [337]:
badm_divs = bad_movies_soup.findAll("div",{"class":"ribbonize"})
bm2016_ids = []
for div_elm in badm_divs:
    bm2016_ids.append(div_elm['data-tconst'])

In [339]:
all_2016 = m2016_ids + bm2016_ids

In [553]:
reviews_2016_as_text = []
review_2016_rating_as_int = []

for tconst in all_2016:
    try:
        reviews_for_movie_i = imdb.get_title_reviews(tconst, max_results=2)
        if reviews_for_movie_i is None:
            continue
        for movie in reviews_for_movie_i:
            reviews_2016_as_text.append(movie.text)
            review_2016_rating_as_int.append(movie.rating)
    except ConnectionError:
        print 'werent able to get movie', tconst

In [554]:
print len(reviews_2016_as_text)
print len(review_2016_rating_as_int)

68
68


In [559]:
df_2016_movies = pd.DataFrame({'reviews': reviews_2016_as_text , 'ratings': review_2016_rating_as_int})
df_2016_movies = df_2016_movies.dropna()
df_2016_movies.head(2)

Unnamed: 0,ratings,reviews
0,10.0,"This is my first IMDb review, please excuse my..."
1,10.0,"In Bangladesh movie industry are not helpful, ..."


In [570]:
X_2016 = df_2016_movies['reviews']
y_2016 = df_2016_movies['ratings'].astype(int).values


In [589]:
# make_2016_features = TfidfVectorizer(stop_words='english', strip_accents = 'ascii')
make_2016_features.fit(X_train)
all_X_feats = make_2016_features.transform(X_train)
X_2016_features = make_2016_features.transform(X_2016)

In [599]:

ypred_2016_tclf = TreeClf_a.predict(X_2016_features)
print "DECISION TREEE ACCURACY SCORE", accuracy_score(y_2016, ypred_2016_tclf)

DECISION TREEE ACCURACY SCORE 0.131147540984


In [600]:
y_pred_bclf3 = BAgg_clf_30.predict(X_2016_features)
print "BAGGED TREE ACCURACY SCORE", accuracy_score(y_2016, y_pred_bclf3)

BAGGED TREE ACCURACY SCORE 0.0819672131148


In [601]:
# BAgg_clf_50 = BaggingClassifier(base_estimator=DTree_clf,  n_estimators=50, bootstrap_features=True)
# BAgg_clf_50.fit(X_train_features, y_train) 
y_pred_bclf5_16 = BAgg_clf_50.predict(X_2016_features)
print "BAGGED TREE W/ FEAT BAGGING ACCURACY SCORE", accuracy_score(y_2016, y_pred_bclf5_16)

BAGGED TREE W/ FEAT BAGGING ACCURACY SCORE 0.0819672131148


In [602]:
# rfClf_10 = RandomForestClassifier(n_estimators=10)
# rfClf_10.fit(X_train_features, y_train)
ypred_RF_16 = rfClf_10.predict(X_2016_features)

print "RANDOM FOREST ACCURACY SCORE", accuracy_score(y_2016, ypred_RF_16)

RANDOM FOREST ACCURACY SCORE 0.0983606557377


### Conclusions: 

These models are drastically overfit! They would benefit from increased regularization so that they generalize better. It's also worth considering that a classification technique would be better with fewer numerical targets. Instead of trying to map to values 1-10, a simpler classification scheme might map to values above 7 or under 7 and call these "good" or "bad". 