# Aim:
I am curious to know wheather fresh or rotten tomatos can actually reflect reviews from critics.

# 1. Data import and preparation

In [3]:
import pandas as pd
df = pd.read_csv('critics.csv')

In [4]:
df.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
0,Owen Gleiberman,fresh,114709,Entertainment Weekly,,2011-09-07,9559,Toy story
1,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story


In [5]:
n_reviews = len(df)
n_critics = df.critic.unique().size
n_movies = df.rtid.unique().size

print("Number of reviews: {:d}".format(n_reviews))
print("Number of critics: {:d}".format(n_critics))
print("Number of movies:  {:d}".format(n_movies))

Number of reviews: 27631
Number of critics: 690
Number of movies:  2779


### Remove nulls in quote column

In [6]:
df = df[~df['quote'].isnull()]

In [7]:
df.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
1,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
5,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [8]:
n_reviews = len(df)
n_critics = df.critic.unique().size
n_movies = df.rtid.unique().size

print("Number of reviews: {:d}".format(n_reviews))
print("Number of critics: {:d}".format(n_critics))
print("Number of movies:  {:d}".format(n_movies))

Number of reviews: 15561
Number of critics: 623
Number of movies:  1921


### Vectorize reviews from quote column by "Bag of Words" as X
### Prepare labeled targets from fresh column as y

In [9]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
X = vect.fit_transform(df.quote)
y = (df.fresh == 'fresh').values.astype(np.int)

### Vectorize reviews from quote column by "Tfidf" as X
### Prepare labeled targets from fresh column as y

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect_tfidf = TfidfVectorizer(min_df=5)
X_tfidf = vect_tfidf.fit_transform(df.quote)
y_tfidf = (df.fresh == 'fresh').values.astype(np.int)

### 60% of reviews are positive in labeled targets

In [26]:
y.sum() / len(y)

0.6093438725017672

### Training and testing data from X and y

In [10]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1, stratify=y)

### Training and testing data from X_tfidf and y_tfidf

In [38]:
from sklearn.model_selection import train_test_split
xtrain_tfidf, xtest_tfidf, ytrain_tfidf, ytest_tfidf = train_test_split(X_tfidf, y_tfidf, random_state=1, stratify=y)

# 2. Modeling

### Multinomial Naive Bays with "Bag of Words" data

In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain, ytrain)

In [12]:
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print ("Accuracy on training data: %0.2f" % (training_accuracy))
print ("Accuracy on test data:     %0.2f" % (test_accuracy))

Accuracy on training data: 0.92
Accuracy on test data:     0.77


### Logistic regression with "Bag of Words" data

In [15]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression().fit(xtrain, ytrain)



In [16]:
log_predict = log.predict(xtest)

In [21]:
from sklearn.metrics import roc_auc_score

print('AUC: %0.2f' % (roc_auc_score(ytest, log_predict)))

AUC: 0.74


In [18]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = log.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['disappointing' 'worst' 'bland' 'fails' 'disappointment' 'lacks' 'lame'
 'unfortunately' 'unsatisfying' 'save']

Largest Coefs: 
['entertaining' 'rare' 'delight' 'trek' 'masterpiece' 'remarkable'
 'perfect' 'pleasure' 'delightful' 'absorbing']


### Logistic regression with "Tfidf" data

In [39]:
from sklearn.linear_model import LogisticRegression

log_tfidf = LogisticRegression().fit(xtrain_tfidf, ytrain_tfidf)



In [40]:
log_predict_tfidf = log_tfidf.predict(xtest_tfidf)

In [41]:
from sklearn.metrics import roc_auc_score

print('AUC: %0.2f' % (roc_auc_score(ytest_tfidf, log_predict_tfidf)))

AUC: 0.72


In [42]:
# get the feature names as numpy array
feature_names = np.array(vect_tfidf.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = log_tfidf.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]])) # largest to smallest

Smallest Coefs:
['bad' 'too' 'only' 'lacks' 'worst' 'fails' 'unfortunately' 'bland' 'dull'
 'nothing']

Largest Coefs: 
['entertaining' 'and' 'fun' 'masterpiece' 'performance' 'entertainment'
 'perfect' 'rare' 'still' 'best']
