In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [2]:
df = pd.read_csv('reviews.csv').drop('Unnamed: 0', axis=1)

In [3]:
df.head()

Unnamed: 0,rating,title,text
0,10,Some birds aren't meant to be caged.\n,The Shawshank Redemption is written and direct...
1,10,Tied for the best movie I have ever seen\n,Why do I want to write the 234th comment on Th...
2,10,An incredible movie. One that lives with you.\n,It is no wonder that the film has such a high ...
3,10,Don't Rent Shawshank.\n,I'm trying to save you money; this is the last...
4,10,This is How Movies Should Be Made\n,This movie is not your ordinary Hollywood flic...


In [4]:
df.describe()

Unnamed: 0,rating
count,70196.0
mean,7.869708
std,2.877414
min,1.0
25%,7.0
50%,9.0
75%,10.0
max,10.0


In [5]:
df.shape

(70196, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70196 entries, 0 to 70195
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  70196 non-null  int64 
 1   title   70196 non-null  object
 2   text    70196 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [7]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [8]:
df['text_lemmatizer'] = df.text.apply(lemmatize_text)

In [9]:
df.head()

Unnamed: 0,rating,title,text,text_lemmatizer
0,10,Some birds aren't meant to be caged.\n,The Shawshank Redemption is written and direct...,"[The, Shawshank, Redemption, is, written, and,..."
1,10,Tied for the best movie I have ever seen\n,Why do I want to write the 234th comment on Th...,"[Why, do, I, want, to, write, the, 234th, comm..."
2,10,An incredible movie. One that lives with you.\n,It is no wonder that the film has such a high ...,"[It, is, no, wonder, that, the, film, ha, such..."
3,10,Don't Rent Shawshank.\n,I'm trying to save you money; this is the last...,"[I'm, trying, to, save, you, money;, this, is,..."
4,10,This is How Movies Should Be Made\n,This movie is not your ordinary Hollywood flic...,"[This, movie, is, not, your, ordinary, Hollywo..."


In [10]:
df['text_lemmatizer'] = [' '.join(map(str, l)) for l in df['text_lemmatizer']]

In [11]:
df.drop('title', axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,rating,text,text_lemmatizer
0,10,The Shawshank Redemption is written and direct...,The Shawshank Redemption is written and direct...
1,10,Why do I want to write the 234th comment on Th...,Why do I want to write the 234th comment on Th...
2,10,It is no wonder that the film has such a high ...,It is no wonder that the film ha such a high r...
3,10,I'm trying to save you money; this is the last...,I'm trying to save you money; this is the last...
4,10,This movie is not your ordinary Hollywood flic...,This movie is not your ordinary Hollywood flic...


In [13]:
df.rating

0        10
1        10
2        10
3        10
4        10
         ..
70191     1
70192     8
70193     2
70194     5
70195     7
Name: rating, Length: 70196, dtype: int64

In [45]:
def assign_label(rating):
    if rating <= 9:
        return 0
    return 1

In [47]:
df['label'] = df['rating'].apply(assign_label)

In [48]:
df.head()

Unnamed: 0,rating,text_lemmatizer,label
0,10,The Shawshank Redemption is written and direct...,1
1,10,Why do I want to write the 234th comment on Th...,1
2,10,It is no wonder that the film ha such a high r...,1
3,10,I'm trying to save you money; this is the last...,1
4,10,This movie is not your ordinary Hollywood flic...,1


In [49]:
df.drop('text', axis=1, inplace=True)

KeyError: "['text'] not found in axis"

In [50]:
X, y = df.drop('label', axis=1), df['label']

In [51]:
df2 = X.copy().drop('rating', axis=1)

In [52]:
df2

Unnamed: 0,text_lemmatizer
0,The Shawshank Redemption is written and direct...
1,Why do I want to write the 234th comment on Th...
2,It is no wonder that the film ha such a high r...
3,I'm trying to save you money; this is the last...
4,This movie is not your ordinary Hollywood flic...
...,...
70191,This is a mind numbing movie. Boring OVERLY pr...
70192,Imagine if you could relive every date that yo...
70193,This movie is typical of the writing and today...
70194,Jordan Fisher's acting is the only reason I ga...


In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df2, y, test_size=0.2, random_state = 0)

In [54]:
X_train

Unnamed: 0,text_lemmatizer
6110,This review involves some unavoidable spoilers...
8844,The Shawshank Redemption begin with young Andy...
60033,"OK, so... I'm ""tainted"" by the old version fro..."
66049,Best film i ever watch at final everyoane star...
31980,Never before seen a movie which have this beau...
...,...
21243,What is the most resilient parasite? An Idea! ...
45891,The Return of the King is the best movie ever ...
42613,I rate 10 star for movie that I watch again an...
43567,The Godfather is most people favorite movie of...


In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tfidf = TfidfVectorizer()

tfidf.fit(X_train['text_lemmatizer'].values)

tfidf_matrix = tfidf.transform(X_train['text_lemmatizer'].values)

In [57]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

clf = nb.fit(tfidf_matrix, y_train)

In [58]:
clf.predict(tfidf.transform(['I hate this movie.']))

array([0], dtype=int64)

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
clf = DecisionTreeClassifier(min_samples_split=50).fit(tfidf_matrix, y_train)

In [63]:
clf.predict(tfidf.transform(['I hate this movie.']))

array([0], dtype=int64)

In [59]:
import xgboost

xgb = xgboost.XGBClassifier().fit(tfidf_matrix, y_train)

In [60]:
xgb.predict(tfidf.transform(['I hate this movie']))

array([1])

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [67]:
clf = RandomForestClassifier(n_estimators=45).fit(tfidf_matrix, y_train)

In [68]:
clf.predict(tfidf.transform(["I hate this movie"]))

array([1], dtype=int64)