In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [3]:
# Load the reviews and discard the 'Unnamed: 0' column (presumably a row
# index that was saved into the CSV - confirm against the file).
df = pd.read_csv('reviews.csv').drop(columns='Unnamed: 0')

In [4]:
# Preview the first rows of the freshly loaded frame (rating, title, text).
df.head()

Unnamed: 0,rating,title,text
0,10,Some birds aren't meant to be caged.\n,The Shawshank Redemption is written and direct...
1,10,Tied for the best movie I have ever seen\n,Why do I want to write the 234th comment on Th...
2,10,An incredible movie. One that lives with you.\n,It is no wonder that the film has such a high ...
3,10,Don't Rent Shawshank.\n,I'm trying to save you money; this is the last...
4,10,This is How Movies Should Be Made\n,This movie is not your ordinary Hollywood flic...


In [5]:
# Summary statistics; rating is the only numeric column at this point.
df.describe()

Unnamed: 0,rating
count,70196.0
mean,7.869708
std,2.877414
min,1.0
25%,7.0
50%,9.0
75%,10.0
max,10.0


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(70196, 3)

In [7]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70196 entries, 0 to 70195
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  70196 non-null  int64 
 1   title   70196 non-null  object
 2   text    70196 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [8]:
# Shared NLP helpers, built once and reused for every review.
# WordNet-based lemmatizer (needs the NLTK WordNet corpus to be available).
lemmatizer = nltk.stem.WordNetLemmatizer()
# Tokenizer that splits purely on whitespace; punctuation stays on the token.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text, pos='n'):
    """Split ``text`` on whitespace and lemmatize every token with WordNet.

    Parameters
    ----------
    text : str
        Raw review text.
    pos : str, default 'n'
        WordNet part-of-speech tag forwarded to the lemmatizer. ``'n'``
        (noun) is NLTK's own default, so calling without ``pos`` reproduces
        the previous behaviour exactly. Treating every token as a noun
        mangles some verbs (the notebook output shows "has" -> "ha");
        pass e.g. ``'v'`` to lemmatize tokens as verbs instead.

    Returns
    -------
    list of str
        One lemma per whitespace-separated token, in the original order.
        Case is preserved and punctuation stays attached to its token
        (e.g. "money;"), because only whitespace is used for splitting.
    """
    tokens = w_tokenizer.tokenize(text)
    return [lemmatizer.lemmatize(token, pos=pos) for token in tokens]

In [9]:
# Lemmatize every review; each row becomes a list of token lemmas.
df['text_lemmatizer'] = df['text'].apply(lemmatize_text)

In [10]:
# Confirm the new text_lemmatizer column holds a list of tokens per review.
df.head()

Unnamed: 0,rating,title,text,text_lemmatizer
0,10,Some birds aren't meant to be caged.\n,The Shawshank Redemption is written and direct...,"[The, Shawshank, Redemption, is, written, and,..."
1,10,Tied for the best movie I have ever seen\n,Why do I want to write the 234th comment on Th...,"[Why, do, I, want, to, write, the, 234th, comm..."
2,10,An incredible movie. One that lives with you.\n,It is no wonder that the film has such a high ...,"[It, is, no, wonder, that, the, film, ha, such..."
3,10,Don't Rent Shawshank.\n,I'm trying to save you money; this is the last...,"[I'm, trying, to, save, you, money;, this, is,..."
4,10,This is How Movies Should Be Made\n,This movie is not your ordinary Hollywood flic...,"[This, movie, is, not, your, ordinary, Hollywo..."


In [11]:
# Collapse each token list back into one space-separated string.
# Rows that are not token lists (e.g. already-joined strings) are passed
# through unchanged so that re-running this cell is a no-op; without the
# guard, ' '.join would iterate a string's characters and turn "The" into
# "T h e" on a second execution.
df['text_lemmatizer'] = [
    ' '.join(map(str, tokens)) if isinstance(tokens, (list, tuple)) else tokens
    for tokens in df['text_lemmatizer']
]

In [13]:
# Remove the title column; it is not used by any later cell visible here.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: the in-place version raised KeyError once 'title' was gone.
df = df.drop(columns='title', errors='ignore')

In [14]:
# Check the frame after title was dropped; text_lemmatizer is now a string.
df.head()

Unnamed: 0,rating,text,text_lemmatizer
0,10,The Shawshank Redemption is written and direct...,The Shawshank Redemption is written and direct...
1,10,Why do I want to write the 234th comment on Th...,Why do I want to write the 234th comment on Th...
2,10,It is no wonder that the film has such a high ...,It is no wonder that the film ha such a high r...
3,10,I'm trying to save you money; this is the last...,I'm trying to save you money; this is the last...
4,10,This movie is not your ordinary Hollywood flic...,This movie is not your ordinary Hollywood flic...


In [15]:
# Inspect the rating column before deriving labels from it.
df['rating']

0        10
1        10
2        10
3        10
4        10
         ..
70191     1
70192     8
70193     2
70194     5
70195     7
Name: rating, Length: 70196, dtype: int64

In [16]:
def assign_label(rating, threshold=5):
    """Convert a numeric review rating into a binary label.

    Parameters
    ----------
    rating : int or float
        Review score (the data shown in this notebook ranges from 1 to 10).
    threshold : int or float, default 5
        Highest rating that is still labelled 0. The default keeps the
        original hard-coded cut-off.

    Returns
    -------
    int
        0 when ``rating <= threshold``, otherwise 1.

    Notes
    -----
    A NaN rating fails the ``<=`` comparison and is therefore labelled 1.
    """
    return 0 if rating <= threshold else 1

In [17]:
# Derive the binary label for every review from its rating.
df['label'] = df['rating'].map(assign_label)

In [18]:
# Verify that the label column was added next to rating.
df.head()

Unnamed: 0,rating,text,text_lemmatizer,label
0,10,The Shawshank Redemption is written and direct...,The Shawshank Redemption is written and direct...,1
1,10,Why do I want to write the 234th comment on Th...,Why do I want to write the 234th comment on Th...,1
2,10,It is no wonder that the film has such a high ...,It is no wonder that the film ha such a high r...,1
3,10,I'm trying to save you money; this is the last...,I'm trying to save you money; this is the last...,1
4,10,This movie is not your ordinary Hollywood flic...,This movie is not your ordinary Hollywood flic...,1
