## IMDB dataset, Sentiment Analysis

In [2]:
#import packages
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 


In [3]:
# load Dataset
text = pd.read_csv('train.csv')

In [4]:
text.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [5]:
text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


### data analysis
 to make sure whether the data is balance or not

In [6]:
text['label'].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

## Data Cleaning

In [7]:
#Removing Punctuation
def cleaning (text):
    text = text.lower()
    text =  re.sub(r'@\S+', '',text)  # remove twitter handles
    text =  re.sub(r'http\S+', '',text) # remove urls
    text =  re.sub(r'pic.\S+', '',text)
    text =  re.sub(r"[^a-zA-ZáéíóúÁÉÍÓÚ']", ' ',text) # only keeps characters
    text =  re.sub(r'\s+[a-zA-ZáéíóúÁÉÍÓÚ]\s+', ' ', text+' ')  # keep words with len

    
    return text

In [8]:
# apply the function to our data
text['text'] = text['text'].apply(cleaning)

In [9]:
text['text']

0        i grew up watching and loving the thunderbirds...
1        when put this movie in my dvd player  and sat ...
2        why do people who do not know what particular ...
3        even though have great interest in biblical mo...
4        im die hard dads army fan and nothing will eve...
                               ...                        
39995     western union  is something of forgotten clas...
39996    this movie is an incredible piece of work  it ...
39997    my wife and watched this movie because we plan...
39998    when first watched flatliners was amazed  it h...
39999    why would this film be so good  but only gross...
Name: text, Length: 40000, dtype: object

In [10]:
#train, test split
X_train, X_test, y_train, y_test = train_test_split(text.text, text.label)

## Feature Extraction

In [11]:
# count the words and store it in a matrix
vectorizer = TfidfVectorizer()
X_train_count = vectorizer.fit_transform(X_train.values)

In [12]:
X_train_count.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# train model


### Naive Bayes

In [13]:
model = MultinomialNB()

In [14]:
model.fit(X_train_count, y_train)

In [16]:
# pre-test 1
text1 = ['this was very good and cool']
text1_count = vectorizer.transform(text1)
model.predict(text1_count)
# the 1 array shows that the model evaluate this text as a posetive sentence

array([1])

In [17]:
# pre-test 0 
text2 = ['i didnt like that, it was boring and i just waste my time']
text2_count = vectorizer.transform(text2)
model.predict(text2_count)
# the 0 array shows that the model evaluate this text as a negative sentence

array([0])

In [18]:
# Test Model(Naive bayes)
X_test_count = vectorizer.transform(X_test)
pred = model.predict(X_test_count)

In [19]:
print(accuracy_score(pred, y_test))

0.8643


### LogisticRegression

In [20]:
lr = LogisticRegression()
lr.fit(X_train_count, y_train)

In [21]:
### Test Model(LogisticRegression)
X_test_count = vectorizer.transform(X_test)
pred = lr.predict(X_test_count)

In [22]:
print(accuracy_score(pred, y_test))

0.8917
