## DATA PREPROCESSING

In [5]:
import numpy as np
import pandas as pd

In [6]:
df=pd.read_csv('datasets/reviews.csv')

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### Tasks for data cleaning
- sample is reduced to 10000
- remove html tags
- remove special characters
- convert to lower case
- remove stopwords
- stemming

In [9]:
df=df.sample(10000)

In [10]:
df.shape

(10000, 2)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 49060 to 30892
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [12]:
df['sentiment'].replace({'positive':1,'negative':0}, inplace=True)

In [13]:
df.head()

Unnamed: 0,review,sentiment
49060,"Unentertaining, uninvolving hybrid of ""Cruel I...",0
76,The Last Hard Men finds James Coburn an outlaw...,1
43507,It's hard to know what was going through Per K...,0
26702,"I have heard about this novel a long time ago,...",0
20963,Enjoyed 'Den brysomme mannen' http://ow.ly/PTT...,1


In [14]:
import re
clean=re.compile('<.*?>')
re.sub(clean, '', df.iloc[2].review)

'It\'s hard to know what was going through Per Kristensen and Morten Lindberg\'s heads when they wrote "Gayniggers from Outer Space" - the movie is billed as a comedy, yet there are no real jokes beside the crude character names (Capt. B. Dick, Sgt. Shaved Balls). The rest of the movie is a (presumably) unintentionally funny affair with ridiculously unsynchronised voice-overs (with the \'actors\' basically reading their lines with no hint of feeling), \'futuristic\' computer displays filled with spelling mistakes, and a plot that makes almost no sense.Even though 65% of viewers have given this movie a perfect 10 out of 10, this is the complete opposite of what a good film is. It may be ENTERTAINING to watch with some friends, but this film only deserves about a \'2\' out of ten...slightly higher than the lower possible rating only because of the sheer fact that the writers somehow managed to get some black guys to star in this movie.'

In [15]:
# function to clean html tags
def clean_html(text):
    clean=re.compile('<.*?>')
    return re.sub(clean, '', text)

In [16]:
df['review']=df['review'].apply(clean_html)

In [17]:
# function to convert to lower case
def convert_lower(text):
    return text.lower()

In [18]:
df['review']=df['review'].apply(convert_lower)

In [19]:
# function to remove special characters
def remove_special(text):
    x= ''
    for i in text:
        if i.isalnum():
            x = x+i
        else:
            x = x+' '
    return x

In [20]:
df['review']=df['review'].apply(remove_special)

In [21]:
import nltk
from nltk.corpus import stopwords

In [22]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
# function for removing stopwords
def remove_stopwords(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    
    return y

In [24]:
df['review']=df['review'].apply(remove_stopwords)
df

Unnamed: 0,review,sentiment
49060,"[unentertaining, uninvolving, hybrid, cruel, i...",0
76,"[last, hard, men, finds, james, coburn, outlaw...",1
43507,"[hard, know, going, per, kristensen, morten, l...",0
26702,"[heard, novel, long, time, ago, many, friends,...",0
20963,"[enjoyed, den, brysomme, mannen, http, ow, ly,...",1
...,...,...
3402,"[well, made, film, set, early, 60s, communist,...",1
8375,"[really, enjoyed, film, tremendous, interest, ...",1
10498,"[like, singin, rain, cover, girl, trio, two, g...",0
4880,"[friend, mine, loves, tacky, horror, films, of...",0


In [25]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [26]:
# function for stemming
y=[]
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    
    return z
    

In [27]:
df['review']=df['review'].apply(stem_words)
df

Unnamed: 0,review,sentiment
49060,"[unentertain, uninvolv, hybrid, cruel, intent,...",0
76,"[last, hard, men, find, jame, coburn, outlaw, ...",1
43507,"[hard, know, go, per, kristensen, morten, lind...",0
26702,"[heard, novel, long, time, ago, mani, friend, ...",0
20963,"[enjoy, den, brysomm, mannen, http, ow, ly, pt...",1
...,...,...
3402,"[well, made, film, set, earli, 60, communist, ...",1
8375,"[realli, enjoy, film, tremend, interest, ameri...",1
10498,"[like, singin, rain, cover, girl, trio, two, g...",0
4880,"[friend, mine, love, tacki, horror, film, ofte...",0


In [28]:
# join
def join(list_input):
    return " ".join(list_input)

In [29]:
df['review']=df['review'].apply(join)
df

Unnamed: 0,review,sentiment
49060,unentertain uninvolv hybrid cruel intent wild ...,0
76,last hard men find jame coburn outlaw long sen...,1
43507,hard know go per kristensen morten lindberg he...,0
26702,heard novel long time ago mani friend recommen...,0
20963,enjoy den brysomm mannen http ow ly pttp wife ...,1
...,...,...
3402,well made film set earli 60 communist yugoslav...,1
8375,realli enjoy film tremend interest american hi...,1
10498,like singin rain cover girl trio two guy girl ...,0
4880,friend mine love tacki horror film often get s...,0


### TFIDF vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['review']).toarray()
tfidf_matrix.shape


NameError: name 'df' is not defined

In [31]:
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
tfidf_matrix[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [33]:
tfidf_matrix[0].max()

0.35522638751670155

In [34]:
y=df.iloc[:,-1].values
y

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.20)

NameError: name 'tfidf_matrix' is not defined

In [1]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

NameError: name 'X_train' is not defined

In [89]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
cl1 = GaussianNB()
cl2 = MultinomialNB()
cl3 = BernoulliNB()

In [90]:
cl1.fit(X_train,y_train)
cl2.fit(X_train,y_train)
cl3.fit(X_train,y_train)

BernoulliNB()

In [92]:
from sklearn.metrics import accuracy_score
print("GaussianNB ",accuracy_score(y_test,cl1.predict(X_test)))
print("MultinomialNB ",accuracy_score(y_test,cl2.predict(X_test)))
print("BernoulliNB ",accuracy_score(y_test,cl3.predict(X_test)))

GaussianNB  0.617
MultinomialNB  0.845
BernoulliNB  0.83


### Bag of words count vectorizer

In [93]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer() # change the values

In [94]:
X=cv.fit_transform(df['review']).toarray()

In [95]:
X.shape

(10000, 36198)

In [96]:
Y=df.iloc[:,-1].values

In [97]:
Y.shape

(10000,)

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

In [99]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [100]:
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [101]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

BernoulliNB()

In [102]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [103]:
from sklearn.metrics import accuracy_score

In [104]:
print("GaussianNB ",accuracy_score(y_test,y_pred1))
print("MultinomialNB ",accuracy_score(y_test,y_pred2))
print("BernoulliNB ",accuracy_score(y_test,y_pred3))

GaussianNB  0.628
MultinomialNB  0.846
BernoulliNB  0.853
