In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Data cleaning function

In [2]:
import string
import spacy
from nltk.corpus import stopwords
import re

english_stopwords = stopwords.words('english')
nlp = spacy.load('en_core_web_sm')
punctuation = set(string.punctuation)

def data_cleaner(sentence):
    sentence = sentence.lower()
    for c in string.punctuation:
        sentence = sentence.replace(c, " ")
    document = nlp(sentence)
    sentence = ' '.join(token.lemma_ for token in document)
    sentence = ' '.join(word for word in sentence.split() if word not in english_stopwords)
    sentence = re.sub('\d', '', sentence)
    
    return sentence

## Import dataset

In [3]:
URL = "https://raw.githubusercontent.com/ProfAI/natural-language-processing/main/datasets/Lezione_5-sentiment_Analysis/"

In [6]:
dataset = pd.read_csv(URL+"IMDB%20Dataset.csv")
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [11]:
X = dataset["review"]
X.shape

(50000,)

In [12]:
y = dataset["sentiment"]
y.shape

(50000,)

In [14]:
print(len(y[y == 'positive']))
print(len(y[y == 'negative']))

25000
25000


## Data Cleaning

In [15]:
X_cleaned = []
for i in X:
    X_cleaned.append(data_cleaner(i))

In [16]:
X_cleaned

['one reviewer mention watch  oz episode hook right exactly happen I br br first thing strike I oz brutality unflinche scene violence set right word go trust I show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inward privacy high agenda em city home many aryans muslim gangsta latinos christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br I would say main appeal show due fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess around first episode I ever see strike I nasty surreal I say I ready I watch I develop taste oz get accustomed high level graphic violence violence injustice crooked guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison exper

#### Train Test Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, test_size=0.20, random_state=2)

#### Vectorize text

In [18]:
vec = CountVectorizer()

X_train = vec.fit_transform(X_train).toarray()
X_test = vec.transform(X_test).toarray()

## Model training - Naive Bayes 

In [19]:
model = MultinomialNB()

model.fit(X_train, y_train)

MultinomialNB()

### Model evaluation

In [20]:
model.score(X_test, y_test)

0.8601

In [21]:
sentence = "This notebook is very beautiful, fun and simple."
sentence_cleaned = data_cleaner(sentence)
sentence_countv = vec.transform([sentence_cleaned])
model.predict(sentence_countv)

array(['positive'], dtype='<U8')

In [22]:
model.predict_proba(sentence_countv)

array([[0.10886856, 0.89113144]])

In [23]:
sentence = "This notebook is useless and ugly."
sentence_cleaned = data_cleaner(sentence)
sentence_countv = vec.transform([sentence_cleaned])
model.predict(sentence_countv)

array(['negative'], dtype='<U8')

In [24]:
model.predict_proba(sentence_countv)

array([[0.89856741, 0.10143259]])