In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# path under /content/drive/MyDrive used by the read_csv cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the labelled sentences from Drive. Column names are supplied here
# explicitly: first column is the sentence, second is its sentiment label.
DATASET_PATH = "/content/drive/MyDrive/Datasets/Document.csv"
COLUMN_NAMES = ["text", "label"]
df = pd.read_csv(DATASET_PATH, names=COLUMN_NAMES)

In [5]:
# Display the loaded frame to eyeball the text/label columns.
df

Unnamed: 0,text,label
0,I love this sandwich,pos
1,This is an amazing place,pos
2,I feel very good about these places,pos
3,This is my best work,pos
4,What an awesome view,pos
5,I do not like this restaurant,neg
6,I am tired of this stuff,neg
7,I can't deal with this,neg
8,He is my sworn enemy,neg
9,My boss is horrible,neg


In [6]:
# Check the (rows, columns) dimensions of the loaded frame.
df.shape

(18, 2)

In [7]:
# Show the raw string labels before they are encoded as integers.
df["label"]

0     pos
1     pos
2     pos
3     pos
4     pos
5     neg
6     neg
7     neg
8     neg
9     neg
10    pos
11    neg
12    pos
13    neg
14    pos
15    neg
16    pos
17    neg
Name: label, dtype: object

In [8]:
# Encode the string sentiment labels as integers for the classifier.
LABEL_TO_INT = {"pos": 1, "neg": 0}
df["numlabel"] = df["label"].map(LABEL_TO_INT)

# Series.map leaves NaN for any value that is not a key of the mapping
# (typo, different casing, stray whitespace). Fail here, naming the offending
# values, instead of handing NaN targets to the model further down.
unmapped_labels = df.loc[df["numlabel"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected label values: {list(unmapped_labels)}")

In [9]:
# Display the frame again to confirm the new numlabel column (pos -> 1, neg -> 0).
df

Unnamed: 0,text,label,numlabel
0,I love this sandwich,pos,1
1,This is an amazing place,pos,1
2,I feel very good about these places,pos,1
3,This is my best work,pos,1
4,What an awesome view,pos,1
5,I do not like this restaurant,neg,0
6,I am tired of this stuff,neg,0
7,I can't deal with this,neg,0
8,He is my sworn enemy,neg,0
9,My boss is horrible,neg,0


In [10]:
# NLTK pieces used by the preprocessing loop: the stopword corpus reader and
# a WordNet-based lemmatizer instance shared by every call to wl.lemmatize.
# NOTE(review): this cell runs before the nltk.download cells below; the
# recorded run succeeded in this order, but confirm from a fresh kernel.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [11]:
# Download the NLTK 'stopwords' corpus; the preprocessing loop reads the
# English list from it via stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Download the NLTK 'wordnet' corpus.
# Presumably required by the WordNetLemmatizer instance `wl` — TODO confirm.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
# Build the cleaned corpus, one string per row of df['text']:
# lowercase -> split on whitespace -> drop English stopwords -> lemmatise each
# remaining token as a verb (pos='v') -> re-join with single spaces.
#
# The stopword list is materialised once, as a set. Previously
# stopwords.words('english') was called again for every single token and
# membership was tested against the returned list.
#
# NOTE(review): stopword removal also strips negations; e.g. the negative row
# "I do not like this restaurant" becomes "like restaurant". Consider keeping
# negation words if that matters for the sentiment labels.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column's values rather than df['text'][i]: the latter is a
# label lookup and only works while the frame has a default 0..n-1 index.
for i, sentence in enumerate(df['text']):
    tokens = sentence.lower().split()
    kept = [wl.lemmatize(token, pos='v') for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(kept))
    # Progress marker: prints the position of each processed row.
    print(i, end = ' ')

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 

In [14]:
# Inspect the cleaned sentences produced by the preprocessing loop.
corpus

['love sandwich',
 'amaze place',
 'feel good place',
 'best work',
 'awesome view',
 'like restaurant',
 'tire stuff',
 "can't deal",
 'swear enemy',
 'boss horrible',
 'awesome place',
 'like taste juice',
 'love dance',
 'sick tire place',
 'great holiday',
 'bad locality stay',
 'good fun tomorrow',
 "go enemy's house today"]

In [16]:
# Attach the cleaned sentences next to the originals (row i of the frame gets
# corpus[i]) and display the result for a side-by-side check.
processed_column = pd.Series(corpus, index=df.index)
df['processed_text'] = processed_column
df

Unnamed: 0,text,label,numlabel,processed_text
0,I love this sandwich,pos,1,love sandwich
1,This is an amazing place,pos,1,amaze place
2,I feel very good about these places,pos,1,feel good place
3,This is my best work,pos,1,best work
4,What an awesome view,pos,1,awesome view
5,I do not like this restaurant,neg,0,like restaurant
6,I am tired of this stuff,neg,0,tire stuff
7,I can't deal with this,neg,0,can't deal
8,He is my sworn enemy,neg,0,swear enemy
9,My boss is horrible,neg,0,boss horrible


In [17]:
# Bag-of-words features: learn the vocabulary from the cleaned corpus and
# turn every sentence into a dense row of token counts.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split performed in a later cell, so held-out rows contribute to
# the vocabulary. Consider splitting first and fitting on the training rows.
cv = CountVectorizer()
X_cv = cv.fit_transform(corpus).toarray()
X_cv

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

In [18]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the same
# split is produced on every run.
numeric_labels = list(df['numlabel'])
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv,
    numeric_labels,
    test_size=0.2,
    random_state=0,
)

In [19]:
# Train a multinomial Naive Bayes classifier on the count features, then
# predict labels for the held-out rows.
model_mnb_cv = MultinomialNB().fit(X_train_cv, y_train_cv)
y_pred_mnb_cv = model_mnb_cv.predict(X_test_cv)

In [20]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score is already imported in the notebook's first cell, so the
# duplicate import that used to sit here has been dropped.
accuracy_cv = accuracy_score(y_test_cv, y_pred_mnb_cv)
print ("Accuracy using Count Vectorizer and Multinomial Naive Bayes: ", accuracy_cv)

Accuracy using Count Vectorizer and Multinomial Naive Bayes:  1.0
