In [229]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

### Data reading

In [230]:
data = pd.read_csv("Sentiments.csv", sep=';', names = ['Opinion', 'Sentiment'],  encoding= 'unicode_escape')
df = pd.DataFrame(data)
# shows top 10 rows
df.head(10)

Unnamed: 0,Opinion,Sentiment
0,This is wanderful movie,positive
1,I love this movie,positive
2,It is realy great,positive
3,The movie was horrible,negative
4,I do not recommend,negative
5,It was so bad,negative
6,Never again,negative
7,it is ok,positive
8,it is big pleasure,positive
9,I had never watched so poor movie,negative


### How many cells with positive sentiment?

In [231]:
len(df[df.Sentiment == 'positive'])

17

### Change positive to 1 and negative to 0

In [232]:
df.loc[df['Sentiment'] == 'positive', 'Sentiment']=1
df.loc[df['Sentiment'] == 'negative', 'Sentiment']=0

In [233]:
df.head()

Unnamed: 0,Opinion,Sentiment
0,This is wanderful movie,1
1,I love this movie,1
2,It is realy great,1
3,The movie was horrible,0
4,I do not recommend,0


### Separate Opinion from Sentiment

In [234]:
df_x =df['Opinion']
df_y = df['Sentiment']

In [235]:
df_x

0                              This is wanderful movie 
1                                     I love this movie
2                                     It is realy great
3                                The movie was horrible
4                                    I do not recommend
5                                         It was so bad
6                                           Never again
7                                              it is ok
8                                    it is big pleasure
9                     I had never watched so poor movie
10           I had so much pleasure watching this movie
11                                        It was a crap
12                    this thing is absolute cheap crap
13                              waste of money and time
14    This movie is an amazing depiction of everythi...
15                          Predictable and pretentious
16    Great film watched it twice, A mind blowing st...
17    his film really deserves five stars for ex

### CountVectorizer


In [236]:
vectorizer = CountVectorizer()

### Dividing into subsets of training data and test data

In [237]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.25, random_state = 4)

### Changing text into matrix

In [238]:
x_train_countvectorizer = vectorizer.fit_transform(x_train)

In [239]:
print(x_train[0])
print(x_train_countvectorizer[0])

This is wanderful movie 
  (0, 95)	1
  (0, 50)	2
  (0, 25)	1
  (0, 76)	1
  (0, 74)	1
  (0, 87)	1
  (0, 5)	1
  (0, 17)	1
  (0, 82)	1
  (0, 79)	1
  (0, 83)	1


In [240]:
x_train_countvectorizer.toarray()


array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Clasification

In [241]:
mnb = MultinomialNB()

In [242]:
y_train = y_train.astype('int')

### Fit vector to sentiment

In [243]:
mnb.fit(x_train_countvectorizer, y_train) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Change test opinion into wector

In [244]:
x_test_countvectorizer = vectorizer.transform(x_test)

### Predict using clasifier test data

In [245]:
pred = mnb.predict(x_test_countvectorizer)

In [246]:
pred

array([0, 1, 1, 1, 0, 1, 1, 1])

### Change into array test sentiments

In [247]:
rezult = np.array(y_test)

In [248]:
rezult

array([0, 1, 1, 0, 0, 1, 1, 1], dtype=object)

#### For all predicted data

In [249]:
count = 0

In [250]:
for i in range(len(pred)):
    if pred[i] == rezult[i]:
        count += 1        

In [251]:
count

7

In [252]:
len(pred)

8

#### Precyzja metody pomiaru – stopień zgodności między wynikami uzyskanymi w określonych warunkach z wielokrotnych pomiarów tej samej wielkości.

In [253]:
accuracy = count/len(pred)

In [254]:
accuracy

0.875