# Read data

In [33]:
import pandas as pd
df_train=pd.read_csv("movie_review_train.csv")
df_test=pd.read_csv("movie_review_test.csv")
print(df_train.head())
print(df_test.head())

  class                                               text
0   Pos   a common complaint amongst film critics is   ...
1   Pos   whew   this film oozes energy   the kind of b...
2   Pos   steven spielberg s   amistad     which is bas...
3   Pos   he has spent his entire life in an awful litt...
4   Pos   being that it is a foreign language film with...
  class                                               text
0   Pos   films adapted from comic books have had plent...
1   Pos   every now and then a movie comes along from a...
2   Pos   you ve got mail works alot better than it des...
3   Pos      jaws   is a rare film that grabs your atte...
4   Pos   moviemaking is a lot like being the general m...


# Identify and Clean up data

In [34]:
def convert(x):
    d={"Pos":1,"Neg":0}
    return d[x]

df_train["label"]=df_train["class"].apply(convert)
print(df_train.head())
print()
df_test["label"]=df_test["class"].apply(convert)
print(df_test.head())
print()
df_test=df_test.drop("class",axis=1)
print(df_test.head())
print()
x=df_test["text"]
y=df_test["label"]
print(x.shape)
print(y.shape)

  class                                               text  label
0   Pos   a common complaint amongst film critics is   ...      1
1   Pos   whew   this film oozes energy   the kind of b...      1
2   Pos   steven spielberg s   amistad     which is bas...      1
3   Pos   he has spent his entire life in an awful litt...      1
4   Pos   being that it is a foreign language film with...      1

  class                                               text  label
0   Pos   films adapted from comic books have had plent...      1
1   Pos   every now and then a movie comes along from a...      1
2   Pos   you ve got mail works alot better than it des...      1
3   Pos      jaws   is a rare film that grabs your atte...      1
4   Pos   moviemaking is a lot like being the general m...      1

                                                text  label
0   films adapted from comic books have had plent...      1
1   every now and then a movie comes along from a...      1
2   you ve got mail works 

# Train and Split data

In [35]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)
print(x_train.head())
print()
print(x_test.head())


82      a big surprise to me    the good trailer had ...
367     so   it s thirty years later   and oscar and ...
179     no filmmaker deconstructs a story as well as ...
27      the most common   and in many cases the only ...
89      robert benton has assembled a stellar   matur...
Name: text, dtype: object

398     this independent film written and directed by...
125     mimi leder is probably best known for her stu...
328     susan granger s review of   ghosts of mars   ...
339     if snake eyes were a dog   you d put it to sl...
172     a standoff    a man holds a woman   a diploma...
Name: text, dtype: object


# Feature extraction feature

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer(stop_words="english")
vect.fit(x)
print(vect.vocabulary_," ; ",len(vect.vocabulary_))
print()
dict=CountVectorizer(stop_words="english",min_df=0.3,max_df=0.8)
dict.fit(x)




# Get feature and transform

In [37]:
x_train_transformed_dict=dict.get_feature_names_out()
print(len(x_train_transformed_dict))
print()
x_train_transformed=vect.transform(x_train)
x_test_transformed=vect.transform(x_test)
print(type(x_train_transformed))
print(x_test_transformed)


46

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 3)	1
  (0, 11)	1
  (0, 61)	1
  (0, 96)	1
  (0, 271)	1
  (0, 440)	2
  (0, 448)	1
  (0, 555)	1
  (0, 694)	1
  (0, 792)	1
  (0, 1025)	1
  (0, 1059)	2
  (0, 1308)	1
  (0, 1492)	1
  (0, 1688)	1
  (0, 1691)	1
  (0, 1726)	1
  (0, 1769)	1
  (0, 1863)	1
  (0, 1872)	1
  (0, 2006)	1
  (0, 2056)	1
  (0, 2080)	1
  (0, 2197)	2
  (0, 2213)	1
  :	:
  (99, 16701)	1
  (99, 16903)	1
  (99, 17075)	1
  (99, 17096)	1
  (99, 17203)	2
  (99, 17255)	1
  (99, 17263)	1
  (99, 17416)	1
  (99, 17519)	1
  (99, 17573)	1
  (99, 17578)	1
  (99, 17741)	1
  (99, 17914)	1
  (99, 18021)	1
  (99, 18027)	1
  (99, 18149)	2
  (99, 18234)	1
  (99, 18462)	1
  (99, 18528)	1
  (99, 18576)	1
  (99, 18629)	4
  (99, 18818)	1
  (99, 18922)	1
  (99, 18924)	1
  (99, 19036)	1


# Naive Bayes

In [40]:

from sklearn.naive_bayes import BernoulliNB
mnb=BernoulliNB()
mnb.fit(x_train_transformed,y_train)
y_pred_class=mnb.predict(x_test_transformed)
y_pred_probab=mnb.predict_proba(x_test_transformed)
print(mnb)
print()
print("Length of y_test : ",len(y_test),'\n',"lenght of Predicted : ",len(y_pred_class))


BernoulliNB()

Length of y_test :  100 
 lenght of Predicted :  100


# Evaluation metrics

In [39]:

from sklearn import metrics
metrics.accuracy_score(y_test,y_pred_class)
metrics.confusion_matrix(y_test,y_pred_class)
confusion=metrics.confusion_matrix(y_test,y_pred_class[:100])
print("Confusion matrix : ",confusion)
print()
review=vect.transform(["Good Movie"])
res=mnb.predict(review)
print(res)

Confusion matrix :  [[43 15]
 [13 29]]

[0]
