In [107]:
import numpy as np
from nltk.corpus import movie_reviews,stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
import random
import string
from sklearn.metrics import confusion_matrix,classification_report

In [108]:
movie_reviews.categories(),len(movie_reviews.fileids())

(['neg', 'pos'], 2000)

In [109]:
movie_reviews.words(movie_reviews.fileids()[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [110]:
n=movie_reviews.words(movie_reviews.fileids()[1])
n,movie_reviews.categories(movie_reviews.fileids()[1])

(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], ['neg'])

In [111]:
reviews=[]
for file_id in movie_reviews.fileids():
    reviews.append((movie_reviews.words(file_id),movie_reviews.categories(file_id)[0]))
reviews[999]

(['two', 'party', 'guys', 'bob', 'their', 'heads', 'to', ...], 'neg')

In [112]:
random.shuffle(reviews)
x=[]
y=[]
for i in range(len(reviews)):
    x.append(reviews[i][0])
    # y.append(reviews[i][1])
    if reviews[i][1]=='neg':
        y.append(0)
    else:
        y.append(1)

In [113]:
x,y

([['the', 'high', 'school', 'comedy', 'seems', 'to', ...],
  ['expand', 'the', 'final', 'fifteen', 'minutes', 'of', ...],
  ['not', 'since', '1996', "'", 's', 'shine', ',', ...],
  ['perhaps', 'it', "'", 's', 'time', 'for', 'me', 'to', ...],
  ['rated', ':', 'r', 'for', 'strong', 'language', ',', ...],
  ['what', 'i', 'look', 'for', 'in', 'a', 'movie', 'is', ...],
  ['first', 'impressions', ':', 'critically', ',', 'a', ...],
  ['the', '"', 'fab', '4', '"', 'of', 'ronald', 'reagan', ...],
  ['"', 'with', 'all', 'that', 'education', ',', 'you', ...],
  ['john', 'von', 'neumann', ',', 'progenitor', 'of', ...],
  ['i', 'have', 'little', 'against', 'remakes', 'and', ...],
  ['let', 'me', 'open', 'this', 'one', 'with', 'a', ...],
  ['one', 'of', 'the', 'best', 'things', 'about', 'my', ...],
  ['now', 'that', '"', 'boogie', 'nights', '"', 'has', ...],
  ['when', 'i', 'first', 'heard', 'of', 'contact', ',', ...],
  ['the', 'last', 'of', 'vampire', '-', 'films', ...],
  ['"', 'the', 'faculty', 

In [114]:
from itertools import count


class NLP:

    def __init__(self):
        self.features=[]
        self.dictionary={}
        self.stop_words=stopwords.words('english')
        self.stop_words+=list(string.punctuation)

    def probability(self,x,current_class):   # current class is different possible values of y
        output=np.log(self.dictionary[current_class]["total_count"])-np.log(self.dictionary["total_data"])
        feature_count=self.features.shape[0]
        for j in range(feature_count):
            fj=x[j]
            count_current_class_with_feature_fj=self.dictionary[current_class][j+1][fj]+1
            count_current_class=self.dictionary[current_class]["total_count"]+len(self.dictionary[current_class][j+1].keys())
            current_xj_probability=np.log(count_current_class_with_feature_fj) - np.log(count_current_class)
            output=output+current_xj_probability
        return output

    def predictSinglePoint(self,x):
        classes=self.dictionary.keys()
        best_p=-1
        best_class=-1
        first_run=True
        for current_class in classes:       # current class is different possible values of y
            if(current_class=="total_data"):
                continue
            p_current_class=self.probability(x,current_class)
            print("current_class=",current_class," ,  prob=",p_current_class)
            if(first_run==True or p_current_class>best_p):
                best_p=p_current_class
                best_class=current_class
            first_run=False
        # print("best_class=",best_class,"  best prob=",best_p)
        return best_class

    def predict(self,x_test):
        y_per=[]
        x_test=[self.get_clean_words(x) for x in x_test]
        x_test=self.gen_data(x_test)
        x_test=np.array(x_test)
        for x in x_test:
            x_class=self.predictSinglePoint(x)
            y_per.append(x_class)
        return y_per

    def get_dictionary(self):
        return self.dictionary

    def gen_dictionary(self,x_train,y_train):
        x_train=np.array(x_train)
        y_train=np.array(y_train)
        self.dictionary={}
        class_values=set(y_train)
        
        feature_count=self.features.shape[0]
        for current_class in class_values:
            self.dictionary["total_data"]=len(y_train)
            self.dictionary[current_class]={}
            current_class_rows=(y_train==current_class)
            x_train_current=x_train[current_class_rows]
            y_train_current=y_train[current_class_rows]
            self.dictionary[current_class]["total_count"]=len(y_train_current)
            for j in range(feature_count):
                self.dictionary[current_class][j+1]={}
                all_possible_values=(0,1,2,3) #set(x_train[:,j])
                for current_value in all_possible_values:
                    self.dictionary[current_class][j+1][current_value]=x_train_current[x_train_current[:,j]==current_value].sum()
        print(self.dictionary)

    def makelabelled(self,column):
        row_count=len(column)
        second_limit=column.mean()
        first_limit=0.5*second_limit
        third_limit=1.5*second_limit
        for i in range(row_count):
            if(column[i]<first_limit):
                column[i]=0
            elif(column[i]<second_limit):
                column[i]=1
            elif(column[i]<third_limit):
                column[i]=2
            else:
                column[i]=3
        return column

    def count(self,feature,x):
        c=0
        for word in x:
            if word==feature:
                c+=1
        return c

    def gen_data(self,data):
        data_new=[]
        for x in data:
            x_new=[]
            for feature in self.features:
                x_new.append(self.count(feature,x))
            data_new.append(x_new)
        data_new=np.array(data_new)
        temp=[]
        for i in range(len(self.features)):
            # print("                    ",data_new[:,i])
            k=data_new[:,i]
            # print(k)
            temp.append(self.makelabelled(k))
        temp=np.array(temp)
        data_new=temp.T
        return data_new

    def get_simple_pos(self,tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADJ
        else:
            return wordnet.NOUN

    def get_clean_words(self,words):
        lemmatizer=WordNetLemmatizer()
        clean_words=[]
        for w in words:
            if w not in self.stop_words:
                tag=pos_tag([w])
                word=lemmatizer.lemmatize(w,self.get_simple_pos(tag[0][1]))
                clean_words.append(word.lower())
        return clean_words
    
    def generate_features(self,data):
        data=np.array(data)
        self.features=[]
        frequency_list=[]
        l=len(data)
        for i in range(l):
            text=data[i]
            # text=str(text)
            words,frequency=np.unique(text,return_counts=True)
            l1=len(frequency)
            for j in range(l1):
                if(frequency[j]>6):
                    self.features.append(words[j])
        self.features=list(dict.fromkeys(self.features))

    def fit(self,x_train,y_train):
        x_train=[self.get_clean_words(x) for x in x_train]
        all_words=[]
        # for words in x_train:
        #     all_words+=words
        # freq=FreqDist(all_words)
        # common=freq.most_common(300)
        # for i in common:
        #     if i[0].lower() not in self.stop_words:
        #         self.features.append(i[0])
        # self.features=np.array(self.features)
        self.generate_features(x_train)
        self.features=np.array(self.features)
        x_train=self.gen_data(x_train)
        print(x_train.shape)
        self.gen_dictionary(x_train,y_train)
        

In [115]:
x_train=x[:1500]
y_train=y[:1500]

x_test=x[1500:]
y_test=y[1500:]

In [116]:
nlp=NLP()
nlp.fit(x_train,y_train)
dictionary=nlp.get_dictionary()

  data=np.array(data)


(1500, 1234)
{'total_data': 1500, 0: {'total_count': 756, 1: {0: 43330, 1: 44185, 2: 29593, 3: 36324}, 2: {0: 152581, 1: 0, 2: 0, 3: 851}, 3: {0: 141964, 1: 0, 2: 0, 3: 11468}, 4: {0: 56145, 1: 35884, 2: 25071, 3: 36332}, 5: {0: 30605, 1: 33232, 2: 30371, 3: 59224}, 6: {0: 137701, 1: 0, 2: 0, 3: 15731}, 7: {0: 62919, 1: 32266, 2: 24305, 3: 33942}, 8: {0: 45529, 1: 38945, 2: 26051, 3: 42907}, 9: {0: 98984, 1: 0, 2: 0, 3: 54448}, 10: {0: 141509, 1: 0, 2: 0, 3: 11923}, 11: {0: 142135, 1: 0, 2: 0, 3: 11297}, 12: {0: 139017, 1: 0, 2: 0, 3: 14415}, 13: {0: 66416, 1: 0, 2: 48044, 3: 38972}, 14: {0: 105254, 1: 0, 2: 14424, 3: 33754}, 15: {0: 98947, 1: 0, 2: 0, 3: 54485}, 16: {0: 153191, 1: 0, 2: 0, 3: 241}, 17: {0: 151763, 1: 0, 2: 0, 3: 1669}, 18: {0: 69303, 1: 0, 2: 46447, 3: 37682}, 19: {0: 152884, 1: 0, 2: 0, 3: 548}, 20: {0: 137696, 1: 0, 2: 0, 3: 15736}, 21: {0: 136583, 1: 0, 2: 0, 3: 16849}, 22: {0: 27715, 1: 34942, 2: 33190, 3: 57585}, 23: {0: 152868, 1: 0, 2: 0, 3: 564}, 24: {0: 37427

In [117]:
y_pred=nlp.predict(x_test)

current_class= 0  ,  prob= 4813.473760808131
current_class= 1  ,  prob= 5101.807634387496
current_class= 0  ,  prob= 4795.910699353783
current_class= 1  ,  prob= 5067.276259843702
current_class= 0  ,  prob= 4812.349042667719
current_class= 1  ,  prob= 5097.442487906808
current_class= 0  ,  prob= 4756.203355251467
current_class= 1  ,  prob= 5034.127523050496
current_class= 0  ,  prob= 4767.447548561247
current_class= 1  ,  prob= 5052.408813229045
current_class= 0  ,  prob= 4808.529473489782
current_class= 1  ,  prob= 5076.528932171403
current_class= 0  ,  prob= 4702.270667486884
current_class= 1  ,  prob= 4982.66683599861
current_class= 0  ,  prob= 4831.244394140987
current_class= 1  ,  prob= 5109.170068689683
current_class= 0  ,  prob= 4859.566235795123
current_class= 1  ,  prob= 5146.471601056856
current_class= 0  ,  prob= 4809.206725498301
current_class= 1  ,  prob= 5085.782718375164
current_class= 0  ,  prob= 4735.898659747278
current_class= 1  ,  prob= 5015.566124144712
current_cla

In [118]:
from sklearn import naive_bayes
clf=naive_bayes()

TypeError: 'module' object is not callable

[[  0 226]
 [  0 274]]


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       226
           1       0.55      1.00      0.71       274

    accuracy                           0.55       500
   macro avg       0.27      0.50      0.35       500
weighted avg       0.30      0.55      0.39       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
