In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# preprocessing the data

def preprocess_sentence(sentence):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    sentence = str(sentence)
    # Tokenize the sentence
    tokens = word_tokenize(sentence.lower())
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem each word
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    # Lemmatize each word
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    # Join the tokens back into a sentence
    cleaned_sentence = ' '.join(lemmatized_tokens)
    
    return cleaned_sentence


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Reading the csv file 

df = pd.read_csv("Review2.csv")
print(df.head())
print("Row Count: ")
print(df.shape[0])


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Row Count: 
14075


In [None]:
# preprocessing the data

df = df[["review", "sentiment"]]
print(df.head())

print(df["review"].dtype)
print(df["sentiment"].unique())

df["review"] = df["review"].apply(preprocess_sentence)

print("Data after Preprocessing: ")
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
object
['positive' 'negative']
Data after Preprocessing: 
                                              review sentiment
0  one review mention watch 1 oz episod 'll hook ...  positive
1  wonder littl product . < br / > < br / > film ...  positive
2  thought wonder way spend time hot summer weeke...  positive
3  basic 's famili littl boy ( jake ) think 's zo...  negative
4  petter mattei 's `` love time money '' visual ...  positive


In [None]:
# using Baye's Model to Predict

# dividing the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
print("X_train vectorized: ")
print(X_train)

X_test = vectorizer.transform(X_test)
print("X_train vectorized: ")
print(X_test)

# Train a Naive Bayes classifier on the training data
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Evaluate the performance of the classifier on the testing data
accuracy = nb.score(X_test, y_test)
# precision = nb.score(X_test, y_test == 1)
print("Accuracy:", accuracy)
# print("Precision:", precision)

X_train vectorized: 
  (0, 31990)	1
  (0, 13398)	3
  (0, 20941)	1
  (0, 25476)	1
  (0, 4959)	18
  (0, 41005)	1
  (0, 38019)	1
  (0, 26234)	2
  (0, 21157)	1
  (0, 21378)	1
  (0, 30884)	1
  (0, 22425)	1
  (0, 14397)	1
  (0, 34331)	1
  (0, 27060)	1
  (0, 14031)	1
  (0, 17701)	1
  (0, 26097)	1
  (0, 30855)	1
  (0, 32629)	1
  (0, 13519)	1
  (0, 26170)	1
  (0, 34851)	1
  (0, 1756)	2
  (0, 8463)	2
  :	:
  (11259, 38931)	1
  (11259, 41103)	1
  (11259, 11129)	1
  (11259, 26885)	1
  (11259, 29214)	4
  (11259, 18776)	1
  (11259, 34648)	1
  (11259, 22182)	1
  (11259, 25561)	1
  (11259, 22362)	1
  (11259, 15608)	1
  (11259, 32840)	1
  (11259, 18329)	1
  (11259, 34321)	1
  (11259, 34939)	1
  (11259, 40060)	1
  (11259, 26662)	1
  (11259, 27610)	1
  (11259, 33286)	1
  (11259, 2289)	1
  (11259, 31612)	1
  (11259, 18230)	1
  (11259, 12594)	1
  (11259, 26566)	1
  (11259, 22697)	1
X_train vectorized: 
  (0, 2051)	1
  (0, 2530)	1
  (0, 4006)	1
  (0, 4123)	1
  (0, 4474)	1
  (0, 4959)	8
  (0, 6511)	1
  (0, 1

In [58]:
# using my_model to predict the reviews

# the method Used:  score = Summation of(1 + log(tf)) and then take the average
df_working = df.head(1000)

def basemodel(r):
    rs = 42  + 2 * r
    X_train, X_test, y_train, y_test = train_test_split(df_working['review'], df_working['sentiment'], test_size=0.2, random_state=rs)
    # evaluting the vocabulary
    vocab = set()
    for sentence in X_train:
        words = sentence.split()
        for word in words:
            vocab.add(word)

    # Create a dictionary to store the count of each word in each class
    # array of index = word for all labels
    # count how many time a word occured in each labels

    class_word_counts = {}
    for c in np.unique(y_train):
        class_word_counts[c] = {}
        for word in vocab:
            class_word_counts[c][word] = 0

    # Count the number of occurrences of each word in each class
    for i in range(len(X_train)):
        words = X_train.iloc[i].split()
        c = y_train.iloc[i]
        for word in words:
            class_word_counts[c][word] += 1

    doc_freq = {}

    for word in vocab:
      doc_freq[word] = 0
      for c in np.unique(y_train):
        if class_word_counts[c][word] != 0:
          doc_freq[word] += 1


    # Compute the total count of words in each class
    class_word_totals = {}
    for c in np.unique(y_train):
        class_word_totals[c] = sum(class_word_counts[c].values())

    # Define a function to predict the class of a new text sample
    def predict(text):
        text = preprocess_sentence(text)
        words = text.split()
        probs = {}
        avg_score = 0
        for c in np.unique(y_train):
            score = 0
            for word in words:
              if word in vocab:
                if class_word_counts[c][word] != 0:
                  score += (1 + np.log(class_word_counts[c][word]))
                else:
                  score += 0

            probs[c] = score
            avg_score += score
        return max(probs, key=probs.get)

    # Evaluate the performance of the classifier on the testing data
    correct = 0
    hate_correct = 0
    for i in range(len(X_test)):
        pred = predict(X_test.iloc[i])
        if pred == y_test.iloc[i]:
            correct += 1
            if pred == 'negative':
              hate_correct += 1
    accuracy = correct / len(y_test)
    precision = hate_correct / (len(y_test == 'negative'))
    print("*************************************************")
    print("Test no: ", end="")
    print(r)
    print("Accuracy:", accuracy)
    print("precision:", precision)
    print("test size: ", len(y_test))
    print("correct: ", correct)
    print("Total negative Reviews:", len(y_test == 'negative'))
    print("detected negative Reviews:", hate_correct)
    print("*************************************************")
    return accuracy, precision


avg_accuracy = 0
avg_precision = 0
for i in range(10):
  accuracy, precision = basemodel(i)
  avg_accuracy += accuracy
  avg_precision += precision

print("average accuracy:", avg_accuracy / 10)
print("average precision:", avg_precision/ 10)

*************************************************
Test no: 0
Accuracy: 0.805
precision: 0.41
test size:  200
correct:  161
Total negative Reviews: 200
detected negative Reviews: 82
*************************************************
*************************************************
Test no: 1
Accuracy: 0.775
precision: 0.385
test size:  200
correct:  155
Total negative Reviews: 200
detected negative Reviews: 77
*************************************************
*************************************************
Test no: 2
Accuracy: 0.72
precision: 0.395
test size:  200
correct:  144
Total negative Reviews: 200
detected negative Reviews: 79
*************************************************
*************************************************
Test no: 3
Accuracy: 0.735
precision: 0.35
test size:  200
correct:  147
Total negative Reviews: 200
detected negative Reviews: 70
*************************************************
*************************************************
Test no: 4
Accuracy: 0.69