In [3]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import re
import os

# Read & Preprocess data

In [4]:
# Open the bz2-compressed train and test review files.
# NOTE(review): these handles are never closed explicitly; a later cell only
# deletes the names after reading the lines.
train_file = bz2.BZ2File('./amazonreviews_dataset/7/train.ft.txt.bz2')
test_file = bz2.BZ2File('./amazonreviews_dataset/7/test.ft.txt.bz2')

## Create Lists containing Train & Test sentences

In [5]:
# Read every line of both files into memory at once (one list element per line).
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

In [6]:
# The file objects are no longer needed once the lines are in memory:
# drop the names and trigger a garbage-collection pass.
del train_file, test_file
gc.collect()

972

## Convert from raw binary strings to strings that can be parsed

In [7]:
# Turn each raw bytes line into a UTF-8 decoded str so it can be split and cleaned.
train_file_lines = [str(raw_line, 'utf-8') for raw_line in train_file_lines]
test_file_lines = [str(raw_line, 'utf-8') for raw_line in test_file_lines]

In [8]:
# Patterns are compiled once and written as raw strings ('\d' in a plain
# string literal is an invalid escape sequence and triggers a warning).
_DIGIT_RE = re.compile(r'\d')
_URL_RE = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
# Cheap substring hints used to decide whether the URL regex is worth running.
_URL_HINTS = ('www.', 'http:', 'https:', '.com')


def _parse_labels(lines):
    """Map each line's leading tag to a binary label.

    A line whose first space-separated token is '__label__1' becomes 0;
    any other tag becomes 1.
    """
    return [0 if line.split(' ')[0] == '__label__1' else 1 for line in lines]


def _clean_sentence(line):
    """Return the normalised review text of one '<label> <text>' line.

    Steps: drop the label token, strip the trailing newline, lowercase,
    replace every digit with '0', and, if the text looks like it contains
    a URL, replace URL-like tokens with '<url>'.
    """
    # rstrip('\n') instead of [:-1] so a final line without a newline does
    # not lose its last real character.
    text = line.split(' ', 1)[1].rstrip('\n').lower()
    text = _DIGIT_RE.sub('0', text)
    if any(hint in text for hint in _URL_HINTS):
        text = _URL_RE.sub("<url>", text)
    return text


# Train and test go through exactly the same pipeline, one pass each.
train_labels = _parse_labels(train_file_lines)
train_sentences = [_clean_sentence(line) for line in train_file_lines]

test_labels = _parse_labels(test_file_lines)
test_sentences = [_clean_sentence(line) for line in test_file_lines]

In [9]:
# The decoded raw lines are no longer needed once labels and sentences exist.
del train_file_lines, test_file_lines
# Display the first cleaned training sentence as a sanity check.
train_sentences[0]

'stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^'

In [10]:
# Run a garbage-collection pass after deleting the raw line lists.
gc.collect()

0

# 1: Using NLTK Naive Bayes

In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

In [12]:
# Create train and test dataframes
# Column dictionaries for the two frames (kept as named variables because a
# later cell deletes them explicitly).
Na_train = dict(Sentence=train_sentences, Label=train_labels)
Na_test = dict(Sentence=test_sentences, Label=test_labels)

# Build each frame and keep only the leading rows: the first 900 training
# reviews and the first 100 test reviews.
Nav_train = pd.DataFrame(Na_train).head(900)
Nav_test = pd.DataFrame(Na_test).head(100)



### Separate Positive and Negative reviews

In [13]:
# Split the test reviews by label (1 = positive, 0 = negative).
# Select the 'Sentence' column from the *filtered* rows in a single .loc call;
# re-assigning from Nav_test['Sentence'] afterwards would discard the filter
# and leave every sentence in both variables.
test_pos = Nav_test.loc[Nav_test['Label'] == 1, 'Sentence']
test_neg = Nav_test.loc[Nav_test['Label'] == 0, 'Sentence']

In [14]:
# The data now lives in the DataFrames; drop the column dictionaries and the
# training lists, then run a garbage-collection pass.
del Na_train, Na_test, train_sentences, train_labels
gc.collect()

0


## Cleaning and Feature Extraction

In [15]:
                
sents = []  # one (tokens, label) pair per training review
alll = []   # every kept token across all training reviews, in order
stopwords_set = set(stopwords.words("english"))

# Iterate over the two columns directly; iterrows() builds a Series per row
# and is much slower for the same result.
for sentence, label in zip(Nav_train['Sentence'], Nav_train['Label']):
    # Keep tokens of at least 3 characters, lowercased.
    lowered = (raw.lower() for raw in sentence.split() if len(raw) >= 3)
    # Drop links, @mentions, #hashtags and English stopwords.
    # (The former `word != 'RT'` test was removed: tokens are already
    # lowercased and two-character tokens never reach this point, so it
    # could not exclude anything.)
    tokens = [
        word for word in lowered
        if 'http' not in word
        and not word.startswith(('@', '#'))
        and word not in stopwords_set
    ]
    sents.append((tokens, label))
    alll.extend(tokens)
    

# Extracting word features
def get_words_in_tweets(tweets):
    """Flatten labelled token lists into one list of words.

    Parameters
    ----------
    tweets : iterable of (words, sentiment) pairs
        ``words`` is an iterable of tokens; ``sentiment`` is ignored.

    Returns
    -------
    list
        All tokens from every pair, in their original order.
    """
    # A distinct local name avoids shadowing the notebook-level `alll` list.
    return [word for words, _sentiment in tweets for word in words]


In [16]:
def get_word_features(wordlist):
    """Return the distinct words of ``wordlist``.

    The words are counted with ``nltk.FreqDist`` and the keys view of that
    distribution is returned; the counts themselves are not used.
    """
    return nltk.FreqDist(wordlist).keys()


In [17]:
#teee = get_words_in_tweets(sents)


In [18]:

# Vocabulary used as classifier features: the distinct tokens collected in `alll`.
w_features = get_word_features(alll)
# extract_features (defined below) reads this module-level name.


In [19]:

def extract_features(document):
    """Build a presence/absence feature dict for one tokenised document.

    For every word in the module-level ``w_features`` the result holds a
    ``'contains(<word>)'`` key whose value is True when the word occurs in
    ``document`` and False otherwise.
    """
    present = set(document)  # set gives constant-time membership tests
    return {'contains(%s)' % vocab_word: vocab_word in present
            for vocab_word in w_features}

In [20]:
# Build the training set for the Naive Bayes classifier by applying
# extract_features to the (tokens, label) pairs in `sents`.
training_set = nltk.classify.apply_features(extract_features,sents)


In [21]:
# Fit NLTK's Naive Bayes classifier on the feature set built above.
classifier = nltk.NaiveBayesClassifier.train(training_set)


In [22]:
# Partition the review text of each frame by label (1 = positive, 0 = negative).
train_labels_col = Nav_train['Label']
test_labels_col = Nav_test['Label']

train_pos = Nav_train.loc[train_labels_col == 1, 'Sentence']
train_neg = Nav_train.loc[train_labels_col == 0, 'Sentence']
test_pos = Nav_test.loc[test_labels_col == 1, 'Sentence']
test_neg = Nav_test.loc[test_labels_col == 0, 'Sentence']

In [23]:
# Preview up to 40 negative test reviews; the index labels are those of Nav_test.
test_neg.head(40)

2     batteries died within a year ...: i bought thi...
5     dvd player crapped out after one year: i also ...
6     incorrect disc: i love the style of this, but ...
7     dvd menu select problems: i cannot scroll thro...
9     not an "ultimate guide": firstly,i enjoyed the...
11    not!: if you want to listen to el duke , then ...
12    a complete bust: this game requires quicktime ...
14    didn't run off of usb bus power: was hoping th...
15    don't buy!: first of all, the company took my ...
20    long and boring: i've read this book with much...
21    dont like it: this product smells when you ope...
24    don't take the chance - get the se branded cab...
25    waste of money!: like many of the barbie cd ro...
27    has no range: i suppose if you were going to s...
29    three days of use and it broke: very disappoin...
35    not as expected...: my children get easily bor...
37    doublecharged for shipping because merchant wa...
39    light reading, light in substance: a clich

In [29]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

# Classify the held-out reviews, negatives first and then positives, recording
# the known label (0 = negative, 1 = positive) next to each prediction.
y_true = []
y_pred = []
for true_label, reviews in ((0, test_neg), (1, test_pos)):
    for review in reviews:
        y_true.append(true_label)
        y_pred.append(classifier.classify(extract_features(review.split())))

# Scalar metrics, all computed from the same label/prediction pair.
precision, recall, f1, accuracy = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

# Kept as variables so that later cells can print them individually.
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred, digits=4)

# Print results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)


Precision: 0.8888888888888888
Recall: 0.7547169811320755
F1-Score: 0.8163265306122449
Accuracy: 0.82
Confusion Matrix:
 [[42  5]
 [13 40]]


In [33]:
# Re-print the accuracy computed in the evaluation cell.
print(f"Accuracy: {accuracy}")

Accuracy: 0.82


In [30]:
# Re-print the confusion matrix computed in the evaluation cell.
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[42  5]
 [13 40]]


In [31]:
# Print the classification report computed in the evaluation cell.
print("Classification Report:\n\n", class_report)

Classification Report:

               precision    recall  f1-score   support

           0     0.7636    0.8936    0.8235        47
           1     0.8889    0.7547    0.8163        53

    accuracy                         0.8200       100
   macro avg     0.8263    0.8242    0.8199       100
weighted avg     0.8300    0.8200    0.8197       100



In [32]:
# Spot-check a single negative test review (index label 52): print its text,
# then display the label the classifier predicts for it.
print(test_neg.loc[52])
classifier.classify(extract_features(test_neg.loc[52].split()))

00 y/o potty humor from 00 somethings does not a comedy make....: saw this movie and felt compelled to warn people not to waste 000 agonizing minutes of their lives. in a nutshell, i have heard many more clever jokes and "one-liners" watching king of queens on cable.....seriously folks, it's that bad....the bad jokes and the 00 references to small jewish penises got old real quick. thought i would always be a sandler fan but he clearly, adam was in need of some rent money and jumped on this script for the cash and ran!why else would the plot line include seth rogan writing for him??? to put things into perspective, this film made bruno look like casablanca!!


0