In [None]:
# import statements
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.data.path.append("C:\\Users\\hp\\AppData\\Roaming\\nltk_data")


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

## using vader
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Location of the raw reviews CSV, held in one named constant so it is easy to repoint.
DATA_PATH = r'C:\Users\hp\Documents\1. Braimah ReBirth\customer-feedback-sentiment\dataset\ecommerce reviews.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   labels  400000 non-null  object
 1   text    400000 non-null  object
dtypes: object(2)
memory usage: 6.1+ MB


In [4]:
data.head(3)

Unnamed: 0,labels,text
0,__label__2,Great CD: My lovely Pat has one of the GREAT v...
1,__label__2,One of the best game music soundtracks - for a...
2,__label__1,Batteries died within a year ...: I bought thi...


In [5]:
data.iloc[7]

labels                                           __label__1
text      DVD menu select problems: I cannot scroll thro...
Name: 7, dtype: object

In [6]:
print(data.iloc[7]['text'])

DVD menu select problems: I cannot scroll through a DVD menu that is set up vertically. The triangle keys will only select horizontally. So I cannot select anything on most DVD's besides play. No special features, no language select, nothing, just play.


In [7]:
data.columns

Index(['labels', 'text'], dtype='object')

In [8]:
data['labels'].unique()

array(['__label__2', '__label__1'], dtype=object)

In [9]:
# Translate the raw `__label__N` codes into readable sentiment names.

map_values = dict(
    __label__1='negative',
    __label__2='positive',
)

In [10]:
# Use replace() rather than map(): map() turns every value that is not a key of
# map_values into NaN, so re-running this cell after the labels were already
# converted would wipe the column. replace() leaves unmatched values untouched,
# which makes the cell safe to execute more than once.
data['labels'] = data['labels'].replace(map_values)

In [11]:
data.head(10)

Unnamed: 0,labels,text
0,positive,Great CD: My lovely Pat has one of the GREAT v...
1,positive,One of the best game music soundtracks - for a...
2,negative,Batteries died within a year ...: I bought thi...
3,positive,"works fine, but Maha Energy is better: Check o..."
4,positive,Great for the non-audiophile: Reviewed quite a...
5,negative,DVD Player crapped out after one year: I also ...
6,negative,"Incorrect Disc: I love the style of this, but ..."
7,negative,DVD menu select problems: I cannot scroll thro...
8,positive,Unique Weird Orientalia from the 1930's: Exoti...
9,negative,"Not an ""ultimate guide"": Firstly,I enjoyed the..."


In [12]:
# Splitting the sample into train and test sets: 20% of the rows are held out
# for testing, and random_state=42 fixes the shuffle so the split is repeatable.

train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

## TEXT PREPROCESSING

In [13]:
# Fetch the NLTK resources needed for tokenisation and stop-word filtering.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# Keep the stop words in a set: the filtering code tests membership once per
# token, and a set lookup is constant-time whereas the original list had to be
# scanned linearly for every token of every review.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
# Toy sentence used to try out tokenisation and stop-word filtering.
text = "This is a sample sentence, showing off the stop words filtration."
word_tokens = word_tokenize(text)

In [15]:
word_tokens

['This',
 'is',
 'a',
 'sample',
 'sentence',
 ',',
 'showing',
 'off',
 'the',
 'stop',
 'words',
 'filtration',
 '.']

In [16]:
# Keep only the tokens whose lowercase form is not a stop word.
filtered = list(filter(lambda token: token.lower() not in stop_words, word_tokens))

In [17]:
# Re-assemble the surviving tokens into one space-separated string
filtered_text = str.join(" ", filtered)

In [18]:
filtered_text

'sample sentence , showing stop words filtration .'

In [19]:
# function to remove stopwords

def remove_stopwords(text):
    """
    Strip stop words from a sentence.

    The text is tokenised with NLTK's word tokenizer; every token whose
    lowercase form appears in the module-level ``stop_words`` collection is
    dropped, and the surviving tokens (punctuation tokens included) are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

In [20]:
train_df['text'].head(10).apply(remove_stopwords)

242245    Mediocre : fake velvet touch enhances cheap to...
288918    Downton Abbey : love love love . would recomen...
105103    Roses Red : Disappointed one . stretch get eno...
63504     Awful Experience : Microsoft mouse stopped wor...
239180    must seen different movie : one word movie . S...
11328     Pretentious long-winded : alternate title book...
386155    Horrible unit - stay away : 2 units , sportste...
37244     Awful ! : admit , like Celtic Woman . want acc...
8960      Recommended cleaning resulted broken product :...
143909    Really 3.5 Stars : average spy thriller good F...
Name: text, dtype: object

In [21]:
test_df

Unnamed: 0,labels,text
23218,positive,This is a great book: I must preface this by s...
20731,negative,"Huge Disappointment.: As a big time, long term..."
39555,positive,Wayne is tight but cant hang with Turk.: This ...
147506,positive,Excellent: I read this book when I was in elem...
314215,negative,Not about Anusara: Although this book is toute...
...,...,...
54840,positive,Enjoyable Series: This was a Christmas present...
103,negative,Used for circuits project: I used the generato...
339434,negative,don't waste your money or time: I purchased th...
315677,negative,A dull and deeply prosaic book: The author's i...


In [22]:
# Register tqdm's progress_apply on pandas objects, then strip stop words from
# every training review. The recorded run took about 7 minutes for 320,000 rows.
tqdm.pandas(total=len(train_df))
train_df['filtered_text'] = train_df['text'].progress_apply(remove_stopwords)

100%|██████████| 320000/320000 [07:00<00:00, 761.75it/s] 


In [None]:
# Same stop-word removal for the held-out reviews, with the progress bar sized
# to the test split. The recorded run took just under 2 minutes for 80,000 rows.
tqdm.pandas(total=len(test_df))
test_df['filtered_text'] = test_df['text'].progress_apply(remove_stopwords)

100%|██████████| 80000/80000 [01:50<00:00, 722.09it/s]


In [24]:
## bag of words
# The vocabulary is learned from the training text only (fit_transform); the
# test text is then encoded with that same fitted vocabulary (transform).
vect= CountVectorizer()
train_bog = vect.fit_transform(train_df['filtered_text'])

test_bog = vect.transform(test_df['filtered_text'])

In [25]:
## tfidf
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer for the test text. The earlier version called `vect` (the
# CountVectorizer) here, so the "TF-IDF" matrices were plain word counts and
# the TF-IDF model was a duplicate of the bag-of-words model.

tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(train_df['filtered_text'])

test_tfidf = tfidf.transform(test_df['filtered_text'])

 # Modelling

In [None]:
# using Vader for Sentiment
# function to apply vader

# Shared analyzer instance, created on first use. Building a
# SentimentIntensityAnalyzer per call is wasteful when the function is applied
# to every row of a DataFrame column, so one instance is reused instead.
_vader_analyzer = None


def vader_sentiment(text, analyzer=None):
    '''
    Classify a piece of text as "Positive" or "Negative" using VADER.

    Parameters
    ----------
    text : str
        The comment to score.
    analyzer : optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, a lazily created, shared
        ``SentimentIntensityAnalyzer`` is used.

    Returns
    -------
    str
        "Positive" when the compound score is strictly greater than 0,
        otherwise "Negative" (a score of exactly 0 counts as negative).
        Note the capitalised spelling of both values.
    '''
    global _vader_analyzer

    if analyzer is None:
        if _vader_analyzer is None:
            _vader_analyzer = SentimentIntensityAnalyzer()
        analyzer = _vader_analyzer

    compound_score = analyzer.polarity_scores(text)['compound']

    if compound_score > 0:
        return "Positive"
    return "Negative"
    


In [28]:
train_df.columns

Index(['labels', 'text', 'filtered_text'], dtype='object')

In [31]:
 # Applying VADER to the raw review text
test_df['v_text'] = test_df['text'].apply(vader_sentiment)

# Applying VADER to the stop-word-filtered text, for comparison with the raw-text scores
test_df['v_filtered_text'] = test_df['filtered_text'].apply(vader_sentiment)

In [None]:
# make lowercase
# Only the VADER columns need normalising: vader_sentiment returns
# "Positive"/"Negative" while the ground-truth labels are lowercase.
# Lowercasing just these two columns leaves the review text and every other
# column untouched, instead of casting the whole frame to str and
# lowercasing all of it.
vader_cols = ['v_text', 'v_filtered_text']
test_df[vader_cols] = test_df[vader_cols].apply(lambda col: col.str.lower())

In [33]:
# Train one Multinomial Naive Bayes model per feature representation
# (bag of words and TF-IDF), both against the same training labels.
train_labels = train_df['labels']

classifier_bow = MultinomialNB()
classifier_bow.fit(train_bog, train_labels)

classifier_tfidf = MultinomialNB()
classifier_tfidf.fit(train_tfidf, train_labels)


In [57]:
# predict
# Store each Naive Bayes model's predicted label for the test rows in its own
# column so the predictions can be compared against 'labels' later.

test_df['bog'] = classifier_bow.predict(test_bog)
test_df['tfidf'] = classifier_tfidf.predict(test_tfidf)

In [58]:
test_df.head()

Unnamed: 0,labels,text,filtered_text,v_text,v_filtered_text,bog,tfidf
23218,positive,This is a great book: I must preface this by s...,great book : must preface saying religious - l...,positive,positive,positive,positive
20731,negative,"Huge Disappointment.: As a big time, long term...","Huge Disappointment . : big time , long term T...",negative,negative,negative,negative
39555,positive,Wayne is tight but cant hang with Turk.: This ...,Wayne tight cant hang Turk . : album hot wants...,positive,positive,positive,positive
147506,positive,Excellent: I read this book when I was in elem...,Excellent : read book elementary school- proba...,positive,positive,positive,positive
314215,negative,Not about Anusara: Although this book is toute...,Anusara : Although book touted several Anusara...,negative,negative,negative,negative


In [59]:
test_df.columns

Index(['labels', 'text', 'filtered_text', 'v_text', 'v_filtered_text', 'bog',
       'tfidf'],
      dtype='object')

In [60]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80000 entries, 23218 to 164567
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   labels           80000 non-null  object
 1   text             80000 non-null  object
 2   filtered_text    80000 non-null  object
 3   v_text           80000 non-null  object
 4   v_filtered_text  80000 non-null  object
 5   bog              80000 non-null  object
 6   tfidf            80000 non-null  object
dtypes: object(7)
memory usage: 4.9+ MB


In [61]:
# Model Evaluation

def evaluate_sentiment(test_df, target_col='labels', pred_cols=None):
    """
    Score several prediction columns against the ground-truth column.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame holding the true labels and one column per set of predictions.
    target_col : str, default 'labels'
        Name of the ground-truth column.
    pred_cols : list of str, optional
        Prediction columns to evaluate. Defaults to
        ['v_text', 'v_filtered_text', 'bog', 'tfidf'].

    Returns
    -------
    dict
        Maps each prediction column name to
        {'Accuracy': <accuracy_score result>, 'Report': <classification_report text>}.
    """
    if pred_cols is None:
        pred_cols = ['v_text', 'v_filtered_text', 'bog', 'tfidf']

    y_true = test_df[target_col]

    results = {}
    for col in pred_cols:
        y_pred = test_df[col]
        results[col] = {
            'Accuracy': accuracy_score(y_true, y_pred),
            'Report': classification_report(y_true, y_pred),
        }

    return results

In [62]:
results = evaluate_sentiment(test_df)
results

{'v_text': {'Accuracy': 0.7172375,
  'Report': '              precision    recall  f1-score   support\n\n    negative       0.87      0.51      0.64     39896\n    positive       0.65      0.92      0.77     40104\n\n    accuracy                           0.72     80000\n   macro avg       0.76      0.72      0.70     80000\nweighted avg       0.76      0.72      0.70     80000\n'},
 'v_filtered_text': {'Accuracy': 0.6812125,
  'Report': '              precision    recall  f1-score   support\n\n    negative       0.87      0.43      0.57     39896\n    positive       0.62      0.93      0.75     40104\n\n    accuracy                           0.68     80000\n   macro avg       0.74      0.68      0.66     80000\nweighted avg       0.74      0.68      0.66     80000\n'},
 'bog': {'Accuracy': 0.8469875,
  'Report': '              precision    recall  f1-score   support\n\n    negative       0.84      0.86      0.85     39896\n    positive       0.85      0.84      0.85     40104\n\n    a

In [63]:
# Print accuracy and the full classification report for each prediction column,
# with a dashed rule between entries.
for col in results:
    metrics = results[col]
    print(
        f'Results for {col}:',
        f"Accuracy: {metrics['Accuracy']}",
        'Classification Report:',
        metrics['Report'],
        '-'* 50,
        sep='\n',
    )

Results for v_text:
Accuracy: 0.7172375
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.51      0.64     39896
    positive       0.65      0.92      0.77     40104

    accuracy                           0.72     80000
   macro avg       0.76      0.72      0.70     80000
weighted avg       0.76      0.72      0.70     80000

--------------------------------------------------
Results for v_filtered_text:
Accuracy: 0.6812125
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.43      0.57     39896
    positive       0.62      0.93      0.75     40104

    accuracy                           0.68     80000
   macro avg       0.74      0.68      0.66     80000
weighted avg       0.74      0.68      0.66     80000

--------------------------------------------------
Results for bog:
Accuracy: 0.8469875
Classification Report:
              precision    recall  f1-score   sup

In [66]:
# save model for future use
# Writes the classifier fitted on the `train_tfidf` features to
# 'tfidf_sentiment.pkl' in the current working directory.
# NOTE(review): only the classifier is pickled here. Scoring new text later
# also needs the fitted vectorizer that produced its training features —
# consider persisting that alongside the model.

import pickle

with open('tfidf_sentiment.pkl', 'wb') as f:
    pickle.dump(classifier_tfidf,f)