In [12]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Fetch the NLTK resources used further down: the Punkt tokenizer models for
# word_tokenize and the stop-word lists for stopwords.words().
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up;
# 'punkt' is kept so older installations keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# English stop words used when the caller does not pass its own collection.
_DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text, stop_words=None):
    """Remove punctuation and stop words from a piece of text.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted before tokenizing, so text written entirely in another script
    comes back as an empty string. Stop words are matched case-insensitively;
    the tokens that survive keep their original casing.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list, so the function does not depend on a notebook-level variable
        having been defined by an earlier cell.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _DEFAULT_STOP_WORDS
    # Punctuation is stripped *before* tokenizing, so contractions such as
    # "don't" collapse into a single token ("dont").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [14]:
# (display name, estimator) pairs; every vectorizer is combined with every
# classifier in the comparison loop.
vectorizers = [
    ("TfidfVectorizer", TfidfVectorizer()),
    ("CountVectorizer", CountVectorizer())
]

classifiers = [
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression()),
    # Random forests draw random bootstrap samples and feature subsets; pin the
    # seed (same value as the train/test split) so reported accuracy is repeatable.
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC()),
    ("KNN", KNeighborsClassifier())
]


In [15]:
# Load the raw complaint spreadsheet.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = 'D:\\BSES - Data Analyst\\Sentiment Analysis\\Data\\Twitter_Comments.xlsx'
df = pd.read_excel(file_path, engine='openpyxl')

# Convert only the cells that actually hold a value to str. A blanket
# astype(str) would turn missing cells into the literal text 'nan', which a
# later dropna() can no longer recognise as missing.
customer_text = df['Customer_Text']
df['Customer_Text'] = customer_text.where(customer_text.isna(), customer_text.astype(str))
df

Unnamed: 0,Customer_Text,Department,Complaint_Type,BSES_Text,Sentiment
0,i am very shocked due to my electricity bill.....,Business,"Bill Related (Bill Download Issue, Wrong Bill)",sorry for the inconvenience caused. please pro...,Negative
1,can we know about the new base/guidelines abou...,Business,"Bill Related (Bill Download Issue, Wrong Bill)",it would be great if you could share your ca a...,Negative
2,\nit is third day of low voltage fluctuation i...,O&M,Voltage Fluctuation (Low & High),\nwe're getting this checked and will get back...,Negative
3,à¤à¤• à¤¦à¤® à¤®à¤œà¤¾à¤• à¤¬à¤¨à¤¾ à¤¦à¤¿à¤...,O&M,Power Outage,hey your complaint has been already registered...,Negative
4,à¤à¤• à¤¦à¤® à¤®à¤œà¤¾à¤• à¤¬à¤¨à¤¾ à¤¦à¤¿à¤...,O&M,Power Outage,we'll surely look into this for you. be assure...,
...,...,...,...,...,...
50148,"here,s no electrical supply qhy you cut the el...",O&M,Power Outage,sorry for the inconvenience caused. please pro...,Negative
50149,no power for ca no 151791523,O&M,Power Outage,it would be great if you could share your cont...,Negative
50150,no electricity in our area my ca no is 101424877,O&M,Power Outage,sorry for the inconvenience caused. we have no...,Negative
50151,still no update\nwhen would be power restored,O&M,Power Outage,sorry for the inconvenience caused. please pro...,Negative


In [16]:
# Define the English stop-word set up front so this cell also runs on a
# freshly restarted kernel.
stop_words = set(stopwords.words('english'))

# Discard rows with a missing value in ANY column, then remove the columns the
# models do not use. Rebinding `df` (instead of inplace=True) keeps the step a
# single readable expression.
# NOTE(review): "Complaint_Type " is spelled with a trailing space, presumably
# to match the spreadsheet header — verify against the file.
df = df.dropna().drop(columns=["Department", "Complaint_Type ", "BSES_Text"])

# Strip punctuation and stop words from every tweet.
df['Customer_Text'] = df['Customer_Text'].apply(preprocess_text)
df

Unnamed: 0,Customer_Text,Sentiment
0,shocked due electricity bill last month bill r...,Negative
1,know new baseguidelines electricity bill getti...,Negative
2,third day low voltage fluctuation lane registe...,Negative
3,,Negative
5,30620 lab test must done please expedite revis...,Negative
...,...,...
50148,heres electrical supply qhy cut electricity ki...,Negative
50149,power ca 151791523,Negative
50150,electricity area ca 101424877,Negative
50151,still update would power restored,Negative


In [22]:
# English stop words, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

# Hold out 30% of the cleaned tweets for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Customer_Text'],
    df['Sentiment'],
    random_state=42,
    test_size=0.3,
)

# Free-text sample that each fitted model is asked to classify.
test = "yes, the meter has been removed.  thanks for your co-operation."

In [23]:
# Evaluate every vectorizer/classifier pairing on the same train/test split.
for vec_name, vectorizer in vectorizers:
    # Learn the vocabulary from the training text only, then reuse it for the
    # held-out tweets and for the sample sentence.
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # The sample sentence must go through the same cleaning as the training
    # text, otherwise it is scored on features the models never saw. It only
    # depends on the vectorizer, so it is built once per vectorizer.
    test_feats = vectorizer.transform([preprocess_text(test)])

    for clf_name, clf in classifiers:
        # fit() retrains the estimator from scratch, so reusing the same
        # instance across vectorizers is safe.
        clf.fit(X_train_vectorized, y_train)

        predicted_sentiment = clf.predict(test_feats)
        print("Predicted Sentiment:", predicted_sentiment)

        accuracy = clf.score(X_test_vectorized, y_test)
        print(f"{clf_name} + {vec_name}: Accuracy = {accuracy}")

Predicted Sentiment: ['Negative']
Multinomial Naive Bayes + TfidfVectorizer: Accuracy = 0.9824626519439548
Predicted Sentiment: ['Negative']
Logistic Regression + TfidfVectorizer: Accuracy = 0.9871021620116915
Predicted Sentiment: ['Negative']
Random Forest + TfidfVectorizer: Accuracy = 0.9893291268442053
Predicted Sentiment: ['Negative']
SVM + TfidfVectorizer: Accuracy = 0.9884940150320126
Predicted Sentiment: ['Neutral']
KNN + TfidfVectorizer: Accuracy = 0.6218799294794469
Predicted Sentiment: ['Negative']
Multinomial Naive Bayes + CountVectorizer: Accuracy = 0.9820914911385358
Predicted Sentiment: ['Negative']
Logistic Regression + CountVectorizer: Accuracy = 0.9859886795954347
Predicted Sentiment: ['Negative']
Random Forest + CountVectorizer: Accuracy = 0.9886795954347221
Predicted Sentiment: ['Negative']
SVM + CountVectorizer: Accuracy = 0.9846896167764684
Predicted Sentiment: ['Negative']
KNN + CountVectorizer: Accuracy = 0.9741115338220284
