In [25]:
import nltk
import re
import difflib
import spacy
import pandas as pd
from textblob import TextBlob
from nltk.corpus import words
from difflib import get_close_matches

In [40]:
def clean_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def analyze_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        return 1
    elif polarity == 0:
        return 0
    else:
        return -1

def clean_digits(text):
    num_digits = sum(c.isdigit() for c in text)
    return (num_digits / len(text)) <= 0.5

In [46]:
file_path = 'D:\\BSES - Data Analyst\\Sentiment Analysis\\Data\\SentimentAnalysis_Data.xlsx'

df = pd.read_excel(file_path, engine='openpyxl')
df = df.drop(["Date", "Day", "Complaint Posting Time"], axis=1)

In [42]:
df.dropna(inplace=True)
df['Customer_Text'] = df['Customer_Text'].astype(str)
df['Customer_Text'] = df['Customer_Text'].apply(clean_text)
df = df[df['Customer_Text'].str.strip() != '']
df = df[df['Customer_Text'].apply(len) >= 3]
df = df[~df['Customer_Text'].str.isdigit()]
df = df[df['Customer_Text'].str.split().apply(len) > 1]
df = df[df['Customer_Text'].apply(clean_digits)]

df

Unnamed: 0,Customer_Text
0,are you returning extra security or not reply ...
1,no power in sonia vihar area last 3 hrs
2,whats the fuck is going on every day power cut...
3,bses please restore power supply ca no is 1527...
4,no suppy 101350118
...,...
16508,heres no electrical supply qhy you cut the ele...
16509,no power for ca no 151791523
16510,no electricity in our area my ca no is 101424877
16511,still no update when would be power restored


# Hinglish Words

In [31]:
nltk.download('words')
nltk.download('punkt')
english_words = set(words.words())

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
unknown_words = []
for row in df["Customer_Text"]:
    tokens = nltk.word_tokenize(row)
    for word in tokens:
        if word.lower() not in english_words and re.match("^[a-zA-Z]+$", word):
            unknown_words.append(word)

unknown_df = pd.DataFrame(unknown_words, columns=['Unknown_Words'])
unknown_df = unknown_df.drop_duplicates()
unknown_df = unknown_df[unknown_df["Unknown_Words"].apply(len) >= 3]
unknown_df.reset_index(drop=True, inplace=True)
unknown_df.to_csv('D:\\BSES - Data Analyst\\Unknown_words.csv', index=False)
unknown_df

Unnamed: 0,Unknown_Words
0,returning
1,sonia
2,vihar
3,hrs
4,fuck
...,...
4717,abhimanyu
4718,audama
4719,chaturvedi
4720,qhy


# Creating True data Using TextBlob

In [43]:
df['Sentiment'] = df['Customer_Text'].apply(analyze_sentiment)
df

Unnamed: 0,Customer_Text,Sentiment
0,are you returning extra security or not reply ...,0
1,no power in sonia vihar area last 3 hrs,0
2,whats the fuck is going on every day power cut...,-1
3,bses please restore power supply ca no is 1527...,0
4,no suppy 101350118,0
...,...,...
16508,heres no electrical supply qhy you cut the ele...,1
16509,no power for ca no 151791523,0
16510,no electricity in our area my ca no is 101424877,0
16511,still no update when would be power restored,0


In [44]:
df.to_csv('D:\\BSES - Data Analyst\\Sentiment Analysis\\Data\\true_data_test.csv', index=False)