# 📚 Imports
---

In [None]:
!pip install textblob

In [None]:
import pandas as pd
import numpy as np
import re
from unidecode import unidecode

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import collections

from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# 📖 Data
---

In [None]:
# Location of the chat sentiment CSV inside the Kaggle input directory
DATA_PATH = '/kaggle/input/chat-sentiment-dataset/chat_dataset.csv'

# Load the dataset and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Number of messages per sentiment class
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

In [None]:
# Count of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

# 🧹 Text Preprocessing
---

### Cleaning Text

In [None]:
# Matches every character that is not a lowercase ASCII letter or whitespace.
# Raw string, so `\s` reaches the regex engine instead of being an invalid string escape.
regex = re.compile(r'[^a-z\s]')

# Matches a word-final "ing" only when at least three letters precede it, so short
# words such as "thing" or "bring" are kept and "ing" is never cut from mid-word.
ing_suffix = re.compile(r'(?<=[a-z]{3})ing\b')


def clean_message(text):
    """Normalize one raw chat message into lowercase ASCII words.

    Steps, in order:
      1. transliterate accented characters to ASCII, then lowercase
         (done before stripping so accented letters are converted, not deleted);
      2. drop every character that is not a-z or whitespace;
      3. strip a trailing "ing" from words with a stem of three or more letters;
      4. drop words shorter than three characters.

    Non-string input (e.g. NaN for a missing message) yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    text = unidecode(text).lower()
    text = regex.sub('', text)
    text = ing_suffix.sub('', text)
    return ' '.join(word for word in text.split() if len(word) > 2)


# Build the cleaned column from the raw messages.
# The former '[%s]' replacement was dropped: it was an unfilled format template that
# acted as a character class and deleted every "s" (and "%") from the text.
df['clean_text'] = df['message'].apply(clean_message)


### Correcting Misspelled Words

In [None]:
def correct_spelling(text):
    """Return `text` after TextBlob's spelling correction, as a plain string."""
    return str(TextBlob(text).correct())

# Spell-correct every cleaned message
df['corrected_text'] = df['clean_text'].apply(correct_spelling)

# Preview
df['corrected_text'].head()

### Removing StopWords

In [None]:
# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Tokenize each corrected message, then keep only the tokens that are not stop words
token_lists = df['corrected_text'].apply(word_tokenize)
df['text_without_stopwords'] = token_lists.apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

# Preview
df['text_without_stopwords'].head()

### Word Normalization

In [None]:
# Reduce every token to its Snowball stem and rebuild each message as one space-separated string
stemmer = SnowballStemmer('english')
df['final_text'] = df['text_without_stopwords'].apply(
    lambda tokens: ' '.join([stemmer.stem(token) for token in tokens])
)

# Preview
df['final_text'].head()

# 📊 Visualization
---

### Top 10 most used words

In [None]:
# Flatten all processed messages into a single list of tokens and count them
words = [token for message in df['final_text'] for token in message.split()]
word_count = collections.Counter(words)
top_words = dict(word_count.most_common(10))

# One bar position per word
positions = range(len(top_words))

plt.figure(figsize = (10, 6))
plt.bar(positions, list(top_words.values()), align = 'center')

# Label each bar on the x axis with its word
plt.xticks(positions, list(top_words.keys()))

# Light grid, title and axis labels
plt.grid(alpha = 0.5)
plt.title('Top 10 most used words', fontsize = 18)
plt.xlabel('Words')
plt.ylabel('Frequency')

### Word Cloud

In [None]:
# Concatenate all processed messages into one corpus string
text = ' '.join(df['final_text'])

# Configure the cloud, then generate it from the corpus
cloud_builder = WordCloud(width = 800, height = 500,
                          background_color = 'black', min_font_size = 10)
wordcloud = cloud_builder.generate(text)

# Render the cloud without axes or padding
plt.figure(figsize = (10, 6), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

# 🏋️ Training
---

### Changing sentiment column classes to numbers
Mapping: neutral → 0, positive → 1, negative → 2

In [None]:
# Numeric code for each sentiment label
sentiment_codes = {'neutral' : 0, 'positive' : 1, 'negative' : 2}

# Encode only while the column is not yet integer-typed, so re-running this cell is a
# no-op instead of mapping the already-encoded integers to NaN.
if df['sentiment'].dtype.kind not in 'iu':
    encoded = df['sentiment'].map(sentiment_codes)
    # Fail loudly on labels outside the mapping rather than passing NaN targets to the models
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), 'sentiment'].astype(str).unique())
        raise ValueError(f'Unexpected sentiment labels: {unknown}')
    df['sentiment'] = encoded.astype(int)

### Splitting the Data and Vectorizing with CountVectorizer

In [None]:
# Split X and Y
x = np.array(df['final_text'])
y = np.array(df['sentiment'])

# Convert text to numerical
cv = CountVectorizer()
X = cv.fit_transform(x)

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)

### Training

In [None]:
# Models
mnb = MultinomialNB()
bnb = BernoulliNB()

# Training
mnb.fit(xtrain, ytrain)
bnb.fit(xtrain, ytrain)

### Predictions

In [None]:
# Predictions
pred_mnb = mnb.predict(xtest)
pred_bnb = bnb.predict(xtest)

# 📋 Results
---

### MultinomialNB

In [None]:
# Accuracy line, a separator rule, then the per-class report for MultinomialNB
print(
    f'Multinomial Accuracy: {accuracy_score(ytest, pred_mnb)}',
    '-' * 55,
    classification_report(ytest, pred_mnb),
    sep = '\n',
)

### BernoulliNB

In [None]:
# Accuracy line, a separator rule, then the per-class report for BernoulliNB
print(
    f'Bernoulli Accuracy: {accuracy_score(ytest, pred_bnb)}',
    '-' * 55,
    classification_report(ytest, pred_bnb),
    sep = '\n',
)

In [None]:
cm = confusion_matrix(ytest, pred_mnb)
sns.heatmap(cm, annot = True, cmap = 'Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()