In [None]:
# Before proceeding, run this cell (skip it if these packages are already installed)
# !pip install tensorflow==2.15.0
# !pip install scikit-learn
# !pip install pandas
# !pip install numpy
# !pip install seaborn
# !pip install matplotlib
# !pip install wordcloud
# !pip install nltk

In [None]:
# !pip install keras-preprocessing

# Load Libraries

In [None]:

## dl packages
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# ml packages
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import pickle
import nltk
import re
from nltk.stem import PorterStemmer

import seaborn as sns
import matplotlib.pyplot as plt

# WordCloud is a visualization tool that builds a word cloud from
# a text corpus: a graphic in which the size of each word reflects
# its frequency (or importance) in the text. The more often a word
# appears, the larger and bolder it is drawn.
from wordcloud import WordCloud

In [None]:
# --------just for info----------
# Minimal to_categorical example (see below).

from keras.utils import to_categorical

# Example class labels (as integers)
class_labels = [0, 1, 2, 1, 0, 2]

# Convert class labels to categorical (one-hot encoded) representation:
# every label becomes one row with a 1 in that label's column.
one_hot_encoded = to_categorical(class_labels)

print(one_hot_encoded)


# Load data

In [None]:
# Read the ';'-separated training file. It has no header row, so the two
# columns are named here.
train_data = pd.read_csv(
    "train.txt",
    sep=";",
    header=None,
    names=["Comment", "Emotion"],
    encoding="utf-8",
)
train_data.head()

In [None]:
# `train_data` is already loaded by the previous cell; reading train.txt a
# second time here was redundant, so this cell only adds the derived column.
# Character count of each comment (len() of a string counts characters, not words).
train_data['length'] = train_data['Comment'].str.len()
train_data.head()

In [None]:
train_data.shape

In [None]:
train_data.isnull().sum()

In [None]:
train_data.duplicated().sum()

In [None]:
train_data.drop_duplicates(inplace=True)

# EDA

In [None]:
# Class balance: number of comments per emotion label
sns.countplot(data=train_data, x='Emotion')
plt.show()

In [None]:
# Distribution of comment lengths, split by emotion
df2 = train_data.copy()
df2['length'] = df2['Comment'].map(len)

# The 'length' column as a numpy array (the plot below does not use it)
length_values = df2['length'].to_numpy()

# Stacked histogram, one colour per emotion
sns.histplot(df2, x='length', hue='Emotion', multiple='stack')


In [None]:
# Word cloud for each emotion.
def words_cloud(wordcloud,emotion):
  """Draw `wordcloud` on a new 10x10 figure titled "<emotion> Word Cloud".

  Args:
    wordcloud: image-like object accepted by plt.imshow (here a generated WordCloud).
    emotion: emotion name; concatenated into the title, so it must be a str.
  """
  plt.figure(figsize=(10, 10))
  plt.imshow(wordcloud)
  plt.title(emotion + " Word Cloud")
  # Hide ticks and frame; only the image and the title remain.
  plt.axis("off")
emotions_list = train_data['Emotion'].unique()
for emotion in emotions_list:
  # For every distinct emotion: select that emotion's comments, join them
  # into one space-separated string, and generate a 600x600 word cloud
  # from that string.
  emotion_comments = train_data.loc[train_data['Emotion'] == emotion, 'Comment']
  text = " ".join(emotion_comments)
  wordcloud = WordCloud(width=600, height=600).generate(text)
  words_cloud(wordcloud, emotion)

Data Preprocessing

Encode emotions

In [None]:
# Map each emotion name to an integer id and store it in a new column,
# leaving the original 'Emotion' strings untouched.
lb = LabelEncoder()
lb.fit(train_data['Emotion'])
train_data['Emotion_Encoded'] = lb.transform(train_data['Emotion'])
train_data

Applying Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report

In [None]:
# Work on a copy: the columns added for the classical-ML models go on `df`,
# not on `train_data`.
df = train_data.copy()

In [None]:
# Inspect the working copy
df

## Data cleaning and preprocessing

In [None]:
# Download NLTK stopwords
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))
def clean_text(text):
  """Normalise one comment for the TF-IDF models.

  Steps: replace every non-letter with a space, lowercase, split on
  whitespace, drop English stopwords, Porter-stem the remaining words.

  Args:
    text: raw comment string.

  Returns:
    The stemmed, stopword-free tokens joined by single spaces.
  """
  stemmer = PorterStemmer()
  # Character class fixed: the original "[^a-zA_Z]" (underscore instead of a
  # hyphen) only kept a-z, 'A', '_' and 'Z', so uppercase B-Y were replaced
  # by spaces before lowercasing (e.g. "He hates you" lost its "H").
  text = re.sub("[^a-zA-Z]"," ",text)
  text = text.lower()
  text = text.split()
  text = [stemmer.stem(word) for word in text if word not in stopwords]

  return " ".join(text)

df['clean_comment']= df['Comment'].apply(clean_text)

In [None]:
# Inspect the frame, now including the clean_comment column
df

In [None]:
# Train_test_split
# Hold out 20% of the cleaned comments (with their encoded labels) for
# evaluation; random_state=42 makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(df['clean_comment'],
                          df['Emotion_Encoded'],test_size=0.2,random_state=42)

In [None]:
# Vectorization using TF-IDF
# .)TF-IDF scores a word highly when it is frequent within one document
# but appears in few documents overall, so distinctive words get more
# weight than words that are common everywhere.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Multi-class classification: fit every candidate on the same TF-IDF
# features and report accuracy plus the per-class report on the test split.
classifiers = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
}
for name, clf in classifiers.items():
  print(f"\n================={name}=================")
  y_pred_tfidf = clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
  accuracy = accuracy_score(y_test, y_pred_tfidf)
  print(f"\nAccuracy using TF-IDF: {accuracy}")
  print(classification_report(y_test, y_pred_tfidf))

In [None]:
# Selected model: logistic regression, fitted on the TF-IDF training
# features and then used to predict the test split.
lg = LogisticRegression().fit(X_train_tfidf, y_train)
lg_y_pred = lg.predict(X_test_tfidf)

In [None]:
def predict_emotion(input_text):
  """Predict the emotion of one raw sentence with the logistic-regression model.

  Uses the one-argument `clean_text`, `tfidf_vectorizer`, `lg` and `lb`
  defined in the cells above. The LSTM section further down rebinds both
  `clean_text` and `lb`, so call this before running that section.

  Args:
    input_text: raw sentence.

  Returns:
    (predicted_emotion, label): the emotion name decoded by `lb`, and the
    integer class id predicted by `lg`.
  """
  cleaned_text = clean_text(input_text)
  input_vectorized = tfidf_vectorizer.transform([cleaned_text])
  # Predict once. The original called lg.predict a second time only to take
  # np.max over the same single-element result.
  predicted_label = lg.predict(input_vectorized)[0]
  predicted_emotion = lb.inverse_transform([predicted_label])[0]

  return predicted_emotion,predicted_label

  # Example usage
sentences = [
    "i didnt feel humiliated",
    "i feel strong and good overall",
    "im grabbing a minute to post i feel greedy wrong",
    "He was speechles when he found out he was accepted to this new job",
    "This is outrageous, how can you talk like that?",
    "I feel like im all alone in this world",
    "He is really sweet and caring",
    "You made me very crazy",
    "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
    "i am feeling grouchy",
    "He hates you",
]
# Run every sample sentence through predict_emotion and print the result.
for sentence in sentences:
  print(sentence)
  predict_emotionn, label = predict_emotion(sentence)
  print(f"Predicted Emotion : {predict_emotionn}")
  print(f"Predicted  Label : {label}")
  print("====================================================")

In [None]:
# -save files.
import pickle

# Persist the fitted model, label encoder and vectorizer.
# `with` closes each file even if pickling raises; the original passed bare
# open(...) handles to pickle.dump and never closed them.
with open("Logistic_regression.pkl","wb") as f:
  pickle.dump(lg, f)
with open("Label_encoder.pkl","wb") as f:
  pickle.dump(lb, f)
with open("tfidf_vectorizer.pkl","wb") as f:
  pickle.dump(tfidf_vectorizer, f)

In [None]:
# Print the scikit-learn version that produced the pickles saved above
import sklearn
print(sklearn.__version__)

# Applying Deep Learning Using LSTM
# Text Cleaning, Encoding, and Padding

In [None]:
# Longest comment, measured in characters
max(len(w) for w in train_data['Comment'])

In [None]:
# Rebuild the English stopword set and display it for inspection
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords

In [None]:

# one_hot demo: three short sentences joined into one string and encoded
# with n=100 (the result is a list of integers, one per word)
text = " ".join(["i love myself",'I hate you','We love ourselves'])
one_hot(input_text=text,n=100)

In [None]:
def clean_text(df,column,vocab_size,max_len):
  """Turn a text column into padded integer sequences for the LSTM.

  NOTE: this rebinds the name `clean_text`, which earlier in the notebook
  is the one-argument TF-IDF cleaner; `predict_emotion` stops working once
  this cell has run.

  Args:
    df: DataFrame holding the text.
    column: name of the text column in `df`.
    vocab_size: `n` passed to one_hot (size of the integer index space).
    max_len: `maxlen` passed to pad_sequences.

  Returns:
    The pad_sequences result: one pre-padded integer sequence per row of
    `df[column]`.
  """
  stemmer = PorterStemmer()
  corpus = []
  for text in df[column]:
    # Strip non-letters first so training text goes through the same steps
    # that `sentence_cleaning` applies at prediction time (the original
    # skipped this step here).
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(w) for w in text if w not in stopwords]
    text = " ".join(text)
    corpus.append(text)
  # Encode each cleaned sentence as a list of integer word indices.
  # NOTE(review): Keras' one_hot appears to hash words with Python's
  # built-in hash(), which is salted per process; if so, the indices learned
  # here will not match in a different Python process that loads the saved
  # model. Confirm, and consider hashing_trick(..., hash_function='md5').
  one_hot_word = [one_hot(input_text=sentence,n=vocab_size) for sentence in corpus]
  pad = pad_sequences(sequences=one_hot_word,maxlen= max_len,
                        padding='pre')
  return pad
# vocab_size and max_len here match the Embedding layer defined further
# down (input_dim=11000, input_length=300).
x_train = clean_text(train_data,'Comment',vocab_size=11000,max_len=300)

In [None]:
x_train.shape

In [None]:
# Encode the emotion names as integers, in place, for to_categorical below.
# Guarded so the cell is safe to re-run: once the column is already numeric,
# fitting a fresh encoder would make `lb` learn the integers, and
# lb.inverse_transform could no longer return emotion names.
if not pd.api.types.is_numeric_dtype(train_data['Emotion']):
  lb = LabelEncoder()
  train_data['Emotion'] = lb.fit_transform(train_data['Emotion'])

In [None]:
# -------Using to_categorical simple info------:
# .)You can use the to_categorical function to convert these
#  categorical labels into one-hot encoded vectors:

#  ----exmple-
# import numpy as np
# from keras.utils import to_categorical

# # Categorical labels
# labels = np.array([1, 2, 0, 3, 2])

# # Convert categorical labels to one-hot encoded vectors
# one_hot_labels = to_categorical(labels)

# print("Original Labels:")
# print(labels)

# print("\nOne-Hot Encoded Labels:")
# print(one_hot_labels)
# # Original Labels:
# [1 2 0 3 2]
# # One-Hot Encoded Labels:
# [[0. 1. 0. 0.]
#  [0. 0. 1. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 0. 1. 0.]]



In [None]:
# One-hot encode the integer emotion ids; the model below is compiled with
# categorical_crossentropy and ends in a softmax layer.
y_train =to_categorical(train_data['Emotion'])

In [None]:
y_train.shape

In [None]:
train_data

In [None]:
# .)In deep learning, an embedding layer is a fundamental
# component used primarily in natural language processing
#  (NLP) tasks to convert categorical data, such as words
#   or characters, into dense vectors of fixed size.
#    These dense vectors, called embeddings, represent
#    the semantic meaning or contextual information of
#    the input categorical data in a continuous vector space.

# Here's a simple explanation and an example of an embedding layer:

# Explanation:
# Purpose of Embedding Layer:

# It learns and maps each categorical input
#  (e.g., words from a vocabulary) to a continuous
#   vector space, where words with similar meanings
#    or contexts are closer together in this space.

# Functionality:

# The embedding layer is the initial layer in a neural
# network that performs the mapping from discrete categorical
# data (like words represented as integers) to continuous vector
#  representations (embeddings).
# It is typically used as the first layer in a neural
# network model for NLP tasks.

# Model Building and Training

In [None]:
'''
input_dim: This parameter specifies the size of the vocabulary, i.e., the total number of unique words in your dataset. In this case, it's set to 11000, indicating that your vocabulary has 11000 unique words.

output_dim: This parameter specifies the dimensionality of the dense embeddings. Each word in the vocabulary will be represented by a dense vector of this size. In this case, it's set to 150, meaning that each word will be represented by a 150-dimensional vector.

input_length: This parameter specifies the length of input sequences, i.e., the number of words in each sequence. In this case, it's set to 300, indicating that your model expects input sequences with a length of 300 words.

So, the Embedding layer in this context is creating a word embedding matrix with a vocabulary size of 11000, where each word is represented by a dense vector of 150 dimensions. The input sequences are expected to have a length of 300 words. The purpose of the Embedding layer is to learn dense representations of words in a continuous vector space, which can be further processed by other layers in your neural network.
'''

# Layer stack, in order: embedding (integer word indices -> dense vectors),
# LSTM over the sequence, one hidden dense layer, and a 6-way softmax output.
# A Dropout(0.2) follows each of the first three.
model = Sequential([
    Embedding(input_dim=11000, output_dim=150, input_length=300),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation='sigmoid'),
    Dropout(0.2),
    Dense(6, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=64, epochs=5, verbose=1)

In [None]:
# .)The function sentence_cleaning appears to perform the
#  following steps on the input sentence:

# 1)Removes non-alphabetic characters and converts the text to lowercase.
# 2)Splits the sentence into words.
# 3)Stems each word using the Porter Stemmer (from NLTK).
# 4)Removes stopwords (assuming stopwords is defined somewhere
#   in the code, typically a list of common words that do not
#               contribute much to the meaning of a sentence).
# 5)Joins the preprocessed words into a string (text)
# and appends it to the corpus list.
# 6)Converts the text into a one-hot encoded representation
# based on a vocabulary size of 11000.
# 7)Pads the one-hot encoded representation to a fixed length
#  of 300 tokens.
# 8)After cleaning each sentence, the code attempts to predict
# the sentiment using a model (model.predict(sentence)) and print the predicted result along with the associated probability.

# ----sentence_cleaning Function:
# .)This function takes in a sentence as input
# and performs various text preprocessing steps on it.

# ---Text Preprocessing:

# .)It uses regular expressions (re.sub) to remove any characters that are not alphabetic and replaces them with spaces.
# .)Converts the text to lowercase.
# .)Splits the sentence into individual words.
# .)Stemming and Stopword Removal:

# .)It applies stemming to each word in the sentence using the Porter Stemmer (stemmer.stem(word)). Stemming reduces words to their root form.
# .)Removes stopwords (assuming stopwords is defined elsewhere in the code). Stopwords are common words (like "and," "the," "is") that often do not carry significant meaning in the context of analysis.
# -----One-Hot Encoding:

# .)Joins the preprocessed words into a string (text) and
#  appends it to the corpus list.
# .)Converts the text into a one-hot encoded representation
# using a vocabulary size of 11000.

# ------Padding Sequences:

# .)Pads the one-hot encoded representation to a fixed
#  length of 300 tokens using pad_sequences
#   (assuming it's defined elsewhere in the code). This ensures that all sequences have the same length.
# -----Predicting Sentiment:
# .)The code then defines a list of example sentences
#  (sentences) and iterates through each sentence:

# .)For each sentence, it cleans the sentence using the
# sentence_cleaning function.

# .)It then predicts the emotion with the LSTM trained above
#  (model.predict(sentence)) and decodes the highest-probability
#   class index back to an emotion name with the label encoder lb
#    fitted earlier in this notebook.

# .)It attempts to print the predicted sentiment and
# its associated probability (result and proba respectively).

# .)Note: this cell depends on objects created in earlier cells:
#  the stopwords set, the trained model (model), the label
#   encoder (lb), and the imported helpers one_hot and
#    pad_sequences. Run those cells first.

# Predictive System

In [None]:
# Text cleaning function (verbose variant: prints every intermediate stage)
def sentence_cleaning(sentence):
    """Clean one sentence and encode it as a padded integer sequence.

    Prints the cleaned text, the one-element corpus, the integer encoding
    and the padded array along the way. A second definition of
    `sentence_cleaning` without the prints appears in the next cell and
    replaces this one as soon as that cell runs.

    Args:
        sentence: raw sentence string.

    Returns:
        The pad_sequences result for this single sentence (maxlen=300,
        pre-padded).
    """
    stemmer = PorterStemmer()
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)
    tokens = letters_only.lower().split()
    text = " ".join([stemmer.stem(token) for token in tokens if token not in stopwords])
    print("Text: ", text)
    corpus = [text]
    print("Corpus",corpus)
    # one_hot maps every word of the sentence to an integer index
    # (n=11000 sets the size of the index range).
    one_hot_word = [one_hot(input_text=entry, n=11000) for entry in corpus]
    print("One Hot words",one_hot_word)
    # Sentences encode to different lengths; pre-padding to 300 gives the
    # network a fixed-size input.
    pad = pad_sequences(sequences=one_hot_word, maxlen=300, padding='pre')
    print("Pad",pad)
    return pad

# load model and predict
sentences = [
    "i feel strong and good overall",
    "im grabbing a minute to post i feel greedy wrong",
    "He was speechles when he found out he was accepted to this new job",
    "This is outrageous, how can you talk like that?",
    "I feel like im all alone in this world",
    "He is really sweet and caring",
    "You made me very crazy",
    "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
    "i am feeling grouchy",
    "He hates you",
]
for sentence in sentences:
    print(sentence)
    # Keep the raw sentence and its padded encoding in separate names.
    padded = sentence_cleaning(sentence)
    # Run the network once per sentence and reuse the class probabilities
    # for both the label and the confidence (the original predicted twice).
    probabilities = model.predict(padded)
    result = lb.inverse_transform(np.argmax(probabilities, axis=-1))[0]
    proba = np.max(probabilities)
    print(f"{result} : {proba}\n\n")

In [None]:
# Text cleaning function
def sentence_cleaning(sentence, vocab_size=11000, max_len=300):
    """Clean one sentence and encode it as a padded integer sequence.

    This definition replaces the verbose `sentence_cleaning` from the
    previous cell. Steps: replace non-letters with spaces, lowercase, split,
    drop stopwords, Porter-stem, encode with one_hot, pre-pad.

    Args:
        sentence: raw sentence string.
        vocab_size: `n` passed to one_hot. Defaults to 11000, the value the
            training call in this notebook uses.
        max_len: `maxlen` passed to pad_sequences. Defaults to 300, the
            value the training call in this notebook uses.

    Returns:
        The pad_sequences result for this single sentence.
    """
    stemmer = PorterStemmer()
    text = re.sub("[^a-zA-Z]", " ", sentence)
    tokens = text.lower().split()
    cleaned = " ".join([stemmer.stem(word) for word in tokens if word not in stopwords])
    # One-element batch: the model expects a 2-D array of sequences.
    encoded = [one_hot(input_text=cleaned, n=vocab_size)]
    return pad_sequences(sequences=encoded, maxlen=max_len, padding='pre')

# load model and predict
sentences = [
    "i feel strong and good overall",
    "im grabbing a minute to post i feel greedy wrong",
    "He was speechles when he found out he was accepted to this new job",
    "This is outrageous, how can you talk like that?",
    "I feel like im all alone in this world",
    "He is really sweet and caring",
    "You made me very crazy",
    "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
    "i am feeling grouchy",
    "He hates you",
]
for sentence in sentences:
    print(sentence)
    # Keep the raw sentence and its padded encoding in separate names.
    padded = sentence_cleaning(sentence)
    # Run the network once per sentence and reuse the class probabilities
    # for both the label and the confidence (the original predicted twice).
    probabilities = model.predict(padded)
    result = lb.inverse_transform(np.argmax(probabilities, axis=-1))[0]
    proba = np.max(probabilities)
    print(f"{result} : {proba}\n\n")

# Save the model and files

In [None]:
# Persist the trained network (HDF5 file).
model.save('model1.h5')

# Save the LabelEncoder so predicted class ids can be decoded later
with open('lb1.pkl', 'wb') as encoder_file:
    pickle.dump(lb, encoder_file)

# Save vocabulary size and max length used by the preprocessing functions.
# NOTE(review): if Keras' one_hot relies on Python's per-process salted
# hash(), these two numbers are not enough to reproduce the word indices in
# another process - confirm before deploying the saved model.
vocab_info = dict(vocab_size=11000, max_len=300)
with open('vocab_info.pkl', 'wb') as info_file:
    pickle.dump(vocab_info, info_file)