<a href="https://colab.research.google.com/github/AlyAhmedRbk/NLP-project-with-ML/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('train.txt', sep=";", header=None, names=['text', 'emotion'])

In [5]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [6]:
df.isnull()

Unnamed: 0,text,emotion
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
15995,False,False
15996,False,False
15997,False,False
15998,False,False


In [7]:
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


In [8]:
df['emotion'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [9]:
unique_emotion = df['emotion'].unique()
emotion_nos = {}
i = 0
for emo in unique_emotion:
  emotion_nos[emo] = i
  i += 1

In [10]:
df['emotion'] = df['emotion'].map(emotion_nos)

In [11]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [12]:
df['text'] = df['text'].apply(lambda x : x.lower())

In [13]:
import string

def remove_punc(text):
  return text.translate(str.maketrans('','',string.punctuation))

In [14]:
df['text'] = df['text'].apply(remove_punc)

In [15]:
def remove_nos(txt):
  new = ""
  for i in txt:
    if not i.isdigit():
      new = new + i
  return new

df['text'] = df['text'].apply(remove_nos)

In [16]:
def remove_emojis(text):
  new = ''
  for i in text:
    if i.isascii():
      new = new + i
  return new

df['text'] = df['text'].apply(remove_emojis)


In [17]:
import nltk

In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [19]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
stop_words = set(stopwords.words('english'))

In [21]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [22]:
len(stop_words)

198

In [23]:
df.loc[1]['text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [24]:
def remove(txt):
  # words = word_tokenize(txt)
  words = txt.split()
  cleanned = []
  for i in words:
    if not i in stop_words:
      cleanned.append(i)
  return ' '.join(cleanned)

df['text'] = df['text'].apply(remove)



In [25]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [26]:
# Bagofwords (BoW)
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "I love pizza",
    "Pizza is the best",
    "I love pasta",
    "pasta is great"
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

print("Vocabulary : ", vectorizer.get_feature_names_out())
print("\nBoW Matrix : \n", X.toarray())

Vocabulary :  ['best' 'great' 'is' 'love' 'pasta' 'pizza' 'the']

BoW Matrix : 
 [[0 0 0 1 0 1 0]
 [1 0 1 0 0 1 1]
 [0 0 0 1 1 0 0]
 [0 1 1 0 1 0 0]]


In [27]:
# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "I love pizza",
    "Pizza is the best",
    "I love pasta",
    "pasta is great"
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

print("Vocabulary : ", vectorizer.get_feature_names_out())
print("\nTFIDF Matrix : \n", X.toarray())

Vocabulary :  ['best' 'great' 'is' 'love' 'pasta' 'pizza' 'the']

TFIDF Matrix : 
 [[0.         0.         0.         0.70710678 0.         0.70710678
  0.        ]
 [0.55528266 0.         0.43779123 0.         0.         0.43779123
  0.55528266]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.66767854 0.52640543 0.         0.52640543 0.
  0.        ]]


In [44]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df['emotion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [51]:
bow_vectorizer = CountVectorizer()

X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [53]:
nb_model  = MultinomialNB()

In [54]:
nb_model.fit(X_train_bow, y_train)

In [56]:
pred_bow = nb_model.predict(X_test_bow)

In [57]:
print(accuracy_score(y_test, pred_bow))

0.768125


In [58]:
pred_bow

array([0, 5, 0, ..., 5, 5, 0])

In [59]:
y_test

Unnamed: 0,emotion
8756,0
4660,5
6095,0
304,5
8241,0
...,...
15578,5
5746,5
6395,5
7624,5


In [78]:
# Using TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [79]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [80]:
nb_model2  = MultinomialNB()

In [81]:
nb_model2.fit(X_train_bow, y_train)

In [82]:
pred_tfidf = nb_model2.predict(X_test_tfidf)

In [83]:
print(accuracy_score(y_test, pred_tfidf))

0.7275


In [84]:
# Using Logastic Regression
from sklearn.linear_model import LogisticRegression

In [89]:
logistic_model = LogisticRegression(max_iter=1000)

In [90]:
logistic_model.fit(X_train_tfidf, y_train)

In [92]:
log_pred = logistic_model.predict(X_test_tfidf)

In [94]:
print(accuracy_score(y_test, log_pred))

0.8628125
