## Import Librarries

In [42]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

import re 
import string
from nltk.corpus import stopwords
import warnings 
warnings.filterwarnings("ignore")

## Read Data

In [43]:
df = pd.read_csv("Sentiment.csv")

## Information About Data

In [44]:
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


----------------------
## Text Cleaning 

In [4]:
def remove_tweet_header(text):
    header = re.compile(r"RT @.*:")
    return header.sub(r"" , text)

#-----------------------
def remove_mentions(text):
    mention = re.compile(r"@[A-Za-z0-9_]+")
    return mention.sub(r"" , text)    

#-----------------------
def remove_hashtags(text):
    hashtag = re.compile(r"#\w+")
    return hashtag.sub(r"" , text) 

#-----------------------
def remove_numeric(text):
    nums = re.compile(r"\d")
    return nums.sub(r"",text)
#-----------------------
def remove_URL(text):
    url =re.compile( r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

#-----------------------

def remove_html(text):
    html =re.compile( r"<.*?>")
    return html.sub(r"", text)

#-----------------------

def remove_emojis(text):
    emoji = re.compile(
   "["
       u"\U0001F600-\U0001F64F"  # emoticons
       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
       u"\U0001F680-\U0001F6FF"  # transport & map symbols
       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
       u"\U00002500-\U00002BEF"  # chinese char
       u"\U00002702-\U000027B0"
       u"\U00002702-\U000027B0"
       u"\U000024C2-\U0001F251"
       u"\U0001f926-\U0001f937"
       u"\U00010000-\U0010ffff"
       u"\u2640-\u2642" 
       u"\u2600-\u2B55"
       u"\u200d"
       u"\u23cf"
       u"\u23e9"
       u"\u231a"
       u"\ufe0f"  # dingbats
       u"\u3030"
       "]+", flags=re.UNICODE
    )
    return emoji.sub(r"",text)

#-----------------------

def remove_punct(text):
    table = str.maketrans("","",string.punctuation)
    return text.translate(table)

#-----------------------

def remove_stopwords(text):
    stop = stopwords.words("english")
    text = [word.lower() for word in  text.split(" ") if word not in stop]
    return  " ".join(text)

In [5]:
x = df["text"].map(remove_tweet_header)
x = x.map(remove_numeric)
x = x.map(remove_mentions)
#x= x.map(remove_hashtags)
x = x.map(remove_URL)
x = x.map(remove_html)
x = x.map(remove_emojis)
x= x.map(remove_punct)
x = x.map(remove_stopwords)

In [6]:
from nltk.stem import PorterStemmer

def stemming(text):
    stemmer = PorterStemmer()
    stemmed = []
    for word in text.split():
        stemmed.append(stemmer.stem(word))
    return " ".join(stemmed)

x = x.map(stemming)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_generate(text , n_gram = (1,1)):
    tfidf_vectorizer = TfidfVectorizer(ngram_range=n_gram)
    return tfidf_vectorizer.fit_transform(text)
    

In [8]:
x_tfidf = tfidf_generate(x)
y = pd.get_dummies(df["sentiment"]).values

In [9]:
lables = list(df["sentiment"].value_counts().keys())
y_ord = df["sentiment"].replace(lables[0],0)
y_ord = y_ord.replace(lables[1],1)
y_ord = y_ord.replace(lables[2],2)
y_ord.value_counts()

0     8493
1     3142
22    2236
Name: sentiment, dtype: int64

## Split Data

In [10]:
from sklearn.model_selection import train_test_split 

x_train , x_test , y_train , y_test = train_test_split(x_tfidf,y_ord,random_state = 123 , stratify=y)

## Build Model [ML]

In [11]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight="balanced")

model.fit(x_train,y_train)

LogisticRegression(class_weight='balanced')

In [12]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test)

print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.83      0.65      0.73      2123
           1       0.43      0.58      0.49       786
          22       0.48      0.62      0.54       559

    accuracy                           0.63      3468
   macro avg       0.58      0.62      0.59      3468
weighted avg       0.68      0.63      0.65      3468



---------------
## Prepare For Embedding 

In [13]:
from collections import Counter
#count unique words 
def count_words(text):
    count = Counter()
    for sent in text.values :
        for word in sent.split():
            count[word]+=1
    return count

counter = count_words(x)
num_words = len(counter)
num_words

10366

In [14]:
def max_len(text):
    length = []
    for sent in text.values :
        count = Counter()
        for word in sent.split():
            count[word]+=1
        length.append(len(count))
    return max(length)

max_length = max_len(x)
max_length

22

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(x)

In [16]:
word_index = tokenizer.word_index

In [17]:
x_toknized_sents = tokenizer.texts_to_sequences(x)

In [18]:
print(x[2])
x_toknized_sents[2]

no mention tamir rice gopdeb held cleveland wow


[93, 213, 3364, 3365, 1, 1390, 727, 442]

In [19]:
from keras.preprocessing.sequence import pad_sequences 

x_padded = pad_sequences(x_toknized_sents , maxlen= max_length,padding="post")
x_padded[2]

array([  93,  213, 3364, 3365,    1, 1390,  727,  442,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [20]:
x_train , x_test , y_train , y_test = train_test_split(x_padded,y,random_state=123)

## Build Neural Network [LSTM]

In [21]:
import tensorflow as tf 
from tensorflow import keras 


model = keras.models.Sequential([
        keras.layers.Embedding(num_words , 128, input_length= max_length),
        keras.layers.LSTM(128,dropout=0.1,recurrent_dropout=0.2),
        keras.layers.Dense(128,activation = "tanh"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64,activation = "tanh"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64,activation = "tanh"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32,activation = "tanh"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32,activation = "tanh"),
        keras.layers.Dense(3,activation = "softmax")

])

model.compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
    metrics = ['accuracy']
)
model.fit(x_train,y_train,epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x2b517c1aca0>

In [22]:
model.evaluate(x_test,y_test)



[1.3213591575622559, 0.6286044120788574]

-------------