In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

pd.options.display.max_colwidth = 200

In [2]:
df = pd.read_csv('tweet_emotions .csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


# EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [4]:
df.isna().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

In [5]:
df['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [6]:
emotion_count = df['sentiment'].nunique()
emotion_count

13

# Preprocessing

In [7]:
df.drop(columns=['tweet_id'], inplace=True) # No valuable information

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment']) # encoding sentiments to numeric values

In [10]:
class_to_index = {i: label for i, label in zip(le.classes_, range(0, len(le.classes_)))}
index_to_class = {label: i for i, label in class_to_index.items()}
class_to_index, index_to_class

({'anger': 0,
  'boredom': 1,
  'empty': 2,
  'enthusiasm': 3,
  'fun': 4,
  'happiness': 5,
  'hate': 6,
  'love': 7,
  'neutral': 8,
  'relief': 9,
  'sadness': 10,
  'surprise': 11,
  'worry': 12},
 {0: 'anger',
  1: 'boredom',
  2: 'empty',
  3: 'enthusiasm',
  4: 'fun',
  5: 'happiness',
  6: 'hate',
  7: 'love',
  8: 'neutral',
  9: 'relief',
  10: 'sadness',
  11: 'surprise',
  12: 'worry'})

In [11]:
df

Unnamed: 0,sentiment,content
0,2,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,10,Layin n bed with a headache ughhhh...waitin on your call...
2,10,Funeral ceremony...gloomy friday...
3,3,wants to hang out with friends SOON!
4,8,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."
...,...,...
39995,8,@JohnLloydTaylor
39996,7,Happy Mothers Day All my love
39997,7,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!"
39998,5,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!


In [12]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import unidecode
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def clean_text(text):
    #Removing URLs
    text = re.sub('((www.[^s]+)|(https?:\/\/.*?[\s+]))',' ', text)
    #Removing mentions
    text = re.sub('@[\w]*',' ', text)

    # Remove special characters and numbers
    text = re.sub(r'\.{2,}', ' ', text)
    text = re.sub(r',+', ', ', text)
    text = re.sub(r'[^A-Za-zÀ-ú ]+', '', text)
    # Convert to lower case
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    text = unidecode.unidecode(text)
    return text

def remove_stopwords(texto):
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(texto.lower())
    return " ".join([token for token in tokens if token not in stop_words])

lemmatizer = WordNetLemmatizer()
def simple_lemmatizer(text):
    text= ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [None]:
df['content_clean'] = df['content'].apply(clean_text)
df['content_clean'] = df['content_clean'].apply(remove_stopwords)
df['content_clean'] = df['content_clean'].apply(simple_lemmatizer)

In [None]:
df.head(30)

# Feature Extraction

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
lengths = [len(t.split(' ')) for t in df['content_clean']]
plt.hist(lengths, bins=len(set(lengths)))
plt.show()

The appropriate max words per sentence seems to be 20

In [None]:
maxlen = 20
vocab_size = 30000

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['content_clean'])

word_index = tokenizer.word_index
vocab_size = len(word_index)

In [None]:
def get_seqeuences(tokenizer, tweets):
  sequences = tokenizer.texts_to_sequences(tweets)
  padded = pad_sequences(sequences, truncating="post", padding="post", maxlen=maxlen)
  return padded

In [None]:
padded_tweets = get_seqeuences(tokenizer, df['content_clean'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(padded_tweets, df['sentiment'], test_size=0.2, random_state=134)

# Model Building

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GlobalAveragePooling1D, Flatten, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [26]:
model = Sequential()
model.add(Embedding(vocab_size, 16, input_length = maxlen, embeddings_regularizer=l2(0.01)))
model.add(Bidirectional(LSTM(20, return_sequences=True)))
model.add(Bidirectional(LSTM(20)))
model.add(Dropout(0.4))
model.add(Dense(32, activation = "relu", kernel_reg))
model.add(Dropout(0.4))
model.add(Dense(emotion_count, activation = "softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 16)            481712    
                                                                 
 bidirectional (Bidirectiona  (None, 20, 40)           5920      
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 40)               9760      
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 40)                0         
                                                                 
 dense (Dense)               (None, 32)                1312      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0

In [27]:
#model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
#history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=15)

In [28]:
model.evaluate(x_test, y_test)

RuntimeError: ignored

In [29]:
model2 = Sequential()
model2.add(Embedding(vocab_size, 16, input_length = maxlen))
model2.add(GlobalAveragePooling1D())
model2.add(Dropout(0.3))
model2.add(Dense(16, activation = 'relu'))
model2.add(Dropout(0.7))
model2.add(Dense(13, activation = 'softmax'))
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 16)            481712    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                272       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 13)                221       
                                                      

In [None]:
model2.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.0005), metrics=['accuracy'])
history = model2.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=40, callbacks=[EarlyStopping(monitor='val_accuracy', patience=5)])

Epoch 1/40
  97/1000 [=>............................] - ETA: 3:09 - loss: 2.5315 - accuracy: 0.1827

In [None]:
model2.evaluate(x_test, y_test)