Importing Libraries

In [1]:
import numpy as np
import pandas as pd

Loading Dataset

In [2]:
import chardet

# Sniff the character encoding of the CSV from its raw bytes so that
# pandas can be told how to decode it.
csv_path = '/content/judge-1377884607_tweet_product_company.csv'
with open(csv_path, 'rb') as raw_file:
    raw_bytes = raw_file.read()

detection = chardet.detect(raw_bytes)
encoding = detection['encoding']

encoding

'MacRoman'

In [3]:
# Load the tweet/sentiment CSV, decoding it as MacRoman.
csv_path = '/content/judge-1377884607_tweet_product_company.csv'
data = pd.read_csv(csv_path, encoding='MacRoman')

EDA

In [4]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [5]:
# Show column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [6]:
# Class distribution of the sentiment label column.
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

is_there_an_emotion_directed_at_a_brand_or_product
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64

In [7]:
# Work on a copy of the raw frame without the brand/product target column.
twt = data.drop(columns=['emotion_in_tweet_is_directed_at'])

In [8]:
# Encode the sentiment label in "is_there_an_emotion_directed_at_a_brand_or_product"
# as an integer class id. "I can't tell" shares class 0 with the neutral label.
emotion_mapping = {
    "No emotion toward brand or product": 0,
    "Positive emotion": 1,
    "Negative emotion": 2,
    "I can't tell": 0
}

label_col = 'is_there_an_emotion_directed_at_a_brand_or_product'

# Map from the untouched `data` frame rather than overwriting twt[label_col]
# with its own mapped values: re-running the cell in its previous form mapped
# the already-encoded integers a second time and turned every label into NaN.
encoded_labels = data[label_col].map(emotion_mapping)

# .map() yields NaN for any label that is missing from the dict. Fail loudly
# instead of letting a later dropna() silently discard those rows.
unmapped_mask = data[label_col].notna() & encoded_labels.isna()
if unmapped_mask.any():
    raise ValueError(
        f"Unmapped sentiment labels: {list(data.loc[unmapped_mask, label_col].unique())}"
    )

# Rebuild `twt` from `data` so this cell is idempotent: same column order,
# with the label column replaced by its integer encoding.
twt = data.drop(columns=['emotion_in_tweet_is_directed_at']).assign(
    **{label_col: encoded_labels}
)

twt.head()


Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,2
1,@jessedee Know about @fludapp ? Awesome iPad/i...,1
2,@swonderlin Can not wait for #iPad 2 also. The...,1
3,@sxsw I hope this year's festival isn't as cra...,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,1


In [9]:
# Discard rows containing missing values and renumber the index from 0.
twt = twt.dropna().reset_index(drop=True)
twt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9092 entries, 0 to 9091
Data columns (total 2 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   is_there_an_emotion_directed_at_a_brand_or_product  9092 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 142.2+ KB


In [13]:
import gensim

# Tokenize every raw tweet with gensim's simple_preprocess; each row of
# 'cleaned_twt' holds the resulting list of tokens.
twt['cleaned_twt'] = twt['tweet_text'].apply(gensim.utils.simple_preprocess)
twt.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,cleaned_twt
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,2,"[wesley, have, iphone, after, hrs, tweeting, a..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,1,"[jessedee, know, about, fludapp, awesome, ipad..."
2,@swonderlin Can not wait for #iPad 2 also. The...,1,"[swonderlin, can, not, wait, for, ipad, also, ..."
3,@sxsw I hope this year's festival isn't as cra...,2,"[sxsw, hope, this, year, festival, isn, as, cr..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,1,"[sxtxstate, great, stuff, on, fri, sxsw, maris..."


In [15]:
# Vocabulary size: number of distinct tokens across all of "cleaned_twt".
unique_words = {token for tokens in twt['cleaned_twt'] for token in tokens}
unq_word_count = len(unique_words)
print(f"Number of unique words: {unq_word_count}")


Number of unique words: 9333


In [16]:
# Tokenizing: fit a Keras Tokenizer on the token lists and convert each
# tweet to a sequence of integer word ids.
from keras.preprocessing.text import Tokenizer

token_lists = twt['cleaned_twt']

# NOTE(review): per the Keras docs only the `num_words - 1` most frequent words
# are kept, so with num_words=unq_word_count the rarest word is presumably
# dropped. Raising this requires raising the Embedding input_dim as well.
tokenizer = Tokenizer(num_words=unq_word_count)
tokenizer.fit_on_texts(token_lists)
seq = tokenizer.texts_to_sequences(token_lists)

In [17]:
# Padding sequences: bring every tweet to a fixed length of 100 ids,
# padding short ones and cutting long ones at the end ('post').
from keras.preprocessing.sequence import pad_sequences

pad_options = dict(maxlen=100, padding='post', truncating='post')
padded_Seq = pad_sequences(seq, **pad_options)

In [18]:
# Integer-encode the sentiment labels. LabelEncoder produces one class id per
# row (not one-hot vectors).
from sklearn.preprocessing import LabelEncoder

sentiment_labels = twt['is_there_an_emotion_directed_at_a_brand_or_product']
label_en = LabelEncoder()
target_en = label_en.fit_transform(sentiment_labels)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation. The classes are heavily imbalanced
# (negative tweets are a small minority), so stratify on the labels to keep the
# class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    padded_Seq,
    target_en,
    test_size=0.2,
    random_state=42,
    stratify=target_en,
)

Building model

In [21]:
from keras.layers import Dense, LSTM, Embedding
from keras.models import Sequential
from keras.layers import Dropout

In [23]:
# Stacked-LSTM sentiment classifier: token ids -> 5-dim embeddings -> two LSTM
# layers (with dropout in between) -> dense head with a 3-way softmax.
model = Sequential([
    Embedding(input_dim=unq_word_count, output_dim=5, input_length=100),
    LSTM(50, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(60),
    Dense(50, activation='relu'),
    Dense(3, activation='softmax'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 5)            46665     
                                                                 
 lstm_2 (LSTM)               (None, 100, 50)           11200     
                                                                 
 dropout_1 (Dropout)         (None, 100, 50)           0         
                                                                 
 lstm_3 (LSTM)               (None, 60)                26640     
                                                                 
 dense_2 (Dense)             (None, 50)                3050      
                                                                 
 dense_3 (Dense)             (None, 3)                 153       
                                                                 
Total params: 87708 (342.61 KB)
Trainable params: 8770

In [24]:
# Targets are integer class ids (not one-hot), hence the sparse variant of
# categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [27]:
# Training: 10 epochs with mini-batches of 128 samples.
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported validation metrics are not an independent test estimate.
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
