In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Load the labelled tweet dataset (decoded as ISO-8859-1) and preview the first rows.
data = pd.read_csv(
    '/content/judge-1377884607_tweet_product_company (1).csv',
    encoding='ISO-8859-1',
)
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Summarise the columns: dtype and non-null count for each.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
# List the column names of the loaded frame.
data.columns

Index(['tweet_text', 'emotion_in_tweet_is_directed_at',
       'is_there_an_emotion_directed_at_a_brand_or_product'],
      dtype='object')

In [6]:
# Distinct sentiment labels present in the target column.
data['is_there_an_emotion_directed_at_a_brand_or_product'].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [7]:
# Drop the brand/product column; only the tweet text and the sentiment label
# are used further down. Reassigning (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to run more than once: a second run is
# a no-op instead of a KeyError.
data = data.drop(columns=['emotion_in_tweet_is_directed_at'], errors='ignore')

In [8]:
# Display the frame with the remaining tweet-text and sentiment-label columns.
data

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion
...,...,...
9088,Ipad everywhere. #SXSW {link},Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,No emotion toward brand or product


In [None]:
#data preprocessing

In [9]:
import string
import re
import nltk

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK resources used by the preprocessing helpers: tokenizer models,
# the English stop-word list and WordNet for lemmatization.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer NLTK
# releases (older releases load 'punkt'), so both are fetched to keep the
# notebook runnable on a fresh environment regardless of the installed version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance used by lemmatized_text().
wordnet_lem = WordNetLemmatizer()

def remove_punctuation_of_text(text):
    """Return `text` without ASCII punctuation and without `http...` runs.

    Characters listed in `string.punctuation` are removed first; afterwards
    every run of non-whitespace characters that starts with "http" is deleted.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return re.sub(r'http\S+', '', ''.join(kept_chars))

def tokenization(text):
    """Split `text` into word tokens.

    Every character that is not an ASCII letter or whitespace is removed
    before the remaining text is handed to NLTK's `word_tokenize`.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return word_tokenize(letters_only)

# NLTK's English stop-word list, held as a set for the membership test in remove_stop_words().
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Return the tokens of `text` whose lower-cased form is not in `stop_words`.

    `text` is an iterable of token strings; the surviving tokens keep their
    original casing and order.
    """
    kept = []
    for token in text:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatized_text(text):
    """Lemmatize every token in `text` with the shared `wordnet_lem` instance.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lem.lemmatize(token))
    return lemmas

def lower_case(text):
    """Return a new list holding the lower-cased form of each token in `text`."""
    lowered = []
    for token in text:
        lowered.append(token.lower())
    return lowered

def preprocess(column):
    """Clean every entry of `column` and return one token list per entry.

    Each entry is tokenized, lower-cased, stripped of stop words and
    lemmatized, in that order.

    Parameters
    ----------
    column : iterable
        Raw texts (e.g. a pandas Series). Non-string entries are converted
        with `str()`, except missing values (None / float NaN), which are
        treated as empty text.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input entry, in input order. Missing
        entries yield an empty list.
    """
    corpus = []
    for item in column:
        if not isinstance(item, str):
            # A missing tweet arrives as NaN/None. Passing it through str()
            # would inject the literal word "nan"/"none" into the corpus, so
            # map it to empty text instead. NaN is the only float for which
            # `item != item` holds.
            is_missing = item is None or (isinstance(item, float) and item != item)
            item = '' if is_missing else str(item)
        new_item = tokenization(item)
        new_item = lower_case(new_item)
        new_item = remove_stop_words(new_item)
        new_item = lemmatized_text(new_item)
        corpus.append(new_item)
    return corpus

# Store the cleaned token list for every tweet in a new 'processed' column.
data['processed'] = preprocess(data['tweet_text'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
# Join each token list into one space-separated string, the form the Keras
# tokenizer is fitted on. Values that are already strings are passed through
# unchanged, so re-running this cell does not split the joined text into
# single characters ("hello" -> "h e l l o").
data['processed'] = data['processed'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [None]:
#tokenization

In [11]:
from keras.preprocessing import text
from keras.utils import pad_sequences

# Fit the Keras tokenizer on the cleaned tweets, then encode every tweet
# as a sequence of integer word indices.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data['processed'].tolist())
tokenized_text = tokenizer.texts_to_sequences(data['processed'])


In [12]:
# Bring every encoded tweet to a fixed length of 100; this must match the
# input_length given to the Embedding layer of the model.
x = pad_sequences(tokenized_text, maxlen=100)

In [13]:
# Number of distinct words the tokenizer learned (the Embedding layer uses this value + 1 as input_dim).
len(tokenizer.word_index)

9528

In [None]:
#label encoding

In [14]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Map the sentiment strings to integer ids, then one-hot encode them.
# num_classes=4 matches the four label values in the data and the 4-unit
# softmax output layer of the model.
le = LabelEncoder()
encoded_labels = le.fit_transform(data.is_there_an_emotion_directed_at_a_brand_or_product)
y_encoded = to_categorical(encoded_labels, num_classes=4)

# Hold out 20% of the tweets for testing.
# random_state pins the shuffle so the reported metrics are reproducible on
# re-run; stratify keeps the class proportions identical in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

In [None]:
#data modeling

In [15]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SimpleRNN, Dropout

# Embedding -> two stacked LSTMs -> dense head with a 4-way softmax output;
# a 0.5 dropout follows each LSTM and the hidden dense layer.
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=100),
    LSTM(50, return_sequences=True),
    Dropout(0.5),
    LSTM(30),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax'),
])

In [16]:
# Configure training for one-hot multi-class targets and print the layer overview.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1219712   
                                                                 
 lstm (LSTM)                 (None, 100, 50)           35800     
                                                                 
 dropout (Dropout)           (None, 100, 50)           0         
                                                                 
 lstm_1 (LSTM)               (None, 30)                9720      
                                                                 
 dropout_1 (Dropout)         (None, 30)                0         
                                                                 
 dense (Dense)               (None, 50)                1550      
                                                                 
 dropout_2 (Dropout)         (None, 50)                0

In [17]:
# Train for 10 epochs, setting aside 10% of the training data for validation.
training_history = model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Show the raw text of the first tweet.
data['tweet_text'][0]

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [19]:
# Predicted class probabilities for the held-out test tweets (one row per tweet, one column per class).
y_preds = model.predict(x_test)



In [22]:
# Inspect the raw predicted probabilities.
y_preds

array([[2.4361225e-06, 6.7462652e-06, 7.1040489e-04, 9.9928045e-01],
       [3.1658690e-10, 7.7320039e-10, 9.9995154e-01, 4.8393937e-05],
       [4.1887379e-04, 3.8152572e-04, 6.5537512e-01, 3.4382439e-01],
       ...,
       [3.0740602e-03, 2.5106587e-03, 5.5129218e-01, 4.4312307e-01],
       [1.8225432e-10, 4.3864876e-10, 9.9994385e-01, 5.6128683e-05],
       [1.0475206e-10, 2.6344513e-10, 9.9996817e-01, 3.1887161e-05]],
      dtype=float32)

In [21]:
# Index of the most probable class for each test tweet.
y_pred_labels = y_preds.argmax(axis=1)

In [23]:
# Map the predicted class indices back to the original sentiment label strings.
labels = le.inverse_transform(y_pred_labels)
labels

array(['Positive emotion', 'No emotion toward brand or product',
       'No emotion toward brand or product', ...,
       'No emotion toward brand or product',
       'No emotion toward brand or product',
       'No emotion toward brand or product'], dtype=object)

In [24]:
# Recover the integer class index from each one-hot encoded test label.
y_test_labels = y_test.argmax(axis=1)

In [25]:
from sklearn.metrics import accuracy_score

# Fraction of test tweets whose predicted class equals the true class.
accuracy_score(y_true=y_test_labels, y_pred=y_pred_labels)

0.680043980208906

In [None]:
# Accuracy on the held-out test set is about 0.68 (68%).