In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
import matplotlib.pyplot as plt
from gensim.models import word2vec

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os

# Prepare data

In [2]:
train_data = pd.read_pickle("kaggle_data/train_data.pkl")
test_data = pd.read_pickle("kaggle_data/test_data.pkl")

# Data preprocess

## Selecting n samples

In [3]:
train_data['emotion'].value_counts()

joy             516017
anticipation    248935
trust           205478
sadness         193437
disgust         139101
fear             63999
surprise         48729
anger            39867
Name: emotion, dtype: int64

In [4]:
sample_train_data = train_data

In [5]:
sample_train_data = train_data.groupby("emotion").sample(n=39867, random_state=1)

In [6]:
sample_train_data.sample(frac=1)
sample_train_data['emotion'].value_counts()

disgust         39867
surprise        39867
joy             39867
trust           39867
anger           39867
fear            39867
sadness         39867
anticipation    39867
Name: emotion, dtype: int64

## Text vectorization use TFIDF and stemmer

In [6]:
porter = PorterStemmer()

def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [7]:
stem_datas = []
for sentence in sample_train_data['text']:
    stem_datas.append(stemSentence(sentence))

sample_train_data['stem_text'] = stem_datas

In [9]:
stem_datas = []
for sentence in test_data['text']:
    stem_datas.append(stemSentence(sentence))

test_data['stem_text'] = stem_datas

In [10]:
TFIDF_vectorizer = CountVectorizer(max_features=500, tokenizer=word_tokenize)
TFIDF_vectorizer.fit(sample_train_data['stem_text'])



CountVectorizer(max_features=500,
                tokenizer=<function word_tokenize at 0x000002CD80ED6820>)

In [11]:
trained_tokenized = TFIDF_vectorizer.transform(sample_train_data['stem_text'])
trained_answer = TFIDF_train_data['emotion']
target = TFIDF_vectorizer.transform(test_data['stem_text'])

## Split train, test

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(trained_tokenized, trained_answer, test_size=0.25, random_state=42)

In [13]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(180000, 500)
(180000,)
(60000, 500)
(60000,)


## Text vectorization with word embedding

In [7]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(sample_train_data['text'])

trained_vectors = tokenizer.texts_to_sequences(sample_train_data['text'])
target = tokenizer.texts_to_sequences(test_data['text'])

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

print(sample_train_data['text'].iloc[2])
print(trained_vectors[2])

290704
@BakaShift really Alain? You just followed me now? <LH>
[89, 6, 25, 1662, 17, 51, 1]


In [8]:
maxlen = 100

trained_vectors = pad_sequences(trained_vectors, padding='post', maxlen=maxlen)
trained_answer = sample_train_data['emotion']
target = pad_sequences(target, padding='post', maxlen=maxlen)

print(trained_vectors[0, :])

[   3   81   53    6  811   22   30    4   36   95    1 2916    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(trained_vectors, trained_answer, test_size=0.03, random_state=42)

In [10]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(309367, 100)
(309367,)
(9569, 100)
(9569,)


## Text vectorization with w2v

In [6]:
tokenized_train_features = [word_tokenize(each_train_text) for each_train_text in sample_train_data['text']]
tokenized_test_features = [word_tokenize(each_test_text) for each_test_text in test_data['text']]

In [7]:
vector_size = 300

w2v_model = word2vec.Word2Vec(
    tokenized_train_features,
    vector_size=vector_size,
    window=20,
    min_count=1,
    sg=1  # 1 for skip-gram; otherwise CBOW
)

In [8]:
vocab_list = list(w2v_model.wv.key_to_index.keys())
vocab_size = len(vocab_list)
vocab_list

['LH',
 '>',
 '<',
 '#',
 '@',
 '.',
 '!',
 'the',
 'I',
 'to',
 ',',
 'a',
 'and',
 'you',
 'is',
 '?',
 'of',
 'for',
 'in',
 'my',
 'on',
 'it',
 'that',
 "'s",
 "n't",
 'me',
 'your',
 'are',
 'be',
 '...',
 'with',
 'have',
 'this',
 '&',
 '’',
 'not',
 'at',
 'do',
 'so',
 'all',
 'just',
 'was',
 'but',
 'The',
 "'m",
 'up',
 'like',
 '``',
 "''",
 'can',
 ':',
 'get',
 'we',
 'out',
 'You',
 'from',
 'what',
 'will',
 'about',
 'by',
 '=',
 'when',
 'they',
 'today',
 'life',
 'one',
 '..',
 'people',
 'as',
 'day',
 'It',
 'no',
 'love',
 'time',
 'now',
 'he',
 'has',
 'who',
 'an',
 'God',
 '-',
 'know',
 'or',
 'i',
 'only',
 'how',
 'more',
 ')',
 'got',
 'realDonaldTrump',
 'am',
 'our',
 'if',
 's',
 'good',
 'go',
 'there',
 't',
 'make',
 'see',
 'did',
 'his',
 'What',
 'been',
 "'re",
 'So',
 '(',
 'back',
 'When',
 'My',
 '2',
 'going',
 'want',
 'u',
 'still',
 '....',
 'need',
 'would',
 'some',
 'really',
 'them',
 'work',
 'We',
 'us',
 'ca',
 'new',
 'had',
 'm

In [None]:
def remove_OOV_vocab(sample: list, list_vocab):
    """ Takes in tokenized sample in the form of list 
    and the vocabulary list and removes tokens from sample
    that are not in the vocabulary list"""
    in_vocab_sample = []
    for each_token in sample:
        if each_token in list_vocab:
            in_vocab_sample.append(each_token)
    return in_vocab_sample
  
tokenized_test_features = [remove_OOV_vocab(each_test_sample, vocab_list) for each_test_sample in tokenized_test_features]

In [9]:
vocab = w2v_model.wv.key_to_index.keys()
embedding_matrix = w2v_model.wv[vocab]

In [10]:
def w2v_indexed_token_sequences(w2v_model, list_features):
    indexed_features = []
    for each_seq in list_features:
        list_token_indices = []
        for each_token in each_seq:
            try:
                list_token_indices.append(w2v_model.wv.key_to_index[each_token])
            except KeyError as e:
                continue
        indexed_features.append(list_token_indices)
    return indexed_features

indexed_train_features = w2v_indexed_token_sequences(w2v_model, tokenized_train_features)
indexed_test_features = w2v_indexed_token_sequences(w2v_model, tokenized_test_features)

In [11]:
max_seq_len = 20

padded_train = pad_sequences(indexed_train_features, padding = 'post', maxlen=max_seq_len, truncating='post')
trained_answer = sample_train_data['emotion']
padded_test = pad_sequences(indexed_test_features, padding = 'post', maxlen=max_seq_len, truncating='post')

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_train, trained_answer, test_size=0.25, random_state=42)

NameError: name 'padded_train' is not defined

In [13]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(180000, 20)
(180000,)
(60000, 20)
(60000,)


# Predict

## decision tree

In [14]:
DT_model = DecisionTreeClassifier(random_state=1)

DT_model = DT_model.fit(X_train, Y_train)

Y_test_pred = DT_model.predict(X_test)

In [15]:
## accuracy
acc_train = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 0.31


In [16]:
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

[[2149  610  979  737  633  868  751  622]
 [ 579 2766  643  624  863  540  573  941]
 [1000  626 1955  746  587 1044  892  614]
 [ 737  646  747 2821  622  627  691  661]
 [ 648  809  643  680 2302  596  640 1096]
 [ 918  599 1132  714  651 2073  859  613]
 [ 831  606  973  773  741  864 2168  683]
 [ 658  940  679  625 1128  580  680 2204]]


### naive bayes

In [37]:
NB_model = MultinomialNB()

In [38]:
NB_model.fit(X_train, Y_train)

Y_test_pred = NB_model.predict(X_test)

In [39]:
acc_train = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 0.16


In [40]:
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

[[1112  911  422 2511  407  364  891  731]
 [ 835 1492  391 2310  440  386  977  698]
 [ 835  827  466 2521  483  383 1190  759]
 [ 619  630  318 3309  382  337 1349  608]
 [ 717  690  376 2882  561  397 1169  622]
 [ 794  751  434 2713  400  418 1236  813]
 [ 742  695  394 2853  428  375 1467  685]
 [ 782  848  407 2772  417  396 1084  788]]


### Random forest

In [37]:
forest_model = RandomForestClassifier(n_estimators=100,n_jobs = -1,random_state =50, min_samples_leaf = 10)

In [38]:
forest_model = forest_model.fit(X_train, Y_train)
Y_test_pred = forest_model.predict(X_test)

In [39]:
acc_train = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 0.25


In [40]:
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

[[258  80 207 134  54  84  49  45]
 [ 90 269 123 131  76  58  46  85]
 [151  91 216 145  44 104  48  58]
 [ 88  81 134 335  57  64  48  81]
 [101 114 129 188 202  64  50  85]
 [151  73 189 152  74 181  51  50]
 [104  69 170 187  72  87 150  62]
 [116  94 136 140  96  82  58 189]]


### Logistic regression

In [41]:
LR = LogisticRegression()

In [42]:
LR = LR.fit(X_train, Y_train)
Y_test_pred = LR.predict(X_test)
target_result = LR.predict(target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
acc_train = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 0.15


In [44]:
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

[[242  43 443  33   0  97  49   4]
 [181 120 413  25   0  85  50   4]
 [146  49 470  33   0 124  33   2]
 [157  27 401  68   0 148  84   3]
 [169  41 449  52   0 146  72   4]
 [154  45 495  36   0 129  61   1]
 [150  29 476  51   0 132  62   1]
 [174  47 474  41   0 113  62   0]]


## One hot encoding

In [11]:
label_encoder = LabelEncoder()
label_encoder.fit(train_data['emotion'])
print("classes:", label_encoder.classes_)
onehot_encoder = OneHotEncoder(sparse=False)
print('## Before conver\n')
print(Y_train[0:4])
def label_encode(le, oe, labels):
    enc = le.transform(labels)
    return oe.fit_transform(enc.reshape((len(enc), 1)))
    #return enc
    
def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

Y_train = label_encode(label_encoder, onehot_encoder, Y_train)
Y_test = label_encode(label_encoder, onehot_encoder, Y_test)

print('\n\n## After convert')
print(Y_train[0:4])


classes: ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']
## Before conver

521966         fear
1061014    surprise
186615        trust
924065         fear
Name: emotion, dtype: object


## After convert
[[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0.]]


In [12]:
# I/O check
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  100
output_shape:  8


### deep learning model
kera taught by class

In [31]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, GlobalAvgPool1D
from keras.layers import ReLU, Softmax, Dropout

# input layer
model_input = Input(shape=(input_shape, ))
X = model_input

# 1st hidden layer
X_W1 = Dense(units=128)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=128)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss="categorical_crossentropy",
              metrics=['accuracy'])

# show model construction
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 500)]             0         
                                                                 
 dense (Dense)               (None, 128)               64128     
                                                                 
 re_lu (ReLU)                (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 re_lu_1 (ReLU)              (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 8)                 1032      
                                                                 
 softmax (Softmax)           (None, 8)                 0     

In [32]:
X_train

<180000x500 sparse matrix of type '<class 'numpy.int64'>'
	with 2376028 stored elements in Compressed Sparse Row format>

In [33]:
# need to change to sparse tensor, so it can be model's input

import tensorflow as tf

def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

X_train_enc = convert_sparse_matrix_to_sparse_tensor(X_train)
X_test_enc = convert_sparse_matrix_to_sparse_tensor(X_test)
target_enc = convert_sparse_matrix_to_sparse_tensor(target)

In [34]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
sess

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce RTX 3060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6



<tensorflow.python.client.session.Session at 0x2cd8a777250>

In [35]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('logs/kaggle_training_log.csv')

# training setting
epochs = 50
batch_size = 128

# training!
history = model.fit(X_train_enc, Y_train, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    callbacks=[csv_logger],
                    validation_data = (X_test_enc, Y_test))
print('training finish')

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

KeyboardInterrupt: 

In [77]:
## predict
target_result = model.predict(target_enc, batch_size=128)
target_result = label_decode(label_encoder, target_result)
target_result[:5]



array(['anticipation', 'anticipation', 'disgust', 'trust', 'trust'],
      dtype=object)

### keras with embedding

In [13]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
sess

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce RTX 3060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6



<tensorflow.python.client.session.Session at 0x15d25fc7400>

In [21]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=10000, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Bidirectional(layers.LSTM(40, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(40)))
#model.add(layers.Dense(2048, activation='relu'))
#model.add(layers.Dense(1024, activation='relu'))
model.add(layers.Dense(8, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 50)           500000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 80)          29120     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 80)               38720     
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 8)                 648       
                                                                 
Total params: 568,488
Trainable params: 568,488
Non-trainable params: 0
_________________________________________________________________


In [22]:
history = model.fit(X_train, Y_train,
                    epochs=20,
                    validation_data=(X_test, Y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, Y_train)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, Y_test)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/20
Epoch 2/20
Epoch 3/20

KeyboardInterrupt: 

In [None]:
target_result = model.predict(target)
target_result = label_decode(label_encoder, target_result)
target_result[:5]

# result to csv

In [None]:
result_csv = pd.DataFrame(columns=['id', 'emotion'])

In [None]:
test_data

In [None]:
result_csv['id'] = test_data['tweet_id']
result_csv

In [None]:
result_csv['emotion'] = target_result
result_csv

In [None]:
result_csv.to_csv("kaggle_data/result.csv", index=False)