# Prepare data

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_pickle("kaggle_data/train_data.pkl")
test_data = pd.read_pickle("kaggle_data/test_data.pkl")

# data transform and predict

## TFIDF with n samples

In [2]:
TFIDF_train_data = train_data.sample(n=20000)
#TFIDF_train_data = train_data

In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#TFIDF_vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
TFIDF_vectorizer = TfidfVectorizer(max_features=1000, tokenizer=nltk.word_tokenize)
TFIDF_vectorizer.fit(TFIDF_train_data['text'])



TfidfVectorizer(max_features=1000,
                tokenizer=<function word_tokenize at 0x00000138D9A00CA0>)

In [4]:
import os
from sklearn.model_selection import train_test_split
from scipy import sparse

trained_tokenized = TFIDF_vectorizer.transform(TFIDF_train_data['text'])
trained_answer = TFIDF_train_data['emotion']

X_train, X_test, Y_train, Y_test = train_test_split(trained_tokenized, trained_answer, test_size=0.25, random_state=42)

target = TFIDF_vectorizer.transform(test_data['text'])

'''
if os.path.isfile('kaggle_data/TFIDF_X_train.npz'):
    X_train = sparse.load_npz('kaggle_data/TFIDF_X_train.npz')
else:
    X_train = TFIDF_vectorizer.transform(train_data['text'])
    sparse.save_npz("kaggle_data/TFIDF_X_train.npz", X_train)
    
Y_train = train_data['emotion']
 
if os.path.isfile('kaggle_data/TFIDF_X_test.npz'):
    X_test = sparse.load_npz('kaggle_data/TFIDF_X_test.npz')
else:
    X_test = TFIDF_vectorizer.transform(test_data['text'])
    sparse.save_npz("kaggle_data/TFIDF_X_test.npz", X_test)
'''

'\nif os.path.isfile(\'kaggle_data/TFIDF_X_train.npz\'):\n    X_train = sparse.load_npz(\'kaggle_data/TFIDF_X_train.npz\')\nelse:\n    X_train = TFIDF_vectorizer.transform(train_data[\'text\'])\n    sparse.save_npz("kaggle_data/TFIDF_X_train.npz", X_train)\n    \nY_train = train_data[\'emotion\']\n \nif os.path.isfile(\'kaggle_data/TFIDF_X_test.npz\'):\n    X_test = sparse.load_npz(\'kaggle_data/TFIDF_X_test.npz\')\nelse:\n    X_test = TFIDF_vectorizer.transform(test_data[\'text\'])\n    sparse.save_npz("kaggle_data/TFIDF_X_test.npz", X_test)\n'

In [5]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(15000, 1000)
(15000,)
(5000, 1000)
(5000,)


### decision tree

In [24]:
from sklearn.tree import DecisionTreeClassifier
DT_model = DecisionTreeClassifier(random_state=1)

DT_model = DT_model.fit(X_train, Y_train)

Y_train_pred = DT_model.predict(X_train)
Y_test_pred = DT_model.predict(X_test)

## so we get the pred result
Y_train_pred

array(['joy', 'joy', 'joy', ..., 'joy', 'joy', 'anticipation'],
      dtype=object)

In [25]:
## accuracy
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_true=Y_train, y_pred=Y_train_pred)

print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 1.0


In [26]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=Y_train, y_pred=Y_train_pred) 
print(cm)

[[ 267    0    1    0    1    1    0    0]
 [   0 1674    1    0    4    1    0    0]
 [   0    1 1003    0    0    0    0    0]
 [   0    0    0  428    3    0    0    0]
 [   0    1    1    0 3501    1    0    0]
 [   1    0    1    0   11 1373    0    0]
 [   0    1    0    0    1    1  332    0]
 [   0    1    3    0    9    0    1 1376]]


### naive bayes

In [47]:
from sklearn.naive_bayes import MultinomialNB
NB_model = MultinomialNB()

In [48]:
NB_model.fit(X_train, Y_train)

Y_train_pred = NB_model.predict(X_train)
Y_result = NB_model.predict(X_test)

In [49]:
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_true=Y_train, y_pred=Y_train_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 0.4


In [50]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=Y_train, y_pred=Y_train_pred) 
print(cm)

[[  12    1    0    0  257    0    0    0]
 [   0  279    0    0 1400    1    0    0]
 [   0    0    9    0  994    1    0    0]
 [   0    1    0    0  430    0    0    0]
 [   0    0    0    0 3504    0    0    0]
 [   0    0    0    0 1250  136    0    0]
 [   0    0    0    0  335    0    0    0]
 [   0   12    0    0 1335    0    0   43]]


### Random forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
forest_model = RandomForestClassifier(n_estimators=100,n_jobs = -1,random_state =50, min_samples_leaf = 10)

In [67]:
forest_model = forest_model.fit(X_train, Y_train)

Y_train_pred = forest_model.predict(X_train)
Y_result = forest_model.predict(X_test)

In [68]:
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_true=Y_train, y_pred=Y_train_pred)
print('training accuracy: {}'.format(round(acc_train, 2)))

training accuracy: 0.48


In [69]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=Y_train, y_pred=Y_train_pred) 
print(cm)

[[  38    7   16    0  450   34    0    0]
 [   0  976   20    1 2253   75    3    7]
 [   0   18  304    2 1367  162    0    2]
 [   0   12    7  108  732   36    0    1]
 [   0   66   10    0 7023   60    0   12]
 [   0   38   21    1 1770  836    1    1]
 [   1    4    9    0  563   58   68    0]
 [   0   83    7    1 2460   54    2  220]]


### deep learning model
kera taught by class

In [65]:
import keras
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(train_data['emotion'])
print('check label: ', label_encoder.classes_)
print('y_train[0:4]:\n', Y_train[0:4])

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

Y_train = label_encode(label_encoder, Y_train)
Y_test = label_encode(label_encoder, Y_test)

print('\n\n## After convert')
print('Y_train[0:4]:\n', Y_train[0:4])


check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']
y_train[0:4]:
 [[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]


ValueError: y should be a 1d array, got an array of shape (15000, 8) instead.

In [66]:
# I/O check
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  1000
output_shape:  8


In [67]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, GlobalAvgPool1D
from keras.layers import ReLU, Softmax, Dropout

# input layer
model_input = Input(shape=(input_shape, ))
X = model_input

# 1st hidden layer
X_W1 = Dense(units=128)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=128)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss="categorical_crossentropy",
              metrics=['accuracy'])

# show model construction
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1000)]            0         
                                                                 
 dense_18 (Dense)            (None, 128)               128128    
                                                                 
 re_lu_10 (ReLU)             (None, 128)               0         
                                                                 
 dense_19 (Dense)            (None, 128)               16512     
                                                                 
 re_lu_11 (ReLU)             (None, 128)               0         
                                                                 
 dense_20 (Dense)            (None, 8)                 1032      
                                                                 
 softmax_5 (Softmax)         (None, 8)                 0   

In [68]:
X_train

<15000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 207342 stored elements in Compressed Sparse Row format>

In [69]:
# need to change to sparse tensor, so it can be model's input

import tensorflow as tf

def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

X_train_enc = convert_sparse_matrix_to_sparse_tensor(X_train)
X_test_enc = convert_sparse_matrix_to_sparse_tensor(X_test)
target_enc = convert_sparse_matrix_to_sparse_tensor(target)

In [70]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('logs/kaggle_training_log.csv')

# training setting
epochs = 50
batch_size = 32

# training!
history = model.fit(X_train_enc, Y_train, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    callbacks=[csv_logger],
                    validation_data = (X_test_enc, Y_test))
print('training finish')

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50

KeyboardInterrupt: 

In [18]:
## predict
target_result = model.predict(target_enc, batch_size=128)
target_result = label_decode(label_encoder, target_result)
target_result[:5]



array(['anticipation', 'anticipation', 'anticipation', 'trust',
       'anticipation'], dtype=object)

# result to csv

In [19]:
result_csv = pd.DataFrame(columns=['id', 'emotion'])

In [20]:
test_data

Unnamed: 0,hashtags,tweet_id,text,identification
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test
9,"[materialism, money, possessions]",0x218443,When do you have enough ? When are you satisfi...,test
30,"[GodsPlan, GodsWork]",0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
33,[],0x26289a,"In these tough times, who do YOU turn to as yo...",test
...,...,...,...,...
1867525,[],0x2913b4,"""For this is the message that ye heard from th...",test
1867529,[],0x2a980e,"""There is a lad here, which hath five barley l...",test
1867530,"[mixedfeeling, butimTHATperson]",0x316b80,When you buy the last 2 tickets remaining for ...,test
1867531,[],0x29d0cb,I swear all this hard work gone pay off one da...,test


In [21]:
result_csv['id'] = test_data['tweet_id']
result_csv

Unnamed: 0,id,emotion
2,0x28b412,
4,0x2de201,
9,0x218443,
30,0x2939d5,
33,0x26289a,
...,...,...
1867525,0x2913b4,
1867529,0x2a980e,
1867530,0x316b80,
1867531,0x29d0cb,


In [22]:
result_csv['emotion'] = target_result
result_csv

Unnamed: 0,id,emotion
2,0x28b412,anticipation
4,0x2de201,anticipation
9,0x218443,anticipation
30,0x2939d5,trust
33,0x26289a,anticipation
...,...,...
1867525,0x2913b4,joy
1867529,0x2a980e,disgust
1867530,0x316b80,anticipation
1867531,0x29d0cb,joy


In [23]:
result_csv.to_csv("kaggle_data/result.csv", index=False)