In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
import matplotlib.pyplot as plt
from gensim.models import word2vec

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os

# Prepare data

In [2]:
# Load the pre-split training and test frames from disk.
# NOTE: pd.read_pickle unpickles arbitrary Python objects; only load files from a trusted source.
train_data = pd.read_pickle("kaggle_data/train_data.pkl")
test_data = pd.read_pickle("kaggle_data/test_data.pkl")

# Data preprocessing

## Selecting all data

In [3]:
# Class distribution of the training labels; the recorded output shows the classes are heavily imbalanced.
train_data['emotion'].value_counts()

joy             516017
anticipation    248935
trust           205478
sadness         193437
disgust         139101
fear             63999
surprise         48729
anger            39867
Name: emotion, dtype: int64

In [4]:
# Use the full training set. This binds a second name to the same DataFrame (no copy),
# so any column later assigned on sample_train_data also appears on train_data.
sample_train_data = train_data

## Equalize the number of samples per class (worked poorly)

In [14]:
# Downsample every emotion to the size of the rarest class so all classes are equally represented.
# The size is derived from the data (39867 in the recorded run) instead of being hard-coded,
# so the cell keeps working if the training set changes.
min_class_count = int(train_data['emotion'].value_counts().min())
sample_train_data = train_data.groupby("emotion").sample(n=min_class_count, random_state=1)

In [15]:
# DataFrame.sample returns a new frame, so the result must be rebound for the shuffle to take effect.
# random_state keeps the row order reproducible across runs.
sample_train_data = sample_train_data.sample(frac=1, random_state=1)
sample_train_data['emotion'].value_counts()

anger           39867
anticipation    39867
disgust         39867
fear            39867
joy             39867
sadness         39867
surprise        39867
trust           39867
Name: emotion, dtype: int64

## Text vectorization using TF-IDF and a stemmer (worked poorly)

In [None]:
# Shared stemmer instance reused for every sentence.
porter = PorterStemmer()

def stemSentence(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The text is split with NLTK's `word_tokenize`, each token is stemmed with
    the module-level `porter` stemmer, and the stems are joined with single
    spaces (no trailing space).

    Args:
        sentence: Raw text to stem.

    Returns:
        A string of space-separated stems; empty when no tokens are produced.
    """
    return " ".join(porter.stem(word) for word in word_tokenize(sentence))

In [None]:
# Stem every training text and store the result next to the raw text.
stem_datas = [stemSentence(raw_text) for raw_text in sample_train_data['text']]

sample_train_data['stem_text'] = stem_datas

In [None]:
# Apply the same stemming to the test texts so both sets share one vocabulary form.
stem_datas = [stemSentence(raw_text) for raw_text in test_data['text']]

test_data['stem_text'] = stem_datas

In [None]:
# Build TF-IDF features over the 500 most frequent stemmed tokens.
# TfidfVectorizer (not CountVectorizer) is what the variable name and this section describe.
TFIDF_vectorizer = TfidfVectorizer(max_features=500, tokenizer=word_tokenize)
# Fit on the training text only so the vocabulary and IDF weights come from the training set alone.
TFIDF_vectorizer.fit(sample_train_data['stem_text'])

In [None]:
# Vectorize the stemmed training and test text with the fitted vectorizer.
trained_tokenized = TFIDF_vectorizer.transform(sample_train_data['stem_text'])
# Labels must come from the same frame that produced the features so rows stay aligned;
# the previously referenced name `TFIDF_train_data` is not defined anywhere in the notebook.
trained_answer = sample_train_data['emotion']
target = TFIDF_vectorizer.transform(test_data['stem_text'])

In [None]:
# Hold out 25% of the vectorized rows for evaluation; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(trained_tokenized, trained_answer, test_size=0.25, random_state=42)

In [None]:
# Report the dimensions of each split, in the order train X, train Y, test X, test Y.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

## Text vectorization with word embeddings (worked best)

In [5]:
# Fit a Keras tokenizer on the training text; num_words=10000 caps the indices emitted by
# texts_to_sequences, which is why the Embedding layer later uses input_dim=10000.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(sample_train_data['text'])

# Convert train and test texts into lists of integer word indices using the same vocabulary.
trained_vectors = tokenizer.texts_to_sequences(sample_train_data['text'])
target = tokenizer.texts_to_sequences(test_data['text'])

# word_index holds every token seen during fitting, not just the capped ones
# (912662 in the recorded run), so vocab_size is much larger than num_words.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Sanity check: show one raw text next to its encoded form.
print(sample_train_data['text'].iloc[2])
print(trained_vectors[2])

912662
Now ISSA is stalking Tasha 😂😂😂 <LH>
[57, 614, 9, 6699, 2493, 892, 1]


In [6]:
# Fixed sequence length fed to the model.
maxlen = 100

# padding='post' appends zeros after the tokens (see the printed row below) so every row has maxlen entries.
trained_vectors = pad_sequences(trained_vectors, padding='post', maxlen=maxlen)
# Labels stay as strings here; they are one-hot encoded in a later cell.
trained_answer = sample_train_data['emotion']
target = pad_sequences(target, padding='post', maxlen=maxlen)

# Show the first padded row as a sanity check.
print(trained_vectors[0, :])

[  56   59  572 1096   17   13 1173  292   18 1302  132  220    1    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [7]:
# Keep 3% of the rows as a hold-out set (43667 rows in the recorded run); random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(trained_vectors, trained_answer, test_size=0.03, random_state=42)

In [8]:
# Report the dimensions of each split, in the order train X, train Y, test X, test Y.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

(1411896, 100)
(1411896,)
(43667, 100)
(43667,)


## With a pretrained embedding model (GloVe Twitter 27B, 100d) (not bad, but not the best)

In [5]:
# Load the pretrained GloVe vectors into a {token: vector} lookup.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse.
with open('./glove/glove.twitter.27B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # The first field is the token; the remaining fields are its vector components.
        word = values[0]
        coefs = np.array(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [6]:
# Prepare the tokenizer. No num_words cap is set here, so sequences may contain any index
# in word_index; vocab_size below is therefore used to size the embedding matrix.
t = Tokenizer()
t.fit_on_texts(sample_train_data['text'])

# Integer-encode the documents (train and test share the same vocabulary).
trained_vectors = t.texts_to_sequences(sample_train_data['text'])
target = t.texts_to_sequences(test_data['text'])
# +1 leaves room for index 0, which pad_sequences uses as the padding value.
vocab_size = len(t.word_index) + 1

# Pad every sequence with trailing zeros to a fixed length of maxlen.
maxlen = 100
trained_vectors = pad_sequences(trained_vectors, maxlen=maxlen, padding='post')
# Labels stay as strings here; they are one-hot encoded in a later cell.
trained_answer = sample_train_data['emotion']
target = pad_sequences(target, maxlen=maxlen, padding='post')

In [7]:
# Build the lookup matrix that maps each word index to its 100-dimensional GloVe vector.
# Row i holds the vector for the word whose tokenizer index is i.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words missing from the GloVe file keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [8]:
# Keep 3% of the rows as a hold-out set (43667 rows in the recorded run); random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(trained_vectors, trained_answer, test_size=0.03, random_state=42)

In [9]:
# Report the dimensions of each split, in the order train X, train Y, test X, test Y.
for split_part in (X_train, Y_train, X_test, Y_test):
    print(split_part.shape)

(1411896, 100)
(1411896,)
(43667, 100)
(43667,)


# One hot encoding

In [9]:
# Map emotion names to integer class ids; fit on the full label column so every class is known.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['emotion'])
print("classes:", label_encoder.classes_)

# Newer scikit-learn releases renamed `sparse` to `sparse_output`; accept either spelling.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

print('## Before conver\n')
print(Y_train[0:4])


def label_encode(le, oe, labels):
    """Convert string labels into a dense one-hot matrix.

    Args:
        le: Fitted LabelEncoder that maps label strings to integer ids.
        oe: OneHotEncoder producing dense output; it is (re)fitted here.
        labels: 1-D collection of label strings known to `le`.

    Returns:
        Array of shape (len(labels), len(le.classes_)) with a single 1 per row.
    """
    enc = le.transform(labels)
    # Fit on the complete id range rather than on `labels` itself, so the column
    # layout is the same for every call even when a subset lacks some class.
    oe.fit(np.arange(len(le.classes_)).reshape(-1, 1))
    return oe.transform(enc.reshape(-1, 1))


def label_decode(le, one_hot_label):
    """Convert one-hot rows or per-class scores back into label strings.

    Args:
        le: Fitted LabelEncoder used for the original encoding.
        one_hot_label: 2-D array with one column per class.

    Returns:
        Array of label strings, chosen as the highest-valued column in each row.
    """
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)


# NOTE: Y_train / Y_test are overwritten with their encoded form, so this cell cannot be
# re-run without first re-running the train/test split that produces the string labels.
Y_train = label_encode(label_encoder, onehot_encoder, Y_train)
Y_test = label_encode(label_encoder, onehot_encoder, Y_test)

print('\n\n## After convert')
print(Y_train[0:4])


classes: ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']
## Before conver

78570         joy
847092        joy
700362      anger
78141     sadness
Name: emotion, dtype: object


## After convert
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]


In [10]:
# I/O check: confirm the feature width and the number of target classes before building the model.
# Number of columns per sample (100 in the recorded run, i.e. maxlen).
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

# One output unit per emotion class known to the label encoder (8 in the recorded run).
output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  100
output_shape:  8


# Predict (deep learning is better)

## decision tree

In [None]:
# Decision tree baseline; random_state makes the fitted tree repeatable.
DT_model = DecisionTreeClassifier(random_state=1)

DT_model = DT_model.fit(X_train, Y_train)

# Predictions on the held-out split, scored in the following cells.
Y_test_pred = DT_model.predict(X_test)

In [None]:
## accuracy
# Y_test / Y_test_pred are the held-out split, so this is test accuracy, not training accuracy.
acc_test = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('testing accuracy: {}'.format(round(acc_test, 2)))

In [None]:
# Confusion matrix of the decision-tree predictions on the held-out split.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

### naive bayes

In [None]:
# Multinomial naive Bayes baseline with default hyperparameters.
NB_model = MultinomialNB()

In [None]:
# NOTE(review): in notebook order Y_train has already been replaced by a one-hot matrix in the
# "One hot encoding" section; MultinomialNB expects 1-D labels, so confirm which Y_train this cell should see.
NB_model.fit(X_train, Y_train)

# Predictions on the held-out split, scored in the following cells.
Y_test_pred = NB_model.predict(X_test)

In [None]:
# Y_test / Y_test_pred are the held-out split, so this is test accuracy, not training accuracy.
acc_test = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('testing accuracy: {}'.format(round(acc_test, 2)))

In [None]:
# Confusion matrix of the naive Bayes predictions on the held-out split.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

### Random forest

In [None]:
# Random forest baseline: 100 trees, leaves of at least 10 samples, fixed seed; n_jobs=-1 requests all available cores.
forest_model = RandomForestClassifier(n_estimators=100,n_jobs = -1,random_state =50, min_samples_leaf = 10)

In [None]:
# Fit on the training split and predict the held-out split for the scoring cells below.
forest_model = forest_model.fit(X_train, Y_train)
Y_test_pred = forest_model.predict(X_test)

In [None]:
# Y_test / Y_test_pred are the held-out split, so this is test accuracy, not training accuracy.
acc_test = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('testing accuracy: {}'.format(round(acc_test, 2)))

In [None]:
# Confusion matrix of the random-forest predictions on the held-out split.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

### Logistic regression

In [None]:
# Logistic regression baseline with default hyperparameters.
LR = LogisticRegression()

In [None]:
# NOTE(review): in notebook order Y_train has already been replaced by a one-hot matrix in the
# "One hot encoding" section; LogisticRegression expects 1-D labels, so confirm which Y_train this cell should see.
LR = LR.fit(X_train, Y_train)
Y_test_pred = LR.predict(X_test)
# Also predict the unlabeled test set; this is the only baseline that writes target_result.
target_result = LR.predict(target)

In [None]:
# Y_test / Y_test_pred are the held-out split, so this is test accuracy, not training accuracy.
acc_test = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print('testing accuracy: {}'.format(round(acc_test, 2)))

In [None]:
# Confusion matrix of the logistic-regression predictions on the held-out split.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred) 
print(cm)

## keras with embedding

In [11]:
# Open a TF1-compat session with device-placement logging to check which device TensorFlow uses
# (the recorded output lists GPU:0).
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
sess

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1



<tensorflow.python.client.session.Session at 0x262b53de7f0>

In [12]:
# Build the classifier: embedding -> three stacked bidirectional LSTMs -> softmax over the classes.
model = Sequential()
# Variant without pretrained vectors: the embedding is learned from scratch.
# input_dim=10000 matches the num_words cap given to the Tokenizer in the word-embedding section.
model.add(layers.Embedding(input_dim=10000, 
                           output_dim=50, 
                           input_length=maxlen))
# The first two recurrent layers return the full sequence so the next LSTM gets one vector per timestep;
# the last one returns only its final state for classification.
model.add(layers.Bidirectional(layers.LSTM(40, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(40, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(40)))
# One softmax unit per emotion class.
model.add(layers.Dense(8, activation='softmax'))

# The string literal below is never executed. It keeps the alternative architecture that uses
# the frozen GloVe embedding_matrix; swap it with the layers above to use that variant.
'''
# embedding layer with pretrained w2v model
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=100, 
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=False))
model.add(layers.Bidirectional(layers.LSTM(40, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(40, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(40)))
model.add(layers.Dense(8, activation='softmax'))
'''

# categorical_crossentropy pairs with the one-hot targets produced by label_encode.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           500000    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 80)          29120     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 80)          38720     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 80)               38720     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 8)                 648       
                                                        

In [13]:
# Train for up to 20 epochs, reporting loss/accuracy on the hold-out split after each epoch.
# NOTE(review): with batch_size=10 the recorded run shows 141190 steps per epoch and an ETA above
# one hour, and it was stopped with KeyboardInterrupt; a larger batch size would shorten each epoch.
history = model.fit(X_train, Y_train,
                    epochs=20,
                    validation_data=(X_test, Y_test),
                    batch_size=10)
# Final scores on the training data and on the hold-out split. The hold-out split is the same
# data passed as validation_data above, so it is not an untouched test set.
loss, accuracy = model.evaluate(X_train, Y_train)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, Y_test)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/20
   522/141190 [..............................] - ETA: 1:08:11 - loss: 1.7132 - accuracy: 0.3805

KeyboardInterrupt: 

In [15]:
# Predict per-class scores for the unlabeled test sequences.
target_result = model.predict(target)
# label_decode takes the highest-scoring column in each row and maps it back to its emotion name.
target_result = label_decode(label_encoder, target_result)
# Peek at the first few decoded labels.
target_result[:5]

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: [Errno 28] No space left on device
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: [Errno 28] No space left on device


array(['anticipation', 'anticipation', 'joy', 'joy', 'trust'],
      dtype=object)

# result to csv

In [16]:
# Empty frame with the two columns of the result file written at the end of the notebook.
result_csv = pd.DataFrame(columns=['id', 'emotion'])

In [17]:
# Display the test frame to check its columns before filling in the result frame.
test_data

Unnamed: 0,hashtags,tweet_id,text,identification
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test
9,"[materialism, money, possessions]",0x218443,When do you have enough ? When are you satisfi...,test
30,"[GodsPlan, GodsWork]",0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
33,[],0x26289a,"In these tough times, who do YOU turn to as yo...",test
...,...,...,...,...
1867525,[],0x2913b4,"""For this is the message that ye heard from th...",test
1867529,[],0x2a980e,"""There is a lad here, which hath five barley l...",test
1867530,"[mixedfeeling, butimTHATperson]",0x316b80,When you buy the last 2 tickets remaining for ...,test
1867531,[],0x29d0cb,I swear all this hard work gone pay off one da...,test


In [18]:
# Copy the tweet ids; as the displayed output shows, result_csv takes on test_data's index labels.
result_csv['id'] = test_data['tweet_id']
result_csv

Unnamed: 0,id,emotion
2,0x28b412,
4,0x2de201,
9,0x218443,
30,0x2939d5,
33,0x26289a,
...,...,...
1867525,0x2913b4,
1867529,0x2a980e,
1867530,0x316b80,
1867531,0x29d0cb,


In [19]:
# target_result is a plain array, so it is assigned by position; this relies on `target`
# having been built from test_data['text'] in the same row order as the ids above.
result_csv['emotion'] = target_result
result_csv

Unnamed: 0,id,emotion
2,0x28b412,anticipation
4,0x2de201,anticipation
9,0x218443,joy
30,0x2939d5,joy
33,0x26289a,trust
...,...,...
1867525,0x2913b4,anticipation
1867529,0x2a980e,anticipation
1867530,0x316b80,sadness
1867531,0x29d0cb,joy


In [24]:
# Write the result file; index=False drops the DataFrame index so only id and emotion are saved.
result_csv.to_csv("kaggle_data/result.csv", index=False)