# Doc2vec + CNN

In [1]:
# Show INFO-level progress logs (e.g. gensim's model-loading messages) with a timestamp prefix.
# Note: basicConfig attaches a stream handler that writes to stderr by default, not stdout.
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

## Import Original Data

In [2]:
import numpy as np
import pandas as pd
import csv

# Options shared by both reads: tab-separated, no header row, and quoting
# disabled so quote characters inside tweets are kept as literal text.
common_read_opts = dict(header=None, sep='\t', quoting=csv.QUOTE_NONE)

# Labelled training data: two tab-separated columns (id, tweet).
train_df = pd.read_csv('origin_data/train_tweets.txt',
                       encoding="utf-8",
                       **common_read_opts)
train_df.columns = ['id', 'tweet']
print(train_df.shape)

# Unlabelled data to predict on: a single tweet column.
unLabel_df = pd.read_csv('origin_data/test_tweets_unlabeled.txt',
                         **common_read_opts)
unLabel_df.columns = ['tweet']
print(unLabel_df.shape)

(328932, 2)
(35437, 1)


## Pre-processing
To obtain the whole corpus, pre-process both the training and the test data:

1. Lower case
2. Removing Punctuation
3. Removal of Stop Words
4. Common word removal
5. Rare words removal
6. Spelling correction (currently skipped: the step is commented out in the code because it is too slow)
7. Tokenization
8. Lemmatization (not Stemming)

In [3]:
import copy
from nltk.corpus import stopwords
stop = stopwords.words('english')
from textblob import TextBlob
from textblob import Word

def preProcess(originData, num_top_words=10, num_tail_words=10):
    """Clean the ``tweet`` column of a DataFrame and return the result as a new DataFrame.

    Steps, in order: lower-casing, punctuation removal, stop-word removal,
    removal of the most frequent words, removal of the least frequent words,
    TextBlob tokenization and TextBlob lemmatization. Spelling correction is
    intentionally skipped because it is too slow on this corpus.

    The frequent/rare word lists are computed from the frame that is passed
    in, so two different frames (e.g. train and unlabelled) may have
    different words removed.

    Parameters
    ----------
    originData : pandas.DataFrame
        Frame with a string column named ``tweet``. It is not modified.
    num_top_words : int, default 10
        How many of the most frequent remaining words to drop.
    num_tail_words : int, default 10
        How many of the least frequent remaining words to drop.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``originData`` whose ``tweet`` column holds the
        cleaned, space-joined tokens.
    """
    def _drop_words(text, unwanted):
        # Keep only the whitespace-separated tokens that are not in `unwanted`.
        return " ".join(tok for tok in text.split() if tok not in unwanted)

    def _word_counts(series):
        # Frequency table of every whitespace-separated token, most frequent first.
        return pd.Series(' '.join(series).split()).value_counts()

    res = originData.copy()  # deep copy by default, so the caller's frame is untouched
    tweets = res['tweet']

    # Lower case (also collapses runs of whitespace to single spaces)
    tweets = tweets.apply(lambda text: " ".join(text.lower().split()))
    print("Lower case, Done")

    # Removing Punctuation. regex=True is explicit because newer pandas
    # versions treat the pattern as a literal string by default, which would
    # silently leave all punctuation in place.
    tweets = tweets.str.replace(r'[^\w\s]', '', regex=True)
    print("Removing Punctuation, Done")

    # Removal of Stop Words (a set gives O(1) membership tests)
    stop_words = set(stop)
    tweets = tweets.apply(lambda text: _drop_words(text, stop_words))
    print("Removal of Stop Words, Done")

    # Common word removal
    common_words = set(_word_counts(tweets).head(num_top_words).index)
    tweets = tweets.apply(lambda text: _drop_words(text, common_words))
    print("Common word removal, Done")

    # Rare words removal
    rare_words = set(_word_counts(tweets).tail(num_tail_words).index)
    tweets = tweets.apply(lambda text: _drop_words(text, rare_words))
    print("Rare word removal, Done")

    # Spelling correction is disabled (too slow):
    #     tweets = tweets.apply(lambda text: str(TextBlob(text).correct()))

    # Tokenization
    tweets = tweets.apply(lambda text: " ".join(TextBlob(text).words))
    print("Tokenization, Done")

    # Lemmatization
    tweets = tweets.apply(
        lambda text: " ".join(Word(token).lemmatize() for token in text.split())
    )
    print("Lemmatization, Done")

    res['tweet'] = tweets
    return res
 

In [4]:
# Clean the labelled training tweets; preProcess works on a copy, so train_df keeps the raw text.
pre_train_df = preProcess(train_df)

Lower case, Done
Removing Punctuation, Done
Removal of Stop Words, Done
Common word removal, Done
Rare word removal, Done
Tokenization, Done
Lemmatization, Done


In [5]:
pre_train_df.shape

(328932, 2)

In [6]:
# Clean the unlabelled tweets with the same pipeline; word-frequency filters are computed on this frame alone.
pre_unLabel_df = preProcess(unLabel_df)

Lower case, Done
Removing Punctuation, Done
Removal of Stop Words, Done
Common word removal, Done
Rare word removal, Done
Tokenization, Done
Lemmatization, Done


In [7]:
pre_unLabel_df.shape

(35437, 1)

In [8]:
# import os
# path = os.getcwd() + '/data'
# if not os.path.isdir(path):
#     os.mkdir(path)
    
# # delete all
# files = os.listdir(path)
# for file in files:
#     if os.path.exists(file):
#         os.remove(file)

# for i in range(pre_train_df.id.size):
#     with open(path + '/' + str(pre_train_df.id[i]) + '.txt', mode='a+') as f:
#         f.write(pre_train_df.tweet[i] + '\n')

### Sort the training samples by id

In [118]:
# Order the cleaned training rows by the id column so rows with the same id are adjacent.
sort_pre_train_df = pre_train_df.sort_values("id")
sort_pre_train_df.head()

Unnamed: 0,id,tweet
328529,2,mike look little bit harrey carrey glass httpt...
328537,2,actually almost entire elton john catalogdigit...
328536,2,really want elton john album cant justify purc...
328535,2,cube buddy elton john morningnothing music kee...
328534,2,announcing winner noon giving 10 follower craf...


## Prepare a corpus as training set for doc2vec
To train the doc2vec model, we need to create a training set consisting of lines of tweets: one tweet per line, one line per tweet.

Further research is needed to find out whether preprocessing the tweets here is necessary.

In [9]:
# with open("ebd_train.txt", "w") as f:
#     for tweet in train_df.tweet:
#         f.write(str(tweet) + '\n')
#     for tweet in unLabel_df.tweet:
#         f.write(str(tweet) + '\n')

Train the doc2vec model with the embedding training set and save the model. 

Since it takes a long time to train, it is better to load the model I have trained called **model.bin**.

If you want to train it yourself, please run it in terminal using `python3 train_doc2vec_model.py`.

## Infer vectors

In [119]:
# Load the pre-trained Doc2Vec model from disk (file must sit next to the notebook).
modelPath = "model300-100.bin"
# TaggedDocument is imported but not used in the cells visible here.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
model = Doc2Vec.load(modelPath)
# NOTE(review): presumably drops training-only state to save memory while keeping
# what infer_vector needs; this is a gensim 3.x API - confirm it exists in the installed version.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

2019-09-08 15:50:45,076 loading Doc2Vec object from model300-100.bin
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-09-08 15:50:45,620 loading vocabulary recursively from model300-100.bin.vocabulary.* with mmap=None
2019-09-08 15:50:45,621 loading trainables recursively from model300-100.bin.trainables.* with mmap=None
2019-09-08 15:50:45,621 loading syn1neg from model300-100.bin.trainables.syn1neg.npy with mmap=None
2019-09-08 15:50:45,636 loading wv recursively from model300-100.bin.wv.* with mmap=None
2019-09-08 15:50:45,637 loading vectors from model300-100.bin.wv.vectors.npy with mmap=None
2019-09-08 15:50:45,673 loading docvecs recursively from model300-100.bin.docvecs.* with mmap=None
2019-09-08 15:50:45,674 loading vectors_docs from model300-100.bin.docvecs.vectors_docs.npy with mmap=None
2019-09-08 15:50:45,885 loaded model300-100.bin


In [120]:
# Inference hyper-parameters are left at the library defaults. An earlier
# experiment passed alpha=0.01 and steps=100 to infer_vector and iterated
# over pre_train_df.tweet instead of the sorted frame.

# One inferred document vector per cleaned training tweet, in sorted-by-id order.
train_vecs = [model.infer_vector(tweet.split()) for tweet in sort_pre_train_df.tweet]

In [121]:
# Infer one vector per cleaned *unlabelled* tweet. The previous version looped
# over sort_pre_train_df.tweet, so the submission was predicted from training
# tweets. Row order is preserved so vector i belongs to submission row i + 1.
unLabel_vecs = [model.infer_vector(tweet.split()) for tweet in pre_unLabel_df.tweet]

## Write the vectors to file so they do not need to be re-computed every time

## Try PCA
Not good

In [97]:
from sklearn.decomposition import PCA
# A float n_components asks PCA to keep as many components as needed to
# explain that fraction (here 99%) of the variance.
pca = PCA(0.99)
# NOTE(review): fitted on every training vector, including the rows that are
# later held out as x_test by train_test_split.
pca.fit(train_vecs)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [98]:
pca.n_components_

295

## Classification

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Directory prefix (with trailing slash) under which fitted classifiers are saved.
model_save_path = "model_save/"



### SVM
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

Training: $O(m^2 N^2)$, predicting: $O(m^2 N)$

In [122]:
# Work on the first 2000 sorted samples only, holding out 20% of them for
# evaluation. The fixed random_state makes the shuffle reproducible.
features_subset = train_vecs[:2000]
labels_subset = sort_pre_train_df.id[:2000]
x_train, x_test, y_train, y_test = train_test_split(
    features_subset,
    labels_subset,
    test_size=0.2,
    shuffle=True,
    random_state=24,
)

In [123]:
# Project both splits with the PCA that was fitted earlier in the notebook.
x_train_pca, x_test_pca = (pca.transform(split) for split in (x_train, x_test))

In [124]:
from time import time
from sklearn.svm import SVC

# Train an SVM classification model and report how long fitting takes.
#
# Hyper-parameter notes:
# - C (left at its default here): larger values fit the training data more
#   closely but may overfit.
# - kernel='rbf' (the default) is the Gaussian kernel.
# - A smaller gamma gives a smoother decision boundary; a larger gamma gives a
#   more fragmented boundary that fits the training data more tightly and may
#   overfit.
print("Fitting the classifier to the training set...")
fit_started = time()
svm_clf = SVC(kernel='rbf', gamma=1000).fit(x_train_pca, y_train)
fit_elapsed = time() - fit_started

print("done in %0.3fs" % fit_elapsed)

Fitting the classifier to the training set...
done in 2.077s


In [125]:
# Predict labels for both splits with the fitted SVM and report the time taken.
print("Predicting...")
predict_started = time()
predict_train, predict_test = (
    svm_clf.predict(split) for split in (x_train_pca, x_test_pca)
)
predict_elapsed = time() - predict_started
print("done in %0.3fs" % predict_elapsed)

Predicting...
done in 1.183s


In [126]:
# Micro-averaged F1 of the SVM on the fitted split and on the held-out split.
train_f1 = f1_score(y_train, predict_train, average='micro')
test_f1 = f1_score(y_test, predict_test, average='micro')
print('f1 for train = ', train_f1)
print('f1 for test = ', test_f1)

f1 for train =  0.965
f1 for test =  0.0675


### Save the model

In [104]:
import os

# Persist the fitted SVM. The target directory is created first because
# dumping into a directory that does not exist raises an error.
save_path_name = model_save_path + "svm_" + "train_model.m"
save_dir = os.path.dirname(save_path_name)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
joblib.dump(svm_clf, save_path_name)
# To reload later: clf = joblib.load(save_path_name)

['model_save/svm_train_model.m']

### Decision Tree

In [127]:
from time import time
from sklearn import tree

# Fit an entropy-criterion decision tree on the PCA-projected training split
# and report how long fitting takes.
print("Fitting the classifier to the training set...")
fit_started = time()
dt_clf = tree.DecisionTreeClassifier(criterion='entropy').fit(x_train_pca, y_train)
fit_elapsed = time() - fit_started
print("done in %0.3fs" % fit_elapsed)

Fitting the classifier to the training set...
done in 2.498s


In [128]:
# Predict labels for both splits with the fitted decision tree and report the time taken.
print("Predicting...")
predict_started = time()
predict_train, predict_test = (
    dt_clf.predict(split) for split in (x_train_pca, x_test_pca)
)
predict_elapsed = time() - predict_started
print("done in %0.3fs" % predict_elapsed)

Predicting...
done in 0.004s


In [129]:
# Micro-averaged F1 of the decision tree on the fitted split and on the held-out split.
train_f1 = f1_score(y_train, predict_train, average='micro')
test_f1 = f1_score(y_test, predict_test, average='micro')
print('f1 for train = ', train_f1)
print('f1 for test = ', test_f1)

f1 for train =  0.998125
f1 for test =  0.0625


The CNN classifier based on docvecs:
1. First, read (docvec,author) pairs from training and test set.
2. Second, build a CNN with keras.
3. Then, train the CNN with training set.
4. Finally, test the CNN with test set.

In [63]:
pre_train_df.tweet[1].split(" ")

['going', 'watch', 'grey', 'big', 'screen', 'thursday', 'indulgence']

# Submission Creation
Predict Submission:
https://www.kaggle.com/t/cb6ceb3bf96a48819d6b4f0994fb58db

In [32]:
# Project the unlabelled vectors with the fitted PCA, then predict one label per row with the SVM.
predict_answer = svm_clf.predict(pca.transform(unLabel_vecs))

In [33]:
len(predict_answer)

35437

In [34]:
# Write the predictions as a two-column CSV: a 1-based row Id and the predicted label.
with open("submission.txt", "w") as f:
    f.write('Id,Predicted\n')
    for index, label in enumerate(predict_answer, start=1):
        f.write(str(index) + ',' + str(label) + '\n')