In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import sklearn as sklearn
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

# Import data

In [2]:
%cd /Users/Allen/Documents/NLP/dataset
import csv
if __name__ == "__main__":
    fin = open("Sentiment Analysis Dataset.csv", "r")
    fout = open("sad-clean.csv", "w")
    writer = csv.writer(fout)
    try:
        # we know the first 3 columns are consistent...
        for row in fin:
            parts = row.strip().split(",")
            out = parts[0:3] + [",".join(parts[3:])]
            writer.writerow(out)
    finally:
        fin.close()
        fout.close()

/Users/Allen/Documents/NLP/dataset


In [3]:
data = pd.read_csv("sad-clean.csv",encoding='latin-1')
data = data.rename(columns={"v1":"label", "v4":"text"})

In [28]:
data.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...
5,6,0,Sentiment140,or i just worry too much?
6,7,1,Sentiment140,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,8,0,Sentiment140,Sunny Again Work Tomorrow :-| ...
8,9,1,Sentiment140,handed in my uniform today . i miss you ...
9,10,1,Sentiment140,hmmmm.... i wonder how she my number @-)


By examining the top 10 tweets, we realize that many words are informal, which makes it more difficult to vectorize the words

In [4]:
data = data.drop(["ItemID", "SentimentSource"], axis=1)
data = data.rename(columns={"Sentiment":"label", "SentimentText":"text"})

from nltk import FreqDist
FreqDist(data["label"])             #result shows that it is a balanced dataset

FreqDist({0: 788442, 1: 790185})

# Examining most frequent words

In [5]:
from nltk.tokenize import RegexpTokenizer 
from nltk.corpus import stopwords
tokenizer = RegexpTokenizer(r'\w+') #tokenize words while removing punctuations
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() #to combine words of same lemma 

In [9]:
#Separate the training set to "pos" and "neg" 
pos = data.loc[data.label == 1]
neg = data.loc[data.label == 0]

In [10]:
posword = list(map(tokenizer.tokenize,pos.text))
negword = list(map(tokenizer.tokenize,neg.text))
#make flatten list
pos_list = [item for sublist in posword for item in sublist]
neg_list = [item for sublist in negword for item in sublist]

In [11]:
#change to lower case
pos_list=list(map(str.lower,pos_list))
neg_list=list(map(str.lower,neg_list))

#use Lemmatizer to combine words of same lemma
pos_list=list(map(lemmatizer.lemmatize,pos_list))
neg_list=list(map(lemmatizer.lemmatize,neg_list))

In [12]:
#create frequency list
posfreq=FreqDist(pos_list)
negfreq=FreqDist(neg_list)

#top 50 words
pos_topwords= posfreq.most_common(80)
neg_topwords= negfreq.most_common(80)

In [13]:
#remove stopwords
pos_topwords = list(filter(lambda x: not str(x[0]).isdigit() and 
                           str(x[0]) not in stopwords.words("english"), pos_topwords))
neg_topwords = list(filter(lambda x: not str(x[0]).isdigit() and 
                           str(x[0]) not in stopwords.words("english"), neg_topwords))

In [14]:
pos_topwords[:15]

[('good', 60865),
 ('day', 54488),
 ('love', 50255),
 ('http', 46264),
 ('quot', 45471),
 ('wa', 45392),
 ('u', 39735),
 ('like', 38297),
 ('get', 38031),
 ('lol', 36000),
 ('com', 34833),
 ('thanks', 34607),
 ('â', 34263),
 ('time', 33983),
 ('going', 30546)]

In [15]:
neg_topwords[:15]

[('wa', 59386),
 ('day', 50092),
 ('get', 47872),
 ('go', 47605),
 ('work', 45586),
 ('like', 41367),
 ('today', 38104),
 ('want', 33688),
 ('going', 33418),
 ('got', 33031),
 ('back', 32674),
 ('miss', 31551),
 ('time', 31319),
 ('really', 31172),
 ('im', 30988)]

# Training Word2vec model
Use tweet tokenizer from NLTK library to tokenize the words and build a w2v model using Gensim library

In [6]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

w2v_data = data["text"]
w2v_data = list(map(tokenizer.tokenize,w2v_data))

In [12]:
# import modules & set up logging
import gensim, logging
from gensim.models import word2vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = w2v_data

# train word2vec on the tweets, 100 dimensions
model = gensim.models.Word2Vec(sentences, min_count=1,size=100)

2018-04-15 14:18:59,612 : INFO : collecting all words and their counts
2018-04-15 14:18:59,614 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-15 14:18:59,660 : INFO : PROGRESS: at sentence #10000, processed 151136 words, keeping 20327 word types
2018-04-15 14:18:59,715 : INFO : PROGRESS: at sentence #20000, processed 328341 words, keeping 32976 word types
2018-04-15 14:18:59,779 : INFO : PROGRESS: at sentence #30000, processed 480968 words, keeping 39844 word types
2018-04-15 14:18:59,876 : INFO : PROGRESS: at sentence #40000, processed 633803 words, keeping 46029 word types
2018-04-15 14:18:59,986 : INFO : PROGRESS: at sentence #50000, processed 783960 words, keeping 51477 word types
2018-04-15 14:19:00,094 : INFO : PROGRESS: at sentence #60000, processed 934190 words, keeping 56609 word types
2018-04-15 14:19:00,174 : INFO : PROGRESS: at sentence #70000, processed 1085720 words, keeping 61429 word types
2018-04-15 14:19:00,255 : INFO : PROGRESS: at

# Evaluation of word2vec model
Google have released their testing set of about 20,000 syntactic and semantic test examples, following the “A is to B as C is to D” task: https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/gensim/test/test_data/questions-words.txt.

In [13]:
%cd /Users/Allen/Documents/NLP/dataset
model.accuracy('questions-words.txt')

2018-04-15 14:21:41,979 : INFO : precomputing L2-norms of word weight vectors


/Users/Allen/Documents/NLP/dataset


2018-04-15 14:21:43,536 : INFO : capital-common-countries: 8.1% (22/272)
2018-04-15 14:21:44,738 : INFO : capital-world: 5.1% (15/293)
2018-04-15 14:21:44,931 : INFO : currency: 0.0% (0/40)
2018-04-15 14:21:49,123 : INFO : city-in-state: 2.2% (25/1121)
2018-04-15 14:21:50,430 : INFO : family: 58.2% (199/342)
2018-04-15 14:21:52,674 : INFO : gram1-adjective-to-adverb: 3.5% (21/600)
2018-04-15 14:21:53,294 : INFO : gram2-opposite: 9.6% (15/156)
2018-04-15 14:21:57,804 : INFO : gram3-comparative: 44.9% (534/1190)
2018-04-15 14:22:00,239 : INFO : gram4-superlative: 17.8% (116/650)
2018-04-15 14:22:02,878 : INFO : gram5-present-participle: 53.8% (378/702)
2018-04-15 14:22:05,134 : INFO : gram6-nationality-adjective: 4.1% (24/585)
2018-04-15 14:22:09,588 : INFO : gram7-past-tense: 42.7% (508/1190)
2018-04-15 14:22:14,290 : INFO : gram8-plural: 20.6% (259/1260)
2018-04-15 14:22:15,916 : INFO : gram9-plural-verbs: 38.1% (160/420)
2018-04-15 14:22:15,917 : INFO : total: 25.8% (2276/8821)


[{'correct': [('ATHENS', 'GREECE', 'BANGKOK', 'THAILAND'),
   ('ATHENS', 'GREECE', 'BERLIN', 'GERMANY'),
   ('BEIJING', 'CHINA', 'TEHRAN', 'IRAN'),
   ('BERLIN', 'GERMANY', 'HELSINKI', 'FINLAND'),
   ('BERLIN', 'GERMANY', 'ROME', 'ITALY'),
   ('CAIRO', 'EGYPT', 'ROME', 'ITALY'),
   ('CAIRO', 'EGYPT', 'STOCKHOLM', 'SWEDEN'),
   ('CANBERRA', 'AUSTRALIA', 'OTTAWA', 'CANADA'),
   ('HELSINKI', 'FINLAND', 'TEHRAN', 'IRAN'),
   ('HELSINKI', 'FINLAND', 'BERLIN', 'GERMANY'),
   ('MOSCOW', 'RUSSIA', 'TEHRAN', 'IRAN'),
   ('OSLO', 'NORWAY', 'CAIRO', 'EGYPT'),
   ('OTTAWA', 'CANADA', 'ROME', 'ITALY'),
   ('OTTAWA', 'CANADA', 'BERLIN', 'GERMANY'),
   ('OTTAWA', 'CANADA', 'CANBERRA', 'AUSTRALIA'),
   ('ROME', 'ITALY', 'STOCKHOLM', 'SWEDEN'),
   ('ROME', 'ITALY', 'TEHRAN', 'IRAN'),
   ('ROME', 'ITALY', 'BERLIN', 'GERMANY'),
   ('STOCKHOLM', 'SWEDEN', 'CAIRO', 'EGYPT'),
   ('STOCKHOLM', 'SWEDEN', 'ROME', 'ITALY'),
   ('TOKYO', 'JAPAN', 'HELSINKI', 'FINLAND'),
   ('TOKYO', 'JAPAN', 'MADRID', 'SPAIN')],

The total accuracy is 25.8%, which shows that the accuracy is not very high for most of the questions. However we observe that it has obtained 58.2% accuracy for family-related words and decent scores for gram3-comparative(44.9%)/gram5-present-participle(53.8%)/gram7-past-tense (42.7%) and gram9-plural-verbs(38.1%). From these we can interpret that the tweets are more related to these topics. The below similarity tests also shows that our word2vec model has preserved the relationship between the words.

In [14]:
print(model.most_similar(positive=['woman', 'actress'], negative=['man'], topn=1))
print(model.most_similar(positive=['dad', 'family'], negative=['man'], topn=2))

[('actor', 0.8064113259315491)]
[('grandma', 0.721605658531189), ('mom', 0.7162371873855591)]


In [15]:
print("model.similarity('dad', 'mom'): %f" %model.similarity('dad', 'mom'))
print("model.similarity('happy','homework'): %f" %model.similarity('happy','homework'))

model.similarity('dad', 'mom'): 0.957098
model.similarity('happy','homework'): 0.069131


# w2v for classification 

As compared to the tweet tokenizer, the regular expression tokenizer results in better performance in classification task. Therefore I build a new w2v model with regular expression tokernized texts. 

In [16]:
w2v_data = data["text"]
tokenizer = RegexpTokenizer(r'\w+')
w2v_data = list(map(tokenizer.tokenize,w2v_data))

def tolower(sentence):
    sentence = list(map(str.lower, sentence))
    return sentence

w2v_data = list(map(tolower,w2v_data))

In [17]:
# import modules & set up logging
import gensim, logging
from gensim.models import word2vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = w2v_data

# train word2vec on the tweets, 100 dimensions
model = gensim.models.Word2Vec(sentences, min_count=1,size=100)
# create w2v dictionary
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

2018-04-15 14:52:52,411 : INFO : collecting all words and their counts
2018-04-15 14:52:52,417 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-15 14:52:52,492 : INFO : PROGRESS: at sentence #10000, processed 132890 words, keeping 17982 word types
2018-04-15 14:52:52,558 : INFO : PROGRESS: at sentence #20000, processed 279320 words, keeping 29291 word types
2018-04-15 14:52:52,639 : INFO : PROGRESS: at sentence #30000, processed 419620 words, keeping 39877 word types
2018-04-15 14:52:52,734 : INFO : PROGRESS: at sentence #40000, processed 561328 words, keeping 50131 word types
2018-04-15 14:52:52,823 : INFO : PROGRESS: at sentence #50000, processed 701022 words, keeping 59887 word types
2018-04-15 14:52:52,937 : INFO : PROGRESS: at sentence #60000, processed 841149 words, keeping 69750 word types
2018-04-15 14:52:53,092 : INFO : PROGRESS: at sentence #70000, processed 982608 words, keeping 79330 word types
2018-04-15 14:52:53,176 : INFO : PROGRESS: at 

# Create training set and testing set

In [31]:
X_data = data["text"]
y_data = data["label"]
from sklearn.cross_validation import StratifiedShuffleSplit
for train, test in StratifiedShuffleSplit(y_data, n_iter=5, test_size = 0.25):
        X_train, X_test = X_data[train], X_data[test]
        y_train, y_test = y_data[train], y_data[test]

# Other classification models

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = "english",lowercase=True)
O_X_train = vectorizer.fit_transform(X_train)
O_X_test = vectorizer.transform(X_test)

In [11]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,auc
from sklearn.cross_validation import cross_val_score
import scikitplot as skplt
import matplotlib.pyplot as plt

#define a function for performance metrics
def model_eval(model,X_train,y_train, X_test,y_test):
    print (model)
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    print ("prediciton Accuracy : %f" % accuracy_score(y_test, pred))
    print ("Confusion_matrix : ")
    print (confusion_matrix(y_test,pred))
    print ("classification report : ")
    print (classification_report(y_test, pred, labels=['0', '1']))
        
    if not str(model)[:3] == "SGD":
        pred_proba = model.predict_proba(X_test)
        pred_proba_c1 = pred_proba[:,1]
        print ("AUC Score : %f" % sklearn.metrics.roc_auc_score(y_test, pred_proba_c1))

In [12]:
#all models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier 

NB = MultinomialNB()
LRcv = LogisticRegressionCV(solver="liblinear",penalty = "l1",cv = 5,random_state = 42)  
RF = RandomForestClassifier(n_estimators =100, max_features = "sqrt",bootstrap = True, 
                            oob_score=True,verbose=0,class_weight = "balanced",
                            random_state=42,max_depth = 40)
GBC = GradientBoostingClassifier(learning_rate=0.05, n_estimators=260,max_depth=35, 
                                 min_samples_leaf =3, min_samples_split =150, 
                                 max_features=40, subsample=0.7, random_state=42)
SGD = SGDClassifier(loss="hinge", penalty="l2", alpha=0.0001,           
                    l1_ratio=0.15, fit_intercept=True, random_state = 42,
                    shuffle=True, learning_rate="optimal", 
                    n_iter= np.ceil(10**6 / O_X_train.shape[0])) 

In [84]:
model_eval(NB, O_X_train, y_train, O_X_test,y_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
prediciton Accuracy : 0.754309
Confusion_matrix : 
[[466857 124475]
 [166416 426223]]
classification report : 
             precision    recall  f1-score   support

          0       0.74      0.79      0.76    591332
          1       0.77      0.72      0.75    592639

avg / total       0.76      0.75      0.75   1183971

AUC Score : 0.838127


In [87]:
model_eval(LRcv, O_X_train, y_train, O_X_test,y_test)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=42,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)
prediciton Accuracy : 0.770769
Confusion_matrix : 
[[444898 146434]
 [124969 467670]]
classification report : 
             precision    recall  f1-score   support

          0       0.78      0.75      0.77    591332
          1       0.76      0.79      0.78    592639

avg / total       0.77      0.77      0.77   1183971

AUC Score : 0.850202


In [88]:
model_eval(RF, O_X_train, y_train, O_X_test,y_test)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=40, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=True, random_state=42,
            verbose=0, warm_start=False)
prediciton Accuracy : 0.739393
Confusion_matrix : 
[[413675 177657]
 [130894 461745]]
classification report : 
             precision    recall  f1-score   support

          0       0.76      0.70      0.73    591332
          1       0.72      0.78      0.75    592639

avg / total       0.74      0.74      0.74   1183971

AUC Score : 0.812129


In [89]:
model_eval(GBC, O_X_train, y_train, O_X_test,y_test)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=35,
              max_features=40, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_samples_split=150,
              min_weight_fraction_leaf=0.0, n_estimators=260,
              presort='auto', random_state=42, subsample=0.7, verbose=0,
              warm_start=False)
prediciton Accuracy : 0.705119
Confusion_matrix : 
[[391555 199777]
 [149353 443286]]
classification report : 
             precision    recall  f1-score   support

          0       0.72      0.66      0.69    591332
          1       0.69      0.75      0.72    592639

avg / total       0.71      0.71      0.70   1183971

AUC Score : 0.773016


In [90]:
model_eval(SGD, O_X_train, y_train, O_X_test,y_test)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=3.0,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)
prediciton Accuracy : 0.754985
Confusion_matrix : 
[[408647 182685]
 [107406 485233]]
classification report : 
             precision    recall  f1-score   support

          0       0.79      0.69      0.74    591332
          1       0.73      0.82      0.77    592639

avg / total       0.76      0.75      0.75   1183971



We observe that logistic regression obtain the best result as represented by the accuracy scores and other performance metrics. 


# classify using word2vec result

https://github.com/nadbordrozd/blog_stuff/blob/master/classification_w2v/benchmarking_python3.ipynb
implement an embedding vectorizer - a counterpart of CountVectorizer and TfidfVectorizer - that is given a word -> vector mapping and vectorizes texts by taking the mean of all the vectors corresponding to individual words.

In [9]:
w2v_train = X_train
w2v_train = list(map(tokenizer.tokenize,w2v_train))
w2v_train = list(map(tolower,w2v_train))

w2v_test = X_test
w2v_test = list(map(tokenizer.tokenize,w2v_test))
w2v_test = list(map(tolower,w2v_test))

In [15]:
from collections import Counter, defaultdict
# a tf-idf version
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=100
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [19]:
LRcv_classifier_w2v = LogisticRegressionCV(solver="liblinear",
                                           penalty = "l1",
                                           cv = 5,random_state = 42)  

In [22]:
from sklearn.pipeline import Pipeline
LRcv_w2v = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
                        ("LRcv", LRcv_classifier_w2v)])

In [136]:
model_eval(LRcv_w2v, w2v_train, y_train, w2v_test,y_test)

Pipeline(memory=None,
     steps=[('word2vec vectorizer', <__main__.TfidfEmbeddingVectorizer object at 0x2c241eb38>), ('LRcv', LogisticRegressionCV(Cs=10, class_weight='balanced', cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=42,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0))])
prediciton Accuracy : 0.753804
Confusion_matrix : 
[[445699 145633]
 [145856 446783]]
classification report : 
             precision    recall  f1-score   support

          0       0.75      0.75      0.75    591332
          1       0.75      0.75      0.75    592639

avg / total       0.75      0.75      0.75   1183971

AUC Score : 0.832537


# Use CNN from keras

In [44]:
import keras
from keras import utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

tokenizer = Tokenizer()
tokenizer.fit_on_texts(w2v_data)
sequences = tokenizer.texts_to_sequences(w2v_data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

newdata = pad_sequences(sequences)

Found 683263 unique tokens.


In [10]:
labels = keras.utils.to_categorical(np.asarray(y_data))
print('Shape of data tensor:', newdata.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (1578627, 303)
Shape of label tensor: (1578627, 2)


In [11]:
# split the data into a training set and a validation set
indices = np.arange(newdata.shape[0])
np.random.shuffle(indices)
newdata = newdata[indices]
labels = labels[indices]
nb_validation_samples = int(0.25 * newdata.shape[0])

x_train = newdata[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = newdata[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

## Create the embedding matrix
Create the embedding matrix by using the w2v trained previously. 

In [12]:
print('Found %s word vectors.' % len(w2v))

Found 683263 word vectors.


In [12]:
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = min(1000, newdata.shape[1])

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = w2v.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [24]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)

## Training a 1D CNN 

In [15]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(labels.shape[1], activation='softmax')(x)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [None]:
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

model.fit(x_train, y_train,batch_size=128,epochs=2,validation_data=(x_val, y_val))

Train on 1183971 samples, validate on 394656 samples
Epoch 1/2
 180608/1183971 [===>..........................] - ETA: 47:09 - loss: 0.5567 - acc: 0.7117  

The algorithm stop running probably due to the large data size. We try to use a smaller dataset by randoming choosing 10% of the data. 

In [17]:
small_samples = int(0.1* newdata.shape[0])
x_small = newdata[-small_samples:]
y_small = labels[-small_samples:]

nb_validation_samples = int(0.25 * x_small.shape[0])
x_train = x_small[:-nb_validation_samples]
y_train = y_small[:-nb_validation_samples]
x_val = x_small[-nb_validation_samples:]
y_val = y_small[-nb_validation_samples:]
print(x_train.shape)

(118397, 303)


In [48]:
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

model.fit(x_train, y_train,batch_size=128,epochs=2,validation_data=(x_val, y_val))

Train on 118397 samples, validate on 39465 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x13be81048>

## Use pretrained embedding 
from Glove: glove.twitter.27B.100 dimensions 

In [13]:
%cd /Users/Allen/Documents/NLP/package/glove.twitter.27B
encoding="utf-8"
glove_twitter_path = "glove.twitter.27B.100d.txt"

with open(glove_twitter_path, "rb") as lines:
    wvec = {line.split()[0].decode(encoding): np.array(line.split()[1:],dtype=np.float32)
               for line in lines}
import struct 
glove_twitter = {}
all_words = set(w for words in w2v_data for w in words)
with open(glove_twitter_path, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_twitter[word] = nums

/Users/Allen/Documents/NLP/package/glove.twitter.27B


Train on the same subset of data, manually run for 6 epochs 

In [14]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = glove_twitter.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
from keras.layers import Embedding        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(labels.shape[1], activation='softmax')(x)

In [21]:
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
model.fit(x_train, y_train, batch_size=128, epochs=2,validation_data=(x_val, y_val))

Train on 118397 samples, validate on 39465 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x28e84f6a0>

In [22]:
model.fit(x_train, y_train, batch_size=128, epochs=2,validation_data=(x_val, y_val))

Train on 118397 samples, validate on 39465 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x28ec98438>

In [23]:
model.fit(x_train, y_train, batch_size=128, epochs=2,validation_data=(x_val, y_val))

Train on 118397 samples, validate on 39465 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x28e84f1d0>

# Reference
- traditional classification methods by word embedding
http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
https://github.com/nadbordrozd/blog_stuff/blob/master/classification_w2v/benchmarking_python3.ipynb

- keras CNN
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py