In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

data_root = '/home/henry/nlp-beginner/data/'
train_data=pd.read_csv(data_root+'train.tsv',sep='\t')
# test_data=pd.read_csv(data_root+'test.tsv',sep='\t')
pd.set_option('display.width', 900)
x_all = train_data['Phrase']
y_all = train_data['Sentiment']
# test_x = test_data['Phrase']
# test_y = test_data['Sentiment']

train_x, test_x, train_y, test_y = train_test_split(x_all, y_all, test_size=0.2)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.25)

print(train_x.shape, val_x.shape, test_x.shape)

(93636,) (31212,) (31212,)


## Extract Features from document

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_x)
x_test_counts = count_vect.transform(test_x)

print(x_train_counts.shape,x_test_counts.shape)
print(x_train_counts[:5],x_test_counts[:5]) # stored as sparse matrix

(93636, 15202) (31212, 15202)
  (0, 3912)	1
  (0, 9215)	1
  (0, 8789)	1
  (0, 8707)	1
  (0, 13462)	1
  (0, 13471)	1
  (0, 11807)	1
  (1, 13471)	2
  (1, 3712)	1
  (1, 9186)	1
  (1, 11236)	1
  (1, 13647)	1
  (1, 7795)	1
  (1, 9208)	1
  (1, 8222)	1
  (1, 2644)	1
  (1, 1872)	1
  (2, 4644)	1
  (2, 14594)	1
  (2, 7333)	1
  (3, 13471)	1
  (3, 4654)	1
  (3, 501)	1
  (3, 7652)	1
  (3, 6034)	1
  (4, 13471)	1
  (4, 13647)	1
  (4, 9208)	2
  (4, 1086)	1
  (4, 1777)	1
  (4, 6194)	1
  (4, 13272)	1
  (4, 603)	1
  (4, 12267)	1
  (4, 12218)	1
  (4, 6371)	1
  (4, 622)	1
  (4, 9272)	1
  (4, 7212)	1
  (4, 14358)	1
  (4, 5310)	1
  (4, 504)	1
  (4, 14398)	1
  (4, 14130)	1   (0, 9208)	1
  (0, 11271)	1
  (0, 14542)	1
  (1, 2790)	1
  (2, 8209)	1
  (2, 10016)	1
  (2, 13684)	1
  (3, 1253)	1
  (3, 5803)	1
  (3, 6175)	1
  (4, 4988)	1
  (4, 7730)	1


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_transformer = TfidfVectorizer(analyzer='word', max_features=50000)
tfidf_transformer.fit(train_x)

x_train_tfidf_word = tfidf_transformer.transform(train_x)
x_test_tfidf_word = tfidf_transformer.transform(test_x)
print(x_train_tfidf_word.shape, x_test_tfidf_word.shape)
print(x_train_tfidf_word[:5], x_test_tfidf_word[:5])

(93636, 15202) (31212, 15202)
  (0, 13471)	0.15744717193454452
  (0, 13462)	0.3214635921744103
  (0, 11807)	0.47502903085533665
  (0, 9215)	0.5291820062818362
  (0, 8789)	0.3621621708326473
  (0, 8707)	0.31493430684174945
  (0, 3912)	0.3685652118963482
  (1, 13647)	0.13810284551177524
  (1, 13471)	0.21354343692677416
  (1, 11236)	0.4020670464818833
  (1, 9208)	0.12262172931488188
  (1, 9186)	0.35111665318267493
  (1, 8222)	0.4105703108614773
  (1, 7795)	0.3258121887911673
  (1, 3712)	0.31732943589616697
  (1, 2644)	0.46880394844367196
  (1, 1872)	0.20038270037524197
  (2, 14594)	0.5535242806793065
  (2, 7333)	0.6639882823402067
  (2, 4644)	0.5027230167929039
  (3, 13471)	0.13137606960749482
  (3, 7652)	0.5702814395241158
  (3, 6034)	0.4611397609398899
  (3, 4654)	0.44735078856811167
  (3, 501)	0.49471891101571863
  (4, 14398)	0.2104752989856234
  (4, 14358)	0.18608171954649474
  (4, 14130)	0.2881469672641313
  (4, 13647)	0.10978223181707916
  (4, 13471)	0.08487614794914587
  (4, 13272)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_transformer = TfidfVectorizer(analyzer='word', ngram_range=(2,3),max_features=50000)
tfidf_transformer.fit(train_x)
x_train_tfidf_ngram = tfidf_transformer.transform(train_x)
x_test_tfidf_ngram = tfidf_transformer.transform(test_x)

print(x_train_tfidf_ngram.shape, x_test_tfidf_ngram.shape)

(93636, 80000) (31212, 80000)


In [13]:
from scipy.sparse import hstack

train_features = hstack([x_train_counts, x_train_tfidf_word, x_train_tfidf_ngram])
test_features = hstack([x_test_counts, x_test_tfidf_word, x_test_tfidf_ngram])

print(train_features.shape)

(93636, 110404)


In [14]:
from sklearn.linear_model import SGDClassifier

max_iters = [100, 200, 500 ,1000]
lr_rates = [i*1e-4 for i in range(10)]

for lr_rate in lr_rates:
    for max_iter in max_iters:
        clf = SGDClassifier(alpha=lr_rate,loss="log",early_stopping=True,eta0=0.001,learning_rate='adaptive',max_iter=max_iter)
        clf.fit(train_features, train_y)
        predict = clf.predict(test_features)
        print("alpha {0} max_iter {1}:{2}".format(lr_rate, max_iter,np.mean(predict == test_y)))

alpha 0.0 max_iter 100:0.5562283737024222
alpha 0.0 max_iter 200:0.5585992566961425
alpha 0.0 max_iter 500:0.5560361399461745
alpha 0.0 max_iter 1000:0.5553633217993079
alpha 0.0001 max_iter 100:0.5580866333461489
alpha 0.0001 max_iter 200:0.5543060361399462
alpha 0.0001 max_iter 500:0.5561002178649237
alpha 0.0001 max_iter 1000:0.5560681789055492
alpha 0.0002 max_iter 100:0.556164295783673
alpha 0.0002 max_iter 200:0.553280789439959
alpha 0.0002 max_iter 500:0.555683711393054
alpha 0.0002 max_iter 1000:0.5553633217993079
alpha 0.00030000000000000003 max_iter 100:0.5530565167243368
alpha 0.00030000000000000003 max_iter 200:0.5523516596180956
alpha 0.00030000000000000003 max_iter 500:0.5547545815711906
alpha 0.00030000000000000003 max_iter 1000:0.5534730231962066
alpha 0.0004 max_iter 100:0.5517108804306036
alpha 0.0004 max_iter 200:0.5527681660899654
alpha 0.0004 max_iter 500:0.550237088299372
alpha 0.0004 max_iter 1000:0.5515827245931052
alpha 0.0005 max_iter 100:0.5506856337306164
al