In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training data into a DataFrame with pandas.
# NOTE(review): data_root is an absolute, machine-specific path; change it
# when running this notebook on another machine.
data_root = '/home/henry/nlp-beginner/data/'
train_data = pd.read_csv(data_root + 'train.tsv', sep='\t')
# test_data=pd.read_csv(data_root+'test.tsv',sep='\t')
pd.set_option('display.width', 900)

# Inputs are the raw phrases, targets are their sentiment labels.
x_all = train_data['Phrase']
y_all = train_data['Sentiment']

# Fixed seed so that the splits (and every score computed from them) are
# identical after each Restart Kernel -> Run All.
SPLIT_SEED = 42

# First hold out 20% as the test set, then take 25% of the remaining 80% as
# the validation set, i.e. roughly 60% / 20% / 20% train / val / test.
train_x, test_x, train_y, test_y = train_test_split(
    x_all, y_all, test_size=0.2, random_state=SPLIT_SEED
)
train_x, val_x, train_y, val_y = train_test_split(
    train_x, train_y, test_size=0.25, random_state=SPLIT_SEED
)

print(train_x.shape, val_x.shape, test_x.shape)

## Extract Features from Documents

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: the vocabulary is learned from the training phrases
# only and then reused, unchanged, to encode the test phrases.
count_vect = CountVectorizer()
x_train_counts, x_test_counts = (
    count_vect.fit_transform(train_x),
    count_vect.transform(test_x),
)

# Show the shape of the count matrix for each split.
print(*(matrix.shape for matrix in (x_train_counts, x_test_counts)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features with the vocabulary capped at 50,000 terms.
# The vectorizer gets its own name so that it stays available (e.g. to encode
# val_x or new text) after the n-gram cell rebinds `tfidf_transformer`.
tfidf_word_vectorizer = TfidfVectorizer(analyzer='word', max_features=50000)
# Keep the original name bound for any code that still refers to it.
tfidf_transformer = tfidf_word_vectorizer

# fit_transform learns the vocabulary/IDF weights and encodes train_x in one
# pass, instead of tokenizing the training corpus twice (fit, then transform).
x_train_tfidf_word = tfidf_word_vectorizer.fit_transform(train_x)
# The test phrases are encoded with the statistics learned from train_x only.
x_test_tfidf_word = tfidf_word_vectorizer.transform(test_x)
print(x_train_tfidf_word.shape, x_test_tfidf_word.shape)
# Peek at the first five rows of each sparse matrix.
print(x_train_tfidf_word[:5], x_test_tfidf_word[:5])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word bigrams and trigrams, vocabulary capped at 50,000 n-grams.
tfidf_ngram_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(2,3),max_features=50000)
# Keep the original name bound for any code that still refers to it.
tfidf_transformer = tfidf_ngram_vectorizer

# fit_transform learns the vocabulary/IDF weights and encodes train_x in one
# pass, instead of tokenizing the training corpus twice (fit, then transform).
x_train_tfidf_ngram = tfidf_ngram_vectorizer.fit_transform(train_x)
# The test phrases are encoded with the statistics learned from train_x only.
x_test_tfidf_ngram = tfidf_ngram_vectorizer.transform(test_x)

print(x_train_tfidf_ngram.shape, x_test_tfidf_ngram.shape)

## Concatenate Features to Form Inputs

In [None]:
from scipy.sparse import hstack

# Stack the three feature blocks side by side; column order is
# [raw counts | word-level TF-IDF | n-gram TF-IDF] for both splits.
# hstack returns COO by default; asking for CSR up front gives a matrix that
# supports row slicing and, presumably, spares the classifier a COO -> CSR
# conversion on every fit in the hyperparameter sweep.
train_features = hstack(
    [x_train_counts, x_train_tfidf_word, x_train_tfidf_ngram], format="csr"
)
test_features = hstack(
    [x_test_counts, x_test_tfidf_word, x_test_tfidf_ngram], format="csr"
)

# Both splits must share the same feature space for predict() to be valid.
assert train_features.shape[1] == test_features.shape[1], "train/test feature widths differ"

print(train_features.shape)

## Train an SGD Classifier and Explore Some Hyperparameters

In [None]:
from sklearn.linear_model import SGDClassifier

# Grid of iteration caps to try.
max_iters = [100, 200, 500, 1000]
# Despite the name, these values are passed to SGDClassifier as `alpha`
# (the regularization strength), not as a learning rate; the initial step
# size is eta0 below. The grid starts at 0.0, i.e. no regularization.
lr_rates = [i*1e-4 for i in range(10)]

# Fixed seed: SGD shuffles the data and early_stopping holds out a random
# validation fraction, so without it the scores differ between runs.
SGD_SEED = 42

# NOTE(review): accuracy is measured on the test split for every setting,
# while val_x / val_y from the first cell are never used. Choosing
# hyperparameters from these numbers would tune on the test set.
for lr_rate in lr_rates:
    for max_iter in max_iters:
        clf = SGDClassifier(
            alpha=lr_rate,
            # Logistic-regression loss. scikit-learn renamed "log" to
            # "log_loss" in 1.1 and removed the old spelling in 1.3
            # (on scikit-learn < 1.1 this must be "log").
            loss="log_loss",
            early_stopping=True,
            eta0=0.001,
            learning_rate='adaptive',
            max_iter=max_iter,
            random_state=SGD_SEED,
        )
        clf.fit(train_features, train_y)
        predict = clf.predict(test_features)
        # Fraction of test phrases whose predicted label matches the true one.
        print("alpha {0} max_iter {1}:{2}".format(lr_rate, max_iter,np.mean(predict == test_y)))