In [0]:
!wget -q https://l1nna.com/372/Assignment/A2-3/train.csv
!wget -q https://l1nna.com/372/Assignment/A2-3/test.csv

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier


# Read both CSVs from the working directory: the training frame (used below
# via its `review` and `rating` columns) and the test frame (used below via
# its `id` and `review` columns).
xy_train, x_test = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]


In [0]:
# Inputs are the review column, targets are the rating column.
x = xy_train.review
y = xy_train.rating

# Metrics computed for every candidate; 'f1' is also the one used for refit.
# NOTE(review): the plain 'f1' scorer presumably expects a two-class
# `rating` -- confirm against the data.
scoring = ['f1', 'accuracy']

# Single hold-out split: the first 80% of the rows are used for fitting and
# the remaining 20% for validation (rows are taken in file order, unshuffled).
split = int(len(x) * 0.8)
holdout_cv = [(np.arange(split), np.arange(split, len(x)))]

# Bag-of-n-grams counts -> (optional) tf-idf weighting -> class-balanced SVC.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(class_weight='balanced')),
])

# Hyperparameter grid, keyed as <step name>__<parameter name>.
parameters = dict(
    vect__max_features=[100, 500, 1000, 5000, 10000, 120000],
    vect__analyzer=['word'],
    # (1, 2): unigrams + bigrams; (1, 3): unigrams through trigrams
    vect__ngram_range=((1, 2), (1, 3)),
    tfidf__use_idf=(True, False),
)

grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring=scoring,
    refit='f1',
    cv=holdout_cv,
    return_train_score=True,  # train scores are needed for the plot below
    n_jobs=20,
    verbose=3,
)
grid_search.fit(x, y)

In [0]:
# Visualize one hyperparameter against train / validation performance.
#
# The grid search varies several hyperparameters at once, so cv_results_
# contains many candidates for each value of `selected_parameter`. Plotting
# all of them as a single line produces a zigzag (several y values per x).
# Instead, plot the one-dimensional slice of the grid that passes through the
# best candidate: every other hyperparameter is held at its best value.

from matplotlib import pyplot as plt

selected_parameter = 'vect__max_features'
scorer = 'f1'
color = 'b'
results = grid_search.cv_results_

# Get the regular numpy array from the MaskedArray
X_axis = np.array(results['param_' + selected_parameter].data, dtype=float)

# Best candidate for this scorer (first one with rank 1).
best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
best_score = results['mean_test_%s' % scorer][best_index]
best_params = results['params'][best_index]

# Candidates that agree with the best one on everything except the plotted
# hyperparameter.
on_slice = np.array([
    all(candidate.get(name) == value
        for name, value in best_params.items()
        if name != selected_parameter)
    for candidate in results['params']
])
curve_index = np.flatnonzero(on_slice)
# Order the slice by x so the line is drawn left to right; the same index is
# applied to both train and validation scores so they stay paired.
curve_index = curve_index[np.argsort(X_axis[curve_index], kind='stable')]

fig, ax = plt.subplots()
ax.set_title("GridSearchCV", fontsize=16)
ax.set_xlabel(selected_parameter)
ax.set_ylabel("Score")
ax.set_ylim(0.4, 1.1)

for sample, style in (('train', '--'), ('test', '-')):
    sample_score_mean = results['mean_%s_%s' % (sample, scorer)][curve_index]
    ax.plot(X_axis[curve_index], sample_score_mean, style, color=color,
            alpha=1 if sample == 'test' else 0.7,
            label="%s (%s)" % (scorer, sample if sample == 'train' else 'validation'))

# Plot a dotted vertical line at the best score for that scorer marked by x
ax.plot([X_axis[best_index], ] * 2, [0, best_score],
        linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

# Annotate the best score for that scorer
ax.annotate("%0.2f" % best_score,
            (X_axis[best_index], best_score + 0.005))

ax.legend(loc="best")
ax.grid(False)
plt.show()

In [0]:
# Generate the submission file: predict a rating for every test review with
# the fitted grid search and write it out next to the test ids.

y_predict = np.squeeze(grid_search.predict(x_test.review))

submission = pd.DataFrame({'id': x_test.id, 'rating': y_predict})
submission.to_csv('sample_submission.csv', index=False)