In [9]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import re

# Reading the training data

In [10]:
# Load the training CSV into a DataFrame and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,author,content,sentiment
0,26898,richardepryor,"@treasaint salad stuff, some chillis, whatever...",happiness
1,27635,reese,"@sunnyjamiel sunny, I'm a workin' on it. It's ...",neutral
2,3036,mutedriposte,@jolynnchew so early??,surprise
3,5604,sakizzie_1102,"So now, I have conjunctivitis in my left eye. ...",sadness
4,36111,poptrash,"Out and about in Deal, Kent. More sunshine req...",love


### Encoding target

In [11]:
# Pull the model inputs (`content` column) and the targets (`sentiment`
# column) out of the frame as NumPy arrays, then display the raw targets.
X_train, y_train = (train[column].values for column in ('content', 'sentiment'))
y_train

array(['happiness', 'neutral', 'surprise', ..., 'fun', 'hate', 'love'],
      dtype=object)

In [12]:
# Encode the string sentiment labels as integer class ids.
#
# The encoder is fitted on the untouched `train['sentiment']` column instead
# of on `y_train` itself. Fitting on `y_train` and then overwriting it made
# this cell non-idempotent: a second execution would refit the encoder on the
# already-encoded integers, and `label_encoder.inverse_transform` would then
# hand back numbers instead of sentiment names.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['sentiment'].values)
y_train

array([ 5,  8, 11, ...,  4,  6,  7])

In [13]:
def clean_text(X):
    """Remove ``@mention`` tokens from every document in ``X``.

    A mention is an ``@`` followed by any run of non-whitespace characters;
    it is replaced by the empty string, so surrounding whitespace is kept.

    Parameters
    ----------
    X : sequence of str
        Documents to clean. Must support ``.copy()`` and integer item
        assignment (for example a NumPy array or a list).

    Returns
    -------
    Same container type as ``X``
        A copy of ``X`` with mentions removed; ``X`` itself is not modified.
        Missing entries (``None`` / ``NaN``, as produced by ``pd.read_csv``
        for empty cells) become ``''`` instead of raising ``TypeError``.

    Raises
    ------
    TypeError
        If an entry is neither a string nor a missing value.
    """
    mention = re.compile(r"@[^\s]*")
    result = X.copy()
    for i, text in enumerate(result):
        if not isinstance(text, str) and pd.isna(text):
            # An empty document is valid input for the vectorizer, NaN is not.
            result[i] = ''
        else:
            result[i] = mention.sub('', text)
    return result

In [14]:
# Show one document before and after mention removal.
#
# Both the "before" print and the cleaning read from the untouched
# `train['content']` column rather than from `X_train`, so re-running this
# cell still prints the original text first instead of the already-cleaned
# value. `clean_text` works on a copy, so `train` is left unchanged.
print(train['content'].values[0])
X_train = clean_text(train['content'].values)
print(X_train[0])

@treasaint salad stuff, some chillis, whatever my horti mate gives me really, think it will be fun to do
 salad stuff, some chillis, whatever my horti mate gives me really, think it will be fun to do


### Testing classifiers

In [50]:
# Three-step text classifier: token counts -> TfidfTransformer re-weighting
# -> logistic regression (seeded for reproducibility).
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(random_state=0)),
    ],
)

In [55]:
# Fixed hyper-parameters applied to the pipeline before the grid search.
# Keys follow scikit-learn's `<step name>__<parameter>` convention.
best_params = dict([
    ('tfidf__norm', 'l2'),
    ('tfidf__smooth_idf', True),
    ('tfidf__sublinear_tf', False),
    ('tfidf__use_idf', False),
    ('vect__stop_words', None),
    ('clf__C', 2.0),
    ('clf__penalty', 'l2'),
    ('clf__solver', 'lbfgs'),
])
# `set_params` returns the pipeline, so the cell displays the configured steps.
pipeline.set_params(**best_params)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...  penalty='l2', random_state=0, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False))])

In [56]:
# Search grid for this run: only the logistic-regression `C` is varied,
# over 1.0, 1.5, ..., 4.5. Other candidate grids are kept here for reference
# but are disabled:
#     'vect__analyzer': ['word', 'char', 'char_wb'],
#     'vect__stop_words': [None, 'english'],
#     'tfidf__norm': ['l1', 'l2'],
#     'tfidf__use_idf': [True, False],
#     'tfidf__smooth_idf': [True, False],
#     'tfidf__sublinear_tf': [True, False],
#     'clf__penalty': ['l1', 'l2'],
#     'clf__class_weight': [None, 'balanced'],
#     'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'clf__multi_class': ['ovr', 'multinomial'],
params = {'clf__C': np.arange(1.0, 5.0, 0.5)}

In [57]:
# Exhaustive search over `params`, scored by 5-fold cross-validated accuracy.
# n_jobs=-2 runs the fits in parallel on all CPU cores but one.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-2,
    verbose=2,
    return_train_score=False,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] clf__C=1.0 ......................................................
[CV] clf__C=1.0 ......................................................
[CV] clf__C=1.0 ......................................................
[CV] clf__C=1.0 ......................................................
[CV] clf__C=1.0 ......................................................
[CV] clf__C=1.5 ......................................................
[CV] clf__C=1.5 ......................................................
[CV] ....................................... clf__C=1.0, total=  13.2s
[CV] clf__C=1.5 ......................................................
[CV] ....................................... clf__C=1.0, total=  13.9s
[CV] clf__C=1.5 ......................................................
[CV] ....................................... clf__C=1.0, total=  14.0s
[CV] ....................................... clf__C=1.0, total=  14.0s
[CV] clf__C=1.5 .

[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:  1.1min


[CV] ....................................... clf__C=3.5, total=  18.6s
[CV] clf__C=4.0 ......................................................
[CV] ....................................... clf__C=3.5, total=  16.7s
[CV] clf__C=4.5 ......................................................
[CV] ....................................... clf__C=3.5, total=  16.6s
[CV] clf__C=4.5 ......................................................
[CV] ....................................... clf__C=4.0, total=  16.7s
[CV] clf__C=4.5 ......................................................
[CV] ....................................... clf__C=4.0, total=  17.0s
[CV] clf__C=4.5 ......................................................
[CV] ....................................... clf__C=4.0, total=  17.4s
[CV] clf__C=4.5 ......................................................
[CV] ....................................... clf__C=4.0, total=  17.3s
[CV] ....................................... clf__C=4.0, total=  16.6s
[CV] .

[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:  1.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...  penalty='l2', random_state=0, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'clf__C': array([1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=2)

In [58]:
# Report the best mean CV score and the parameters that achieved it (one per
# line), then display the corresponding pipeline.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')
grid_search.best_estimator_

0.3549
{'clf__C': 2.0}


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...  penalty='l2', random_state=0, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False))])

In [59]:
# Tabulate the per-candidate fit times and cross-validation scores.
pd.DataFrame(data=grid_search.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_clf__C,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
0,13.690486,0.174507,0.3523,1.0,{'clf__C': 1.0},5,0.357083,0.362227,0.346724,0.350117,0.345339,0.384889,0.037866,0.006414
1,14.456901,0.165663,0.353867,1.5,{'clf__C': 1.5},3,0.36008,0.364227,0.347891,0.348449,0.348674,0.460494,0.003761,0.006899
2,15.120656,0.171945,0.3549,2.0,{'clf__C': 2.0},1,0.360246,0.365894,0.349392,0.348283,0.350675,0.516921,0.043166,0.006949
3,15.831372,0.163036,0.3542,2.5,{'clf__C': 2.5},2,0.362078,0.365061,0.350225,0.344948,0.348674,0.309636,0.005008,0.007899
4,17.395006,0.193604,0.353633,3.0,{'clf__C': 3.0},4,0.360579,0.364561,0.351225,0.342447,0.349341,0.42683,0.02949,0.007962
5,17.443589,0.179183,0.352233,3.5,{'clf__C': 3.5},6,0.358249,0.36256,0.351225,0.340614,0.348508,0.806218,0.019798,0.007651
6,16.812093,0.186742,0.350867,4.0,{'clf__C': 4.0},7,0.355752,0.361727,0.349058,0.340447,0.34734,0.293907,0.039737,0.007295
7,14.680712,0.100489,0.348833,4.5,{'clf__C': 4.5},8,0.355086,0.358393,0.347891,0.336612,0.346173,0.503196,0.033044,0.007587


# Final Classifier

In [None]:
# Final model: the same three-step pipeline as in the search, configured with
# fixed hyper-parameters and trained on the complete training set.
estimator = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(random_state=0)),
    ],
)
# NOTE(review): `clf__C` here (2.154...) is not one of the values in the
# `clf__C` grid searched above, which selected 2.0 - confirm which is intended.
best_params = dict([
    ('tfidf__norm', 'l2'),
    ('tfidf__smooth_idf', True),
    ('tfidf__sublinear_tf', False),
    ('tfidf__use_idf', False),
    ('vect__stop_words', None),
    ('clf__C', 2.154434690031882),
    ('clf__penalty', 'l2'),
    ('clf__solver', 'lbfgs'),
])
# Both calls return the pipeline itself, so chaining them configures, fits
# and displays the fitted estimator.
estimator.set_params(**best_params).fit(X_train, y_train)

# Predicting on the test data

In [None]:
# Load the test CSV and take its `content` column as the model input array.
test = pd.read_csv(filepath_or_buffer='./data/test.csv')
X_test = test.loc[:, 'content'].values

In [None]:
# Apply the same @mention removal that was applied to the training text.
X_test = clean_text(X=X_test)

In [None]:
# Predict integer class ids for the test text and map them straight back to
# the original sentiment labels with the fitted encoder.
test_predict = label_encoder.inverse_transform(estimator.predict(X_test))
print(test_predict)

In [None]:
# Attach the predicted labels to the test frame and write only the `id` and
# `sentiment` columns to the output CSV (no index column).
test['sentiment'] = test_predict
test.to_csv(path_or_buf='new.csv', columns=['id', 'sentiment'], sep=',', index=False)