In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import re

# Reading train data

In [2]:
# Load the training set (path is relative to the notebook's working directory).
# NOTE(review): the preview shows `Unnamed: 0.1` / `Unnamed: 0` columns --
# presumably row indices written by earlier `to_csv` calls; confirm and drop
# them upstream if nothing depends on them.
train = pd.read_csv(filepath_or_buffer='data/train.csv')

# Show the first five rows as a sanity check.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,author,content,sentiment
0,26898,richardepryor,"@treasaint salad stuff, some chillis, whatever...",happiness
1,27635,reese,"@sunnyjamiel sunny, I'm a workin' on it. It's ...",neutral
2,3036,mutedriposte,@jolynnchew so early??,surprise
3,5604,sakizzie_1102,"So now, I have conjunctivitis in my left eye. ...",sadness
4,36111,poptrash,"Out and about in Deal, Kent. More sunshine req...",love


### Encoding target

In [3]:
# Inputs are the raw texts, targets are the sentiment names (still strings at
# this point); both are pulled out of the frame as plain NumPy arrays.
X_train, y_train = (train[column].values for column in ('content', 'sentiment'))
y_train

array(['happiness', 'neutral', 'surprise', ..., 'fun', 'hate', 'love'],
      dtype=object)

In [4]:
# Encode the sentiment names as integer class ids.
#
# The encoder is fitted on the source column of `train` rather than on
# `y_train` itself, so the cell is idempotent: re-running it no longer re-fits
# the encoder on already-encoded integers (which would silently replace the
# name -> id mapping in `label_encoder.classes_` with integers).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['sentiment'].values)

# To turn ids back into sentiment names use
# `label_encoder.inverse_transform(...)`.
y_train

array([ 5,  8, 11, ...,  4,  6,  7])

In [5]:
# Disabled experiment (kept commented out, so nothing here runs):
# `clean_text` would return a copy of X in which every "@" and the run of
# non-whitespace characters following it is removed from each text.
# The `i < 20` branch only prints the first 20 original texts for eyeballing.
# def clean_text(X):
#     result = X.copy()
#     for i, text in enumerate(result):
#         if i < 20:
#             print(text)
#         result[i] = re.sub(r"@[^\s]*", '', text)
#     return result

In [6]:
# Disabled: would run clean_text over X_train and print the first sample
# before and after. clean_text is itself commented out, so it must be
# re-enabled first if this cell is ever restored.
# print(X_train[0])
# X_train = clean_text(X_train)
# print(X_train[0])

### Vectorizing texts

In [7]:
# Text-classification pipeline:
#   vect  -- turns each raw text into token counts
#   tfidf -- re-weights / normalises the count matrix
#   clf   -- logistic-regression classifier on the resulting features
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # The solver is pinned explicitly instead of relying on the library
    # default: the grid search tries both 'l1' and 'l2' penalties, and
    # 'liblinear' supports both. It is also the solver shown in the saved
    # output of this notebook, so the recorded behaviour is preserved, while
    # newer scikit-learn releases default to 'lbfgs', which rejects 'l1'.
    ('clf', LogisticRegression(solver='liblinear', random_state=0)),
])

In [8]:
# Fixed settings for the vectorizer and tf-idf steps (presumably the winners
# of an earlier search over these options -- confirm). Keys follow the
# `<step>__<parameter>` convention used by Pipeline.set_params.
best_params = dict(
    tfidf__norm='l2',
    tfidf__smooth_idf=True,
    tfidf__sublinear_tf=False,
    tfidf__use_idf=False,
    vect__stop_words=None,
)

# Apply them to the pipeline; set_params returns the pipeline, so it is
# displayed as the cell output.
pipeline.set_params(**best_params)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [17]:
# Search space for the grid search. Only the classifier step (`clf`) is tuned;
# the vectorizer / tf-idf candidates that used to be listed here are disabled:
#   vect__analyzer:      'word', 'char', 'char_wb'
#   vect__stop_words:    None, 'english'
#   tfidf__norm:         'l1', 'l2'
#   tfidf__use_idf:      True, False
#   tfidf__smooth_idf:   True, False
#   tfidf__sublinear_tf: True, False
params = dict(
    clf__penalty=['l1', 'l2'],
    # Ten geometrically spaced values of C between 1e-3 and 1000.
    clf__C=np.geomspace(start=1e-3, stop=1000, num=10),
    clf__class_weight=[None, 'balanced'],
)

In [None]:
# Exhaustive search over `params`, scored by accuracy with 5-fold
# cross-validation. n_jobs=-2 runs the fits in parallel on all CPUs but one;
# train scores are not computed.
# NOTE(review): the grid defined in `params` has 2 * 10 * 2 = 40 candidates,
# while the saved output of this cell reports 20 -- the output looks stale and
# the cell should be re-run.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    n_jobs=-2,
    cv=5,
    verbose=1,
    return_train_score=False,
)

# fit returns the fitted search object, which is shown as the cell output.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:   13.1s


In [None]:
# Best mean cross-validated score and the parameter combination behind it,
# one per line, followed by the corresponding pipeline as the cell output.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')
grid_search.best_estimator_

In [None]:
# Full cross-validation log as a table: one row per candidate in the grid.
pd.DataFrame(data=grid_search.cv_results_)