In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

%matplotlib inline


# Single seed reused by every stochastic step in the notebook
# (train/test split, model fitting) so results are repeatable.
SEED = 1337

# Location of the tweets table, relative to the notebook's working directory.
TWEETS_CSV = 'Tweets.csv'

# Load the whole table into memory; later cells encode columns of `df` in place.
df = pd.read_csv(TWEETS_CSV)

In [7]:
# Sentiment is ordinal (negative < neutral < positive), so encode it as 0 < 1 < 2.
SENTIMENT_CODES = {'negative': 0, 'neutral': 1, 'positive': 2}

# Why not masked `.loc` assignment: writing ints into a string column leaves the
# column with dtype `object`, so `df.airline_sentiment.values` becomes an object
# array of Python ints, which scikit-learn's target-type check treats as an
# "unknown" label type. Mapping and casting yields a genuine int64 column.
#
# `.get(label, label)` passes already-encoded values through unchanged, so
# re-running this cell is a no-op. Any unexpected label (or missing value)
# makes the int64 cast fail loudly instead of slipping through unencoded.
df['airline_sentiment'] = (
    df['airline_sentiment']
    .map(lambda label: SENTIMENT_CODES.get(label, label))
    .astype('int64')
)

In [8]:
# Replace the airline column with integer category codes.
# The fitted encoder stays available as `airline_le` for later lookups.
airline_le = LabelEncoder()
airline_le.fit(df['airline'])
df['airline'] = airline_le.transform(df['airline'])

# Preview the encoded frame.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,retweet_count,text,text_len
0,570306133677760513,1,5,0,@VirginAmerica What @dhepburn said.,35
1,570301130888122368,2,5,0,@VirginAmerica plus you've added commercials t...,72
2,570301083672813571,1,5,0,@VirginAmerica I didn't today... Must mean I n...,71
3,570301031407624196,0,5,0,@VirginAmerica it's really aggressive to blast...,126
4,570300817074462722,0,5,0,@VirginAmerica and it's a really big bad thing...,55


In [9]:
# Target vector: the encoded sentiment column as a plain array.
y = df['airline_sentiment'].values

# Hold out a quarter of the rows for testing. The split is shuffled, seeded,
# and stratified on the target so both parts keep the same class proportions.
split_options = dict(test_size=0.25, shuffle=True, stratify=y, random_state=SEED)
df_train, df_test, y_train, y_test = train_test_split(df, y, **split_options)

# Report the number of rows in each part.
for split_name, split_frame in (('train', df_train), ('test', df_test)):
    print(split_name, len(split_frame))

train 10980
test 3660


## Final model

In [10]:
%%time

#best params
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV

parameters = {'Cs':[5, 10, 15], 'solver':('newton-cg', 'sag', 'lbfgs'), 
              'multi_class':('ovr', 'multinomial')}

tfidf = TfidfVectorizer(analyzer='char', use_idf=True, smooth_idf=True,
                                  lowercase=True, 
                                  stop_words=stopwords.words('english'),
                                  min_df=5,
                                  ngram_range=(1,3), norm='l2')

X_train = tfidf.fit_transform(df_train.text)

lr = LogisticRegressionCV()
clf = GridSearchCV(lr, parameters, verbose=2)
clf.fit(X_train, y_train)

print(clf.best_score_)
print(clf.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] Cs=5, multi_class=ovr, solver=newton-cg .........................
[CV] .......... Cs=5, multi_class=ovr, solver=newton-cg, total= 1.2min
[CV] Cs=5, multi_class=ovr, solver=newton-cg .........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV] .......... Cs=5, multi_class=ovr, solver=newton-cg, total= 1.2min
[CV] Cs=5, multi_class=ovr, solver=newton-cg .........................
[CV] .......... Cs=5, multi_class=ovr, solver=newton-cg, total= 1.2min
[CV] Cs=5, multi_class=ovr, solver=sag ...............................




[CV] ................ Cs=5, multi_class=ovr, solver=sag, total= 1.0min
[CV] Cs=5, multi_class=ovr, solver=sag ...............................




[CV] ................ Cs=5, multi_class=ovr, solver=sag, total= 1.0min
[CV] Cs=5, multi_class=ovr, solver=sag ...............................




[CV] ................ Cs=5, multi_class=ovr, solver=sag, total= 1.2min
[CV] Cs=5, multi_class=ovr, solver=lbfgs .............................
[CV] .............. Cs=5, multi_class=ovr, solver=lbfgs, total=  38.9s
[CV] Cs=5, multi_class=ovr, solver=lbfgs .............................
[CV] .............. Cs=5, multi_class=ovr, solver=lbfgs, total=  39.7s
[CV] Cs=5, multi_class=ovr, solver=lbfgs .............................
[CV] .............. Cs=5, multi_class=ovr, solver=lbfgs, total=  40.0s
[CV] Cs=5, multi_class=multinomial, solver=newton-cg .................
[CV] .. Cs=5, multi_class=multinomial, solver=newton-cg, total= 1.3min
[CV] Cs=5, multi_class=multinomial, solver=newton-cg .................
[CV] .. Cs=5, multi_class=multinomial, solver=newton-cg, total= 1.4min
[CV] Cs=5, multi_class=multinomial, solver=newton-cg .................
[CV] .. Cs=5, multi_class=multinomial, solver=newton-cg, total= 1.4min
[CV] Cs=5, multi_class=multinomial, solver=sag .......................




[CV] ........ Cs=5, multi_class=multinomial, solver=sag, total=  35.7s
[CV] Cs=5, multi_class=multinomial, solver=sag .......................




[CV] ........ Cs=5, multi_class=multinomial, solver=sag, total=  33.6s
[CV] Cs=5, multi_class=multinomial, solver=sag .......................




[CV] ........ Cs=5, multi_class=multinomial, solver=sag, total=  29.6s
[CV] Cs=5, multi_class=multinomial, solver=lbfgs .....................
[CV] ...... Cs=5, multi_class=multinomial, solver=lbfgs, total=  58.1s
[CV] Cs=5, multi_class=multinomial, solver=lbfgs .....................
[CV] ...... Cs=5, multi_class=multinomial, solver=lbfgs, total=  48.9s
[CV] Cs=5, multi_class=multinomial, solver=lbfgs .....................
[CV] ...... Cs=5, multi_class=multinomial, solver=lbfgs, total=  44.2s
[CV] Cs=10, multi_class=ovr, solver=newton-cg ........................
[CV] ......... Cs=10, multi_class=ovr, solver=newton-cg, total= 1.7min
[CV] Cs=10, multi_class=ovr, solver=newton-cg ........................
[CV] ......... Cs=10, multi_class=ovr, solver=newton-cg, total= 1.9min
[CV] Cs=10, multi_class=ovr, solver=newton-cg ........................
[CV] ......... Cs=10, multi_class=ovr, solver=newton-cg, total= 1.8min
[CV] Cs=10, multi_class=ovr, solver=sag ..............................




[CV] ............... Cs=10, multi_class=ovr, solver=sag, total= 1.7min
[CV] Cs=10, multi_class=ovr, solver=sag ..............................




[CV] ............... Cs=10, multi_class=ovr, solver=sag, total= 1.7min
[CV] Cs=10, multi_class=ovr, solver=sag ..............................




[CV] ............... Cs=10, multi_class=ovr, solver=sag, total= 1.7min
[CV] Cs=10, multi_class=ovr, solver=lbfgs ............................
[CV] ............. Cs=10, multi_class=ovr, solver=lbfgs, total= 1.3min
[CV] Cs=10, multi_class=ovr, solver=lbfgs ............................
[CV] ............. Cs=10, multi_class=ovr, solver=lbfgs, total= 1.3min
[CV] Cs=10, multi_class=ovr, solver=lbfgs ............................
[CV] ............. Cs=10, multi_class=ovr, solver=lbfgs, total= 1.2min
[CV] Cs=10, multi_class=multinomial, solver=newton-cg ................
[CV] . Cs=10, multi_class=multinomial, solver=newton-cg, total= 1.9min
[CV] Cs=10, multi_class=multinomial, solver=newton-cg ................
[CV] . Cs=10, multi_class=multinomial, solver=newton-cg, total= 2.1min
[CV] Cs=10, multi_class=multinomial, solver=newton-cg ................
[CV] . Cs=10, multi_class=multinomial, solver=newton-cg, total= 2.1min
[CV] Cs=10, multi_class=multinomial, solver=sag ......................




[CV] ....... Cs=10, multi_class=multinomial, solver=sag, total= 1.3min
[CV] Cs=10, multi_class=multinomial, solver=sag ......................




[CV] ....... Cs=10, multi_class=multinomial, solver=sag, total= 1.1min
[CV] Cs=10, multi_class=multinomial, solver=sag ......................




[CV] ....... Cs=10, multi_class=multinomial, solver=sag, total=  58.2s
[CV] Cs=10, multi_class=multinomial, solver=lbfgs ....................
[CV] ..... Cs=10, multi_class=multinomial, solver=lbfgs, total= 1.4min
[CV] Cs=10, multi_class=multinomial, solver=lbfgs ....................
[CV] ..... Cs=10, multi_class=multinomial, solver=lbfgs, total= 1.4min
[CV] Cs=10, multi_class=multinomial, solver=lbfgs ....................
[CV] ..... Cs=10, multi_class=multinomial, solver=lbfgs, total= 1.4min
[CV] Cs=15, multi_class=ovr, solver=newton-cg ........................
[CV] ......... Cs=15, multi_class=ovr, solver=newton-cg, total= 2.1min
[CV] Cs=15, multi_class=ovr, solver=newton-cg ........................
[CV] ......... Cs=15, multi_class=ovr, solver=newton-cg, total= 2.2min
[CV] Cs=15, multi_class=ovr, solver=newton-cg ........................
[CV] ......... Cs=15, multi_class=ovr, solver=newton-cg, total= 2.1min
[CV] Cs=15, multi_class=ovr, solver=sag ..............................




[CV] ............... Cs=15, multi_class=ovr, solver=sag, total= 2.4min
[CV] Cs=15, multi_class=ovr, solver=sag ..............................




[CV] ............... Cs=15, multi_class=ovr, solver=sag, total= 2.2min
[CV] Cs=15, multi_class=ovr, solver=sag ..............................




[CV] ............... Cs=15, multi_class=ovr, solver=sag, total= 2.3min
[CV] Cs=15, multi_class=ovr, solver=lbfgs ............................
[CV] ............. Cs=15, multi_class=ovr, solver=lbfgs, total= 1.5min
[CV] Cs=15, multi_class=ovr, solver=lbfgs ............................
[CV] ............. Cs=15, multi_class=ovr, solver=lbfgs, total= 1.6min
[CV] Cs=15, multi_class=ovr, solver=lbfgs ............................
[CV] ............. Cs=15, multi_class=ovr, solver=lbfgs, total= 1.6min
[CV] Cs=15, multi_class=multinomial, solver=newton-cg ................
[CV] . Cs=15, multi_class=multinomial, solver=newton-cg, total= 2.2min
[CV] Cs=15, multi_class=multinomial, solver=newton-cg ................
[CV] . Cs=15, multi_class=multinomial, solver=newton-cg, total= 2.4min
[CV] Cs=15, multi_class=multinomial, solver=newton-cg ................
[CV] . Cs=15, multi_class=multinomial, solver=newton-cg, total= 2.3min
[CV] Cs=15, multi_class=multinomial, solver=sag ......................




[CV] ....... Cs=15, multi_class=multinomial, solver=sag, total= 1.4min
[CV] Cs=15, multi_class=multinomial, solver=sag ......................




[CV] ....... Cs=15, multi_class=multinomial, solver=sag, total= 1.5min
[CV] Cs=15, multi_class=multinomial, solver=sag ......................




[CV] ....... Cs=15, multi_class=multinomial, solver=sag, total= 1.4min
[CV] Cs=15, multi_class=multinomial, solver=lbfgs ....................
[CV] ..... Cs=15, multi_class=multinomial, solver=lbfgs, total= 2.5min
[CV] Cs=15, multi_class=multinomial, solver=lbfgs ....................
[CV] ..... Cs=15, multi_class=multinomial, solver=lbfgs, total= 3.0min
[CV] Cs=15, multi_class=multinomial, solver=lbfgs ....................
[CV] ..... Cs=15, multi_class=multinomial, solver=lbfgs, total= 2.7min


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 82.4min finished


0.797449908925
{'Cs': 15, 'multi_class': 'ovr', 'solver': 'newton-cg'}
Wall time: 1h 27min 6s


In [14]:
# Final model: character n-gram TF-IDF feeding a cross-validated logistic
# regression, configured with the best grid-search settings
# (Cs=15, solver='newton-cg', multi_class='ovr').
#
# `stop_words` is deliberately not passed to the vectorizer: scikit-learn only
# applies it when analyzer == 'word', so with analyzer='char' it was ignored
# (newer versions warn about it) while requiring the NLTK stopword corpus.
#
# NOTE(review): min_df is 7 here but the grid-search cell vectorized with
# min_df=5, so the "best params" were selected on a slightly different feature
# set -- confirm which value is intended.
model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char', use_idf=True, smooth_idf=True,
                              lowercase=True,
                              min_df=7,
                              ngram_range=(1, 3), norm='l2')),
    ('lr', LogisticRegressionCV(Cs=15,
                                cv=5,
                                solver='newton-cg',
                                scoring='f1_macro',  # same metric as reported below
                                n_jobs=-1,
                                multi_class='ovr', random_state=SEED))
])

# Fit on raw tweet text; the pipeline vectorizes internally, so the test text
# is only ever transformed with statistics learned from the training split.
model.fit(df_train.text, y_train)

# Macro-averaged F1 on both splits; the gap between them indicates overfitting.
print('train', metrics.f1_score(y_train, model.predict(df_train.text), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test.text), average='macro'))

train 0.892099939116
test 0.73595013939
