# Grid search on daily text data, target = Brandwatch sentiment, no shift, 2018 - 2020

- Early effort to predict Brandwatch social media sentiment from daily aggregated articles with no shift
- Resample data set was too small, at just only 366 rows
- As a result, models badly overfit

# Imports

In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
import re


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import SparsePCA
import spacy
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

# Reading and shaping data

In [38]:
text = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/master_data_set/text_with_tokens_52k.csv')
#convert date to datetime object
text['date'] = pd.to_datetime(text['date'])

#create day groupby object
grouped_text = text.groupby([text['date'].dt.year, text['date'].dt.month, text['date'].dt.day])

#aggregating tokens by day
text_day_grouped = grouped_text['text_token'].agg(lambda column: "".join(column))

#set as df
text_day_grouped = pd.DataFrame(text_day_grouped)

#rename index
text_day_grouped = text_day_grouped.rename_axis(index=['year', 'month', 'day'])

#reset_index
text_day_grouped = text_day_grouped.reset_index()

#create datetime object col
text_day_grouped['date_grouped'] = pd.to_datetime(text_day_grouped[['year', 'month', 'day']])

In [39]:
#review text data
text_day_grouped.head()

Unnamed: 0,year,month,day,text_token,date_grouped
0,2015,3,2,"['answer', 'resounding', 'myriad', 'claim', 'e...",2015-03-02
1,2015,3,3,"['scientist', 'center', 'controversy', 'fossil...",2015-03-03
2,2015,3,4,"['scientist', 'step', 'closer', 'understand', ...",2015-03-04
3,2015,3,5,"['high', 'blessed', 'relief', 'finally', 'pres...",2015-03-05
4,2015,3,6,"['california', 'lead', 'nation', 'take', 'acti...",2015-03-06


In [31]:
#read in Brandwatch social media sentiment data
sentiment = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/brandwatch/bw_sentiment_emotion_day/bw_sentiment_2018-2020.csv')

#drop unnecessary columns
sentiment.drop('Unnamed: 0', axis=1, inplace=True)

#create datetime object column
sentiment['days'] = pd.to_datetime(sentiment['days'])

In [35]:
#review sentiment data
sentiment.head()

Unnamed: 0,days,sentiment
0,2018-10-05,-1.119873
1,2018-10-06,-0.847089
2,2018-10-07,-1.485399
3,2018-10-08,-0.894346
4,2018-10-09,-0.762045


In [36]:
#binarizing sentiment on -1.48 mean value
sentiment['binary_sentiment'] = np.where(sentiment['sentiment'] >= -1.52, 1, 0)

In [40]:
#merge text and target data
x_y_complete = sentiment.merge(text_day_grouped, how='inner',  left_on='days', right_on='date_grouped')

In [41]:
#review merged data
x_y_complete.head()

Unnamed: 0,days,sentiment,binary_sentiment,year,month,day,text_token,date_grouped
0,2018-10-05,-1.119873,1,2018,10,5,"['kuala', 'lumpur', 'oct', '4', 'thomson', 're...",2018-10-05
1,2018-10-06,-0.847089,1,2018,10,6,"['past', 'couple', 'week', 'see', 'mr.', 'trum...",2018-10-06
2,2018-10-07,-1.485399,1,2018,10,7,"['couple', 'contact', 'december', '2016', 'was...",2018-10-07
3,2018-10-08,-0.894346,1,2018,10,8,"['cheltenham', 'england', 'thomson', 'reuters'...",2018-10-08
4,2018-10-09,-0.762045,1,2018,10,9,"['stockholm', 'reuters', 'americans', 'william...",2018-10-09


In [42]:
#set x and y
X = x_y_complete['text_token']
y = x_y_complete['binary_sentiment']

# TF-IDF with Logistic Regression

- Training score: 0.7324675324675325
- Test score: 0.6566265060240963

In [44]:
#test train split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = TfidfVectorizer(min_df=5)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

#create model
print('creating model')
model = LogisticRegression(C=.1, solver='liblinear')
print('model completed')


#fit model
print('fitting model')
model.fit(X_train_transformed, y_train)
print('model fitted')

#score training set 
print('scoring training data')
train_score = model.score(X_train_transformed, y_train)

#score test set
print('scoring test data')
test_score = model.score(X_test_transformed, y_test)

print(f'Training score: {train_score}')
print(f'Test score: {test_score}')
#return (bagofwords, model, X_train_transformed, X_test_transformed, y_train, y_test)


Split done - X_train shape: (385,), X_test shape: (166,), y_train shape: (385,), y_test shape: (166,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed
creating model
model completed
fitting model
model fitted
scoring training data
scoring test data
Training score: 0.7324675324675325
Test score: 0.6566265060240963


# Grid search with TF-IDF data, models = Logistic Regression, Random Forest, SVC, SGDClassifier

- Best model - SGDClassifier
- Train score: 100 percent
- Test score: 67 percent 



In [45]:
from tempfile import mkdtemp
cachedir = mkdtemp()
estimators = [('model', LogisticRegression())]
pipe = Pipeline(estimators, memory = cachedir)

param_grid = [{'model': [LogisticRegression()],
 
             'model__C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
             'model__solver': ['liblinear', 'newton-cg', 'sag', 'saga','lbfgs']},\
              
             {'model': [SVC()],
            'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10]},\
              
             {'model': [RandomForestClassifier()]},
              
             {'model': [SGDClassifier()],
            'model__alpha': (0.00001, 0.000001),
            'model__penalty': ('l2', 'elasticnet'),
            'model__max_iter': (10, 50, 80)}]
                       


grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=5, verbose=10)

fittedgrid = grid.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 73 candidates, totalling 365 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1

In [46]:
#score training data
fittedgrid.score(X_train_transformed, y_train)

1.0

In [47]:
#score test data
fittedgrid.score(X_test_transformed, y_test)

0.6686746987951807

In [49]:
#review best model (which was not very good)
fittedgrid.best_estimator_

Pipeline(memory='/tmp/tmpdkl336el',
         steps=[('model',
                 SGDClassifier(alpha=1e-05, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=50, n_iter_no_change=5, n_jobs=None,
                               penalty='elasticnet', power_t=0.5,
                               random_state=None, shuffle=True, tol=0.001,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False))],
         verbose=False)

# Grid search with TF-IDF data, models = Logistic Regression, Random Forest, SVC, SGDClassifier, and XGBoost

- Added XGBoost to the mix
- Best model - Logistic Regression
- Train score: 98.7 percent
- Test score: 67 percent 



In [56]:
from tempfile import mkdtemp
cachedir = mkdtemp()
estimators = [('model', LogisticRegression())]
pipe = Pipeline(estimators, memory = cachedir)

param_grid = [{'model': [LogisticRegression()],
 
             'model__C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
             'model__solver': ['liblinear', 'newton-cg', 'sag', 'saga','lbfgs']},\
              
             {'model': [SVC()],
            'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10]},\
              
             {'model': [RandomForestClassifier()]},
              
             {'model': [SGDClassifier()],
            'model__alpha': (0.00001, 0.000001),
            'model__penalty': ('l2', 'elasticnet'),
            'model__max_iter': (10, 50, 80)},
              
            {'model': [XGBClassifier(n_jobs=-1)],
              'model__n_estimators': np.arange(1,500,10),
             'model__learning_rate': [0.25, 0.5, 1]}]
                       


grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=5, verbose=10)

fittedgrid = grid.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 223 candidates, totalling 1115 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1

In [57]:
#score train
fittedgrid.score(X_train_transformed, y_train)

0.987012987012987

In [58]:
#score test
fittedgrid.score(X_test_transformed, y_test)

0.6686746987951807

In [59]:
#get best model
fittedgrid.best_estimator_

Pipeline(memory='/tmp/tmp_iffnu51',
         steps=[('model',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)