## Imports and Installs

In [1]:
#installing jupyter lab extensions I like to use
!jupyter labextension install @jupyterlab/toc;
!jupyter labextension install @ijmbarr/jupyterlab_spellchecker;
!jupyter labextension install @aquirdturtle/collapsible_headings;
!jupyter labextension install @jupyter-widgets/jupyterlab-manager;



Building jupyterlab assets (build:prod:minimize)
Building jupyterlab assets (build:prod:minimize)
Building jupyterlab assets (build:prod:minimize)
Building jupyterlab assets (build:prod:minimize)


In [1]:
#installing and importing spacy, which I use for tokenizing text
!pip install spacy;
import spacy

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
#importing spacy's tokenizer language model to support slightly more advanced tokenizing
!python -m spacy download en_core_web_sm;

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 714 kB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-py3-none-any.whl size=12011738 sha256=2025bde9fd5870ab410d9b525f7e1a6d3ebd12066cbcd24c9e4f01f91bbde7b8
  Stored in directory: /tmp/pip-ephem-wheel-cache-ym2q6i0i/wheels/51/19/da/a3885266a3c241aff0ad2eb674ae058fd34a4870fef1c0a5a0
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.2.5
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
#imports
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

## Tokenizer

In [3]:
#loads spacy's small language model for tokenizer, disabling parameters not needed to speed up performance
nlp = spacy.load("en_core_web_sm", disable=['tagger', 'parser', 'ner'])

In [4]:
#tokenizer helper function, that runs as part of data processing cell
def tokenizer(text, nlp):

    token_list = []
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False and token.is_punct==False:
            if token.text != ' ':
                token_list.append((token.lemma_).lower())
    str_tokens = ' '.join(token_list)
    return str_tokens

## Reading and shaping data for Modeling

In [5]:
#reading in text data from GDELT
print('reading in text data')
df = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/master_data_set/gdelt_text_tone_complete_oct_22.csv')

display(f'gdelt_df_shape: {df.shape}')
print('\n')

print('DF HEAD')
display(df.head())

print('\n')
print('setting datatime')

#converts datetime
df['date_time'] = pd.to_datetime(df['DATE'])

#sets datetime index
df.set_index('date_time', inplace=True)

print('\n')
print('Filling empty title columns')

#fills missing title with no_title
df['title'].fillna('no_title', inplace = True)

print('\n')
print('tokenizing titles')

#tokenzies title
df['title_tokens'] = df['title'].apply(lambda x: tokenizer(str(x), nlp))

print('\n')
print('joining text and title')

#joins title and text
df['title_text'] = df['title_tokens'] + df['text_tokens']

#display(df.head())
print('Reshaped GDELT DF')
display(f'df: {df.shape}')

print('\n')
print('resampling at 1h')

#resample data into 1 hour increments. Joins articles into one giant string for each hour
test_resample = df.resample('1h')['title_text'].agg(lambda column: "".join(column))
#convert to dataframe
test_resample = pd.DataFrame(test_resample)

print('\n')
print('GDELT Text Resampled by 1 hour')
display(test_resample.head(3))

print('\n')

print('Getting GDELT tonal scores and resampling to 1h by calculating mean')

#grabs scores (I had aspirations of using this data in my model)
scores = df[['avg_tone', 'pos_words', 'neg_words', 'polarity', 'act_ref_density', 'self_group_density', 'word_count']]

#resample scores metadata
scores = scores.resample('1h').mean()

print('\n')
print('GDELT Tonal Dataset')
display(scores.head(3))
print('\n')

#Joining resampled text data with resampled tonal data
print('Joining resampled text data with resampled tonal data')
test_resample = pd.concat([test_resample, scores], axis=1)

#resetting index
test_resample.reset_index(inplace=True)

#display text and tonal resampled data set
print('Text and Tonal resampled GDELT data')
display(test_resample.head(3))
print('\n')

print('Shape of new tonal text GDELT data')
display(f'clean text shape: {test_resample.shape}')

#checking for empty title_text strings. Can't use null here because elements in text are a string, even empty string
print('\n')
print('Checking for empty strings in text feature')
useful_sample_size = (test_resample['title_text'].str.len()>0).sum()

print('\n')
print('Displaying number of empty strings')
display(f'text > 0: {useful_sample_size}')


print('\n')

#Adding padding to text to ensure complete join with Google Trends Target Data
#this is important when I `shift` the Google Trends target data
#Empty strings will eventually be dropped
print('Adding padding to text to ensure complete join with Google Trends Target Data')

#getting column headers
columns = list(test_resample.columns)

#creating list of data for padding
date_list = list(pd.date_range(start='2020-10-07 21:00:00', end='2020-10-11 00:00:00', freq='1h'))

#creating dict of columns with date_list and empty_string
data = {'date_time': date_list, 'title_text': 'empty_string'}

#creatinge padding dataframe
df_holder = pd.DataFrame(data,columns=columns)

#appending padding dataframe to text dataframe
print('\n')
print('appending new padding to test_resample')
test_resample = test_resample.append(df_holder)

print('\n')
print('Reviewing reshaped GDELT text and tonal feature set')
print('\n')

#read in gtrend target data
print('Reading in google trends')

gtrends = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/google_trends/gtrends_2015-2020_clean.csv')

print('\n')
display(f'gtrend shape: {gtrends.shape}')

print('grouping gtrends data and removing duplicates')
gtrends = gtrends.groupby('date').mean()

display(f'gtrends duplicates removed: {gtrends.shape}')

print('removing extra google trends data')

#returning only Gtrends data that fits in time range of text data
gtrends = gtrends['2015-03-02 00:00:00': '2020-10-10 00:00:00']

#reset index
gtrends.reset_index(inplace=True)

#drop unneeded columns
gtrends.drop(['Unnamed: 0', 'isPartial'], axis=1, inplace=True)

print('setting datetime on gtrends')

#convert date col to datetime
gtrends['date'] = pd.to_datetime(gtrends['date'])

display(f'gtrends timeframe reduced: {gtrends.shape}')

#merging text and trends
gtrends_gdelt = gtrends.merge(test_resample, how='inner', left_on = 'date', right_on = 'date_time')

#dropping extra data column
gtrends_gdelt.drop('date_time', axis=1, inplace=True)
display(f'gtrends_gdelt shape: {gtrends_gdelt.shape}')
display(f'gtrends_gdelt cols: {gtrends_gdelt.columns}')

#checking complete records
complete_records = (gtrends_gdelt['title_text'].str.len()>0).sum()
print(f'gtrends_gdelt complete records: {complete_records}')

#binarizing depression
gtrends_gdelt['depression_binary'] = np.where(gtrends_gdelt['depression'] >= 36, 1, 0)

#grabbing value counts to ensure they are balanced
dep_val_count = gtrends_gdelt['depression_binary'].value_counts()

print(f'binary_depression value counts: {dep_val_count}')

#shifting trends data to so target is now 12 hours later that text feature data
print('shifting google trends by -12 houors')
gtrends_gdelt['shifted_12h'] = gtrends_gdelt['depression_binary'].shift(-12)

#dropping NA rows, which should just be 24 end rows
gtrends_gdelt.dropna(inplace=True)

#grapping complete rows

print('setting final df')

#removing rows that don't have article text data
gtrends_text_final = gtrends_gdelt[gtrends_gdelt['title_text'].str.len()>12]

print(gtrends_text_final.info())
print('\n')
display(gtrends_text_final.head())
print('\n')

#creating final dataset for machine learning. Keeping many of these columns because of aspriation hope that I can test in my mod
x_y = gtrends_text_final[['title_text', 'avg_tone', 'pos_words', 'neg_words', 'polarity', 'act_ref_density', 'self_group_density', 'word_count', 'shifted_12h']]
x_y.head()


reading in text data


'gdelt_df_shape: (214706, 12)'



DF HEAD


Unnamed: 0.1,Unnamed: 0,GKGRECORDID,DATE,title,text_tokens,avg_tone,pos_words,neg_words,polarity,act_ref_density,self_group_density,word_count
0,0,20150302100000-674,2015-03-02 10:00:00,america clean energy laggard,answer resound myriad claim energy need debunk...,0.350631,2.734923,2.384292,5.119215,16.760168,0.420757,1186
1,1,20150302153000-229,2015-03-02 15:30:00,watch meet press treat climate change big joke,hear sen. james inhofe r okla astonishingly ch...,-0.952381,3.492063,4.444444,7.936508,26.984127,1.428571,576
2,2,20150302163000-237,2015-03-02 16:30:00,no_title,mary bowerman usa today network visitors show ...,0.0,1.814059,1.814059,3.628118,25.396825,0.0,405
3,3,20150302180000-1352,2015-03-02 18:00:00,russian energy deal comes contentious time,mr. fridman business track record hard cameron...,-1.147541,1.803279,2.95082,4.754098,19.508197,0.409836,1119
4,4,20150302203000-163,2015-03-02 20:30:00,climate change cause syrian civil war,climate change spark historic drought syria co...,-8.054523,0.371747,8.42627,8.798017,24.039653,0.247831,743




setting datatime


Filling empty title columns


tokenizing titles


joining text and title
Reshaped GDELT DF


'df: (214706, 14)'



resampling at 1h


GDELT Text Resampled by 1 hour


Unnamed: 0_level_0,title_text
date_time,Unnamed: 1_level_1
2015-03-02 10:00:00,america clean energy laggardanswer resound myr...
2015-03-02 11:00:00,
2015-03-02 12:00:00,




Getting GDELT tonal scores and resampling to 1h by calculating mean


GDELT Tonal Dataset


Unnamed: 0_level_0,avg_tone,pos_words,neg_words,polarity,act_ref_density,self_group_density,word_count
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-03-02 10:00:00,0.350631,2.734923,2.384292,5.119215,16.760168,0.420757,1186.0
2015-03-02 11:00:00,,,,,,,
2015-03-02 12:00:00,,,,,,,




Joining resampled text data with resampled tonal data
Text and Tonal resampled GDELT data


Unnamed: 0,date_time,title_text,avg_tone,pos_words,neg_words,polarity,act_ref_density,self_group_density,word_count
0,2015-03-02 10:00:00,america clean energy laggardanswer resound myr...,0.350631,2.734923,2.384292,5.119215,16.760168,0.420757,1186.0
1,2015-03-02 11:00:00,,,,,,,,
2,2015-03-02 12:00:00,,,,,,,,




Shape of new tonal text GDELT data


'clean text shape: (49115, 9)'



Checking for empty strings in text feature


Displaying number of empty strings


'text > 0: 29822'



Adding padding to text to ensure complete join with Google Trends Target Data


appending new padding to test_resample


Reviewing reshaped GDELT text and tonal feature set


Reading in google trends




'gtrend shape: (66741, 8)'

grouping gtrends data and removing duplicates


'gtrends duplicates removed: (50096, 7)'

removing extra google trends data
setting datetime on gtrends


'gtrends timeframe reduced: (48538, 6)'

'gtrends_gdelt shape: (48528, 14)'

"gtrends_gdelt cols: Index(['date', 'depression', 'anxiety', 'government', 'politics', 'democracy',\n       'title_text', 'avg_tone', 'pos_words', 'neg_words', 'polarity',\n       'act_ref_density', 'self_group_density', 'word_count'],\n      dtype='object')"

gtrends_gdelt complete records: 29441
binary_depression value counts: 1    24881
0    23647
Name: depression_binary, dtype: int64
shifting google trends by -24 houors
setting final df
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29389 entries, 0 to 48475
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                29389 non-null  datetime64[ns]
 1   depression          29389 non-null  float64       
 2   anxiety             29389 non-null  float64       
 3   government          29389 non-null  float64       
 4   politics            29389 non-null  float64       
 5   democracy           29389 non-null  float64       
 6   title_text          29389 non-null  object        
 7   avg_tone            29389 non-null  float64       
 8   pos_words           29389 non-null  float64       
 9   neg_words           29389 non-null  float64       
 10  polarity            29389 non-

Unnamed: 0,date,depression,anxiety,government,politics,democracy,title_text,avg_tone,pos_words,neg_words,polarity,act_ref_density,self_group_density,word_count,depression_binary,shifted_12h
0,2015-03-02 10:00:00,15.0,16.0,71.0,4.0,2.0,america clean energy laggardanswer resound myr...,0.350631,2.734923,2.384292,5.119215,16.760168,0.420757,1186.0,0,0.0
5,2015-03-02 15:00:00,18.0,13.0,42.0,6.0,5.0,watch meet press treat climate change big joke...,-0.952381,3.492063,4.444444,7.936508,26.984127,1.428571,576.0,0,0.0
6,2015-03-02 16:00:00,17.0,13.0,41.0,6.0,5.0,no_titlemary bowerman usa today network visito...,0.0,1.814059,1.814059,3.628118,25.396825,0.0,405.0,0,0.0
8,2015-03-02 18:00:00,18.0,14.0,39.0,6.0,4.0,russian energy deal come contentious timemr. f...,-1.147541,1.803279,2.95082,4.754098,19.508197,0.409836,1119.0,0,0.0
10,2015-03-02 20:00:00,17.0,13.0,35.0,5.0,4.0,climate change cause syrian civil warclimate c...,-5.368873,0.909366,6.278239,7.187605,24.159231,0.123712,616.333333,0,0.0






Unnamed: 0,title_text,avg_tone,pos_words,neg_words,polarity,act_ref_density,self_group_density,word_count,shifted_12h
0,america clean energy laggardanswer resound myr...,0.350631,2.734923,2.384292,5.119215,16.760168,0.420757,1186.0,0.0
5,watch meet press treat climate change big joke...,-0.952381,3.492063,4.444444,7.936508,26.984127,1.428571,576.0,0.0
6,no_titlemary bowerman usa today network visito...,0.0,1.814059,1.814059,3.628118,25.396825,0.0,405.0,0.0
8,russian energy deal come contentious timemr. f...,-1.147541,1.803279,2.95082,4.754098,19.508197,0.409836,1119.0,0.0
10,climate change cause syrian civil warclimate c...,-5.368873,0.909366,6.278239,7.187605,24.159231,0.123712,616.333333,0.0


In [8]:
gtrends_text_final.to_csv('final_aggregated_data_set_Oct_26.csv')

# TF-IDF Models

### TF-IDF Vectorization

In [11]:

#setting x y
X = x_y['title_text']
y = x_y['shifted_12h']

X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.2, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = TfidfVectorizer(min_df=5)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')


Split done - X_train shape: (23511,), X_test shape: (5878,), y_train shape: (23511,), y_test shape: (5878,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed


### Logistic Regression with TF-IDF, Google Trends Target

In [7]:

#create model
print('creating model')
model = LogisticRegression(C=.1, solver='liblinear')
print('model completed')


#fit model
print('fitting model')
model.fit(X_train_transformed, y_train)
print('model fitted')

#score training set 
print('scoring training data')
train_score = model.score(X_train_transformed, y_train)

#score test set
print('scoring test data')
test_score = model.score(X_test_transformed, y_test)

print(f'Training score: {train_score}')
print(f'Test score: {test_score}')



Split done - X_train shape: (23511,), X_test shape: (5878,), y_train shape: (23511,), y_test shape: (5878,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed
creating model
model completed
fitting model
model fitted
scoring training data
scoring test data
Training score: 0.6719407936710476
Test score: 0.6097312010888057


### Grid Search with Sklearn, TF-IDF, Models - LogReg

In [30]:


from tempfile import mkdtemp
cachedir = mkdtemp()
estimators = [('model', LogisticRegression())]
pipe = Pipeline(estimators, memory = cachedir)

param_grid = [{'model': [LogisticRegression()],
 
             'model__C': [0.001, 0.01, 0.1, 1, 10],
             'model__solver': ['liblinear', 'newton-cg', 'sag', 'saga','lbfgs']}]
              
#              {'model': [SVC()],
#             'model__gamma': [0.001, 0.01, 0.1, 1, 10],
#              'model__C': [0.001, 0.01, 0.1, 1, 10]},\
              
#              {'model': [RandomForestClassifier()]},
              
#              {'model': [SGDClassifier()],
#             'model__alpha': (0.00001, 0.000001),
#             'model__penalty': ('l2', 'elasticnet'),
#             'model__max_iter': (10, 50, 80)}]
              
#             {'model': [XGBClassifier(n_jobs=-1)],
#               'model__n_estimators': np.arange(1,600,200),
#              'model__learning_rate': [0.25, 0.5, 1]}]
                       


grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=3, verbose=10)

fittedgrid = grid.fit(X_train_transformed, y_train)

display(fittedgrid.score(X_train_transformed, y_train))

display(fittedgrid.score(X_test_transformed, y_test))

display(fittedgrid.best_estimator_)

display(fittedgrid.best_params_)

display(fittedgrid.best_score_)

display(fittedgrid.param_grid)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  68 out of  75 | elapsed:  1.6min remaining:    9.7s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  1.7min finished


0.9427076687507975

0.631337189520245

Pipeline(memory='/tmp/tmp4v1paivv',
         steps=[('model',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

{'model': LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 'model__C': 10,
 'model__solver': 'liblinear'}

0.6414018969843903

[{'model': [LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                      warm_start=False)],
  'model__C': [0.001, 0.01, 0.1, 1, 10],
  'model__solver': ['liblinear', 'newton-cg', 'sag', 'saga', 'lbfgs']}]

### 'Grid Search' with H2O Auto ML

In [31]:
#H2O specific imports
!pip install requests;
!pip install tabulate;
!pip install "colorama>=0.3.8";
!pip install future;

!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o;

import h2o
from scipy import sparse
from h2o.automl import H2OAutoML

h2o.init()

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
Collecting colorama>=0.3.8
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.4
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html
Collecting h2o
  Downloading h2o-3.30.1.3.tar.gz (129.4 MB)
[K     |████████████████████████████████| 129.4 MB 35 kB/s s eta 0:00:01
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25ldone
[?25h  Created wheel for h2o: filename=h2o-3.30.1.3-py2.py3-none-any.whl size=129446676 sha256=a2b21d6f5e4da4f2

0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_unknownUser_g29jge
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.381 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [None]:
#preparing data for H2O
train = sparse.hstack((X_train_transformed, np.array(y_train)[:,None]))
test = sparse.hstack((X_test_transformed, np.array(y_test)[:,None]))

train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

y_train_h2o = train_h2o.col_names[-1]
X_train_h2o = train_h2o.col_names[:33686]
y_test_h2o = test_h2o.col_names[-1]
X_test_h2o = test_h2o.col_names[:33686]
train_h2o[y_train_h2o] = train_h2o[y_train_h2o].asfactor()
test_h2o[y_train_h2o] = test_h2o[y_train_h2o].asfactor()

from h2o.automl import H2OAutoML

aml = H2OAutoML(max_models=10, max_runtime_secs=500, balance_classes=True)
aml.train(x=X_train_h2o, y=y_train_h2o, training_frame=train_h2o)

lb = aml.leaderboard

lb.head(rows=lb.nrows)

display(aml.leader)

perf = aml.leader.model_performance(test_h2o)

perf

Parse progress: |████████████████████████████████████████████████████████

# MVP - Article with Binary Polarity Target

In [None]:
#reading in gdelt master
print('reading in text daya')
df = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/master_data_set/gdelt_text_tone_complete_oct_22.csv')

display(f'df: {df.shape}')

#display(df.head())

print('setting datatime')
#cleans date time
#df['date_time']  = df['gkgcode'].apply(lambda x: x[:14])



#converts datetime
df['date_time'] = pd.to_datetime(df['DATE'])

#sets datetime index
df.set_index('date_time', inplace=True)

print('filling empty title columns')
#fills missing title with no_title
df['title'].fillna('no_title', inplace = True)

print('tokenizing titles')
#tokenzies title
df['title_tokens'] = df['title'].apply(lambda x: tokenizer(str(x), nlp))

print('joining text and title')
#joins title and text
df['title_text'] = df['title_tokens'] + df['text_tokens']

#display(df.head())
display(f'df: {df.shape}')

In [None]:
#binarize polarity, avg tone
#set new x and y

In [None]:

#setting x y


X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.2, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = TfidfVectorizer(min_df=5)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

#create model
print('creating model')
model = LogisticRegression(C=.1, solver='liblinear')
print('model completed')


#fit model
print('fitting model')
model.fit(X_train_transformed, y_train)
print('model fitted')

#score training set 
print('scoring training data')
train_score = model.score(X_train_transformed, y_train)

#score test set
print('scoring test data')
test_score = model.score(X_test_transformed, y_test)

print(f'Training score: {train_score}')
print(f'Test score: {test_score}')



# Log Reg, GTrends Target, Numeric Tonal Features Only

In [None]:
#set X_Y
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.2, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

In [None]:
from tempfile import mkdtemp
cachedir = mkdtemp()
estimators = [('model', LogisticRegression())]
pipe = Pipeline(estimators, memory = cachedir)

param_grid = [{'model': [LogisticRegression()],
 
             'model__C': [0.001, 0.01, 0.1, 1, 10],
             'model__solver': ['liblinear', 'newton-cg', 'sag', 'saga','lbfgs']},\
              
             {'model': [SVC()],
            'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10]},\
              
             {'model': [RandomForestClassifier()]},
              
             {'model': [SGDClassifier()],
            'model__alpha': (0.00001, 0.000001),
            'model__penalty': ('l2', 'elasticnet'),
            'model__max_iter': (10, 50, 80)},
              
            {'model': [XGBClassifier(n_jobs=-1)],
              'model__n_estimators': np.arange(1,500,100),
             'model__learning_rate': [0.25, 0.5, 1]}]
                       


grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=5, verbose=10)

fittedgrid = grid.fit(X_train_transformed, y_train)

In [None]:
fittedgrid.score(X_train_transformed, y_train)

fittedgrid.score(X_test_transformed, y_test)

fittedgrid.best_estimator_

fittedgrid.best_params_

fittedgrid.best_score_

fittedgrid.param_grid

# Granger Casuality and Time Series - TBD