# Variants of bagofwords, ngrams, and Naive Bayes and Logistic Regression

- V_4 Data, Target: Google Trend, Row: Hourly articles, Time: 2015 - 2020 w/ gaps, no shift in target time
- TF-IDF with ngrams
- TF-IDF without ngrams
- CountVectorizer with and without ngrams
- CountVectorizer with max_features
- Naive Bayes
- Logistic Regression

# Imports and Installs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler

# Reading Data and Transformation

In [2]:
#reading in gdelt V4 data
df = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/master_data_set/text_with_tokens_52k.csv')

display(df.shape)

display(df.head())

#extracting date from gkgcode
df['date_time']  = df['gkgcode'].apply(lambda x: x[:14])


#creating date time object
df['date_time'] = pd.to_datetime(df['date_time'], format='%Y%m%d%H%M%S')

df.set_index('date_time', inplace=True)

display(df.head())

#resample text data hourly and join text tokens

test_resample = df.resample('h')['text'].agg(lambda column: "".join(column))

display(test_resample.shape)

test_resample.apply(lambda x: len(x) < 100).sum()

test_resample = test_resample.reset_index()

### Reading in Google Trends Data

gtrends = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/google_trends/gtrends_2015-2020_clean.csv')

#set dattime
gtrends['date'] = pd.to_datetime(gtrends['date'])

display(gtrends.shape)

#merge text data with gtrends data
gtrends_gdelt = gtrends.merge(test_resample, how='inner', left_on = 'date', right_on = 'date_time' )

#create boolean mask to drop strings with less than 200 characters
gtrends_gdelt['text_bool'] = gtrends_gdelt['text'].apply(lambda x: len(x) < 200)

#drop text with less than 200 chracters
gtrends_gdelt = gtrends_gdelt[gtrends_gdelt['text_bool']==False]

gtrends_gdelt.set_index('date', inplace=True)

#filter data to text and target
depression_target = gtrends_gdelt[['text', 'depression']]

#checking distribution of depression volume 
depression_target['depression'].plot()

depression_target.describe()

plt.figure()
plt.hist(depression_target['depression'])
plt.show()

#binarize depression target
depression_target['d_search_bin'] = np.where(depression_target['depression'] >= 36, 1, 0)

#checking distribution of binarized target
depression_target['d_search_bin'].value_counts()

#setting x, y
X = depression_target['text']
y = depression_target['d_search_bin']

# TF-IDF Bag of words with ngram_range(1,3)

In [24]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = TfidfVectorizer(min_df=5, ngram_range=(1,3))
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')

#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

Split done - X_train shape: (23244,), X_test shape: (9963,), y_train shape: (23244,), y_test shape: (9963,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed


# BernoulliNB with TF-IDF and unigam, bigram, and trigram
- Test acc: 60.7%
- Train acc: 74.8%

In [26]:
#fit and score test data
model = BernoulliNB()
model.fit(X_train_transformed, y_train)
model.score(X_test_transformed, y_test)

0.607447555957041

In [28]:
#score training data
model.score(X_train_transformed, y_train)

0.7448373773877129

# TF-IDF with unigram only

In [29]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = TfidfVectorizer(min_df=5)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

Split done - X_train shape: (23244,), X_test shape: (9963,), y_train shape: (23244,), y_test shape: (9963,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed


In [35]:
#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

X_test_transformed


# BernoulliNB with TF-IDF, unigram

- Train acc: 58.2 percent
- Test acc: 54.78 percent


In [32]:
#instantiate and fit model
model = BernoulliNB()
model.fit(X_train_transformed, y_train)
#model.score(X_test_transformed, y_test)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [33]:
#score train
model.score(X_train_transformed, y_train)

0.5819996558251592

In [36]:
#score test
model.score(X_test_transformed, y_test)

0.5475258456288267

# CountVectorizer Bagofwords, unigram

In [37]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = CountVectorizer(min_df=5)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

Split done - X_train shape: (23244,), X_test shape: (9963,), y_train shape: (23244,), y_test shape: (9963,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed


# BernoulliNB with CountVect
- Train acc: 74 percent
- Test acc: 60.7 percent

In [40]:
model = BernoulliNB()
model.fit(X_train_transformed, y_train)
display(model.score(X_train_transformed, y_train))
display(model.score(X_test_transformed, y_test))

0.740234038891757

0.6069456990866205

# CountVectorizer with unigram, bigram, and trigram

In [39]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = CountVectorizer(min_df=5, ngram_range=(1,3))
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

Split done - X_train shape: (23244,), X_test shape: (9963,), y_train shape: (23244,), y_test shape: (9963,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed


# BernoulliNB with CountVectorizer, unigram, bigram, and trigram
- Train acc: 74 percent
- Test acc: 60.7 percent

In [41]:
model = BernoulliNB()
model.fit(X_train_transformed, y_train)
display(model.score(X_test_transformed, y_test))
display(model.score(X_train_transformed, y_train))

0.6069456990866205

0.740234038891757

# CountVectorizer with unigram, bigram, and trigram, and max_features=10000 and StandardScaler

In [None]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create vectorizer
bagofwords = CountVectorizer(min_df=5, ngram_range=(1,3), max_features=10000)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

Split done - X_train shape: (23244,), X_test shape: (9963,), y_train shape: (23244,), y_test shape: (9963,)
vectorizer done
beginng vectorizer fitting


In [33]:
#fit and transform with standardscaler
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train_transformed)
X_train_transformed = scaler.transform(X_train_transformed)
X_test_transformed = scaler.transform(X_test_transformed)

# Logistic Regression with CountVect, unigram, bigram, trigram, max_features=10,000 and standard scalling
- Train acc: 63.9 percent
- Test acc: 59.6 percent

In [35]:
print('creating model')
model = LogisticRegression(C=.01, solver='saga', max_iter=10000)
print('model completed')


#fit model
print('fitting model')
model.fit(X_train_transformed, y_train)
print('model fitted')

#score training set 
print('scoring training data')
train_score = model.score(X_train_transformed, y_train)

#score test set
print('scoring test data')
test_score = model.score(X_test_transformed, y_test)

print(f'Training score: {train_score}')
print(f'Test score: {test_score}')

creating model
model completed
fitting model
model fitted
scoring training data
scoring test data
Training score: 0.6394768542419549
Test score: 0.5956037338151159
