# Neural Net Regression and Neural Net with TF-IDF

- V3_Data: Target: Brandwatch Sentiment, Row = Daily articles, Time: 2018 - 2020


# Imports and Installs

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#import statements
from tensorflow.keras import regularizers
import scipy
from sklearn.metrics import r2_score

from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from tensorflow.keras.layers.experimental import preprocessing

import matplotlib.pyplot as plt

%matplotlib inline

import numpy as np
from sklearn.model_selection import train_test_split


!pip install tensorflow-hub
#!pip install tensorflow-datasets
import tensorflow_hub as hub

import tensorflow as tf

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")


# Read data and Transform

In [3]:
#Read in v3 text data
text = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/master_data_set/text_with_tokens_52k.csv')

In [4]:
#filter for date and tokens
text = text[['date', 'text_token']]

In [5]:
#check data
text.head()

Unnamed: 0,date,text_token
0,2015-03-02,"['answer', 'resounding', 'myriad', 'claim', 'e..."
1,2015-03-02,"['hear', 'sen.', 'james', 'inhofe', 'r', 'okla..."
2,2015-03-02,"['mary', 'bowerman', 'usa', 'today', 'network'..."
3,2015-03-02,"['mr.', 'fridman', 'business', 'track', 'recor..."
4,2015-03-02,"['climate', 'change', 'spark', 'historic', 'dr..."


In [8]:
#convert date to datetime object
text['date'] = pd.to_datetime(text['date'])

#create day groupby object
grouped_text = text.groupby([text['date'].dt.year, text['date'].dt.month, text['date'].dt.day])

#aggregating tokens by day
text_day_grouped = grouped_text['text_token'].agg(lambda column: "".join(column))

#set as df
text_day_grouped = pd.DataFrame(text_day_grouped)

#rename index
text_day_grouped = text_day_grouped.rename_axis(index=['year', 'month', 'day'])

#reset_index
text_day_grouped = text_day_grouped.reset_index()

In [14]:
text_day_grouped

Unnamed: 0,year,month,day,text_token
0,2015,3,2,"['answer', 'resounding', 'myriad', 'claim', 'e..."
1,2015,3,3,"['scientist', 'center', 'controversy', 'fossil..."
2,2015,3,4,"['scientist', 'step', 'closer', 'understand', ..."
3,2015,3,5,"['high', 'blessed', 'relief', 'finally', 'pres..."
4,2015,3,6,"['california', 'lead', 'nation', 'take', 'acti..."
...,...,...,...,...
1857,2020,10,3,"['calistoga', 'california', 'california', 'fir..."
1858,2020,10,4,"['خطر', 'الإنفلونزا', 'قد', 'يكون', 'أقل', 'هذ..."
1859,2020,10,5,"['london', 'thomson', 'reuters', 'foundation',..."
1860,2020,10,6,"['1', 'president', 'trump', 'americans', 'afra..."


In [15]:
#creates grouping column
text_day_grouped['date_grouped'] = pd.to_datetime(text_day_grouped[['year', 'month', 'day']])

In [16]:
#read in brandwatch sentiment data
sentiment = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/brandwatch/bw_sentiment_emotion_day/bw_sentiment_2018-2020.csv')

In [17]:
#check data
sentiment.head()

Unnamed: 0.1,Unnamed: 0,days,sentiment
0,1,2018-10-05,-1.119873
1,2,2018-10-06,-0.847089
2,3,2018-10-07,-1.485399
3,4,2018-10-08,-0.894346
4,5,2018-10-09,-0.762045


In [18]:
#drop extra columns
sentiment.drop('Unnamed: 0', axis=1, inplace=True)

In [19]:
#check data
sentiment.head()

Unnamed: 0,days,sentiment
0,2018-10-05,-1.119873
1,2018-10-06,-0.847089
2,2018-10-07,-1.485399
3,2018-10-08,-0.894346
4,2018-10-09,-0.762045


In [20]:
#convert days to datetime object

sentiment['days'] = pd.to_datetime(sentiment['days'])

In [21]:
#merges sentiment and text data
x_y_complete = sentiment.merge(text_day_grouped, how='inner',  left_on='days', right_on='date_grouped')

In [22]:
#check data
x_y_complete.head()

Unnamed: 0,days,sentiment,year,month,day,text_token,date_grouped
0,2018-10-05,-1.119873,2018,10,5,"['kuala', 'lumpur', 'oct', '4', 'thomson', 're...",2018-10-05
1,2018-10-06,-0.847089,2018,10,6,"['past', 'couple', 'week', 'see', 'mr.', 'trum...",2018-10-06
2,2018-10-07,-1.485399,2018,10,7,"['couple', 'contact', 'december', '2016', 'was...",2018-10-07
3,2018-10-08,-0.894346,2018,10,8,"['cheltenham', 'england', 'thomson', 'reuters'...",2018-10-08
4,2018-10-09,-0.762045,2018,10,9,"['stockholm', 'reuters', 'americans', 'william...",2018-10-09


In [23]:
#filter datafram
x_y = x_y_complete[['days', 'text_token','sentiment']]

In [25]:
#check sentiment distribution
x_y.describe()

Unnamed: 0,sentiment
count,551.0
mean,-1.482033
std,0.419732
min,-2.787443
25%,-1.760585
50%,-1.515749
75%,-1.239824
max,0.293132


In [26]:
#binarizing sentiment on -1.48 mean value
x_y['binary_sentiment'] = np.where(x_y['sentiment'] >= -1.52, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
#checking distribution of targets
x_y['binary_sentiment'].sum()

278

# TFIDF Vectorization

In [27]:
#check data
x_y.head()

Unnamed: 0,days,text_token,sentiment,binary_sentiment
0,2018-10-05,"['kuala', 'lumpur', 'oct', '4', 'thomson', 're...",-1.119873,1
1,2018-10-06,"['past', 'couple', 'week', 'see', 'mr.', 'trum...",-0.847089,1
2,2018-10-07,"['couple', 'contact', 'december', '2016', 'was...",-1.485399,1
3,2018-10-08,"['cheltenham', 'england', 'thomson', 'reuters'...",-0.894346,1
4,2018-10-09,"['stockholm', 'reuters', 'americans', 'william...",-0.762045,1


In [30]:
#set X, y
X = x_y['text_token']
y = x_y['binary_sentiment']

In [33]:
#test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y)

In [34]:
#instantiate and fit TF-IDF
tfidf = TfidfVectorizer()
tfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [35]:
#transform data
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

In [36]:
#check shape
X_train.shape

(385, 147494)

In [37]:
#check shape
X_test.shape

(166, 147494)

In [38]:
#check object
X_train[0]

<1x147494 sparse matrix of type '<class 'numpy.float64'>'
	with 4475 stored elements in Compressed Sparse Row format>

# Neural Net with TF-IDF

- Epoch 10/10
39/39 [==============================] - 1s 13ms/step - loss: 4.0649 - accuracy: 0.5039 - val_loss: 3.1300 - val_accuracy: 0.5060

In [77]:
#convert TF-DF sparse to dense matrix
X_train = scipy.sparse.csr_matrix.todense(X_train)
X_test = scipy.sparse.csr_matrix.todense(X_test)

In [74]:
X_train.shape

(385, 147494)

In [83]:
#define and compile model
model = tf.keras.Sequential()
#model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
#model.add(text_vectorizer)
model.add(tf.keras.layers.Dense(32, input_shape=(X_train.shape[1],), activation='relu', kernel_regularizer=regularizers.l2(.1)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(32, input_shape=(X_train.shape[1],), activation='relu', kernel_regularizer=regularizers.l2(.1)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(y_train.nunique(), activation='sigmoid'))



model.compile(optimizer='adam',
             loss=tf.keras.losses.SparseCategoricalCrossentropy(),
             metrics=['accuracy'])

In [81]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 32)                4719840   
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 32)                1056      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 33        
Total params: 4,720,929
Trainable params: 4,720,929
Non-trainable params: 0
_________________________________________________________________


In [84]:
#train model
history = model.fit(X_train, y_train, epochs=10, verbose=1, batch_size=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Neural Net Regression

- Attempts to build model on actual brandwatch sentiment score (as opposed to binarized sentiment)
- r-squared: -0.001762950202293334 (horrible :)

In [70]:
#set new X and Y
x_y.head()
X= x_y['text_token']
y = x_y['sentiment']

In [None]:
#test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y)

In [81]:
#define and compile model
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1))



model.compile(optimizer='adam',
             loss=tf.keras.losses.MeanSquaredError(),
            )

In [82]:
#fit model
history = model.fit(X_train, y_train, epochs=50, verbose=1, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [83]:
#predict and calculate r-squared
y_pred = model.predict(X_test)
r2_score(y_test,y_pred)

-0.001762950202293334