In [None]:
# Loading Dependencies
from path import Path
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load VADER: one shared analyzer instance, reused by the sentiment-scoring cells below.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Create SQL engine
# The URL 'sqlite://' has no file path, which selects an in-memory SQLite database:
# tables written through this engine do not survive a kernel restart.

from sqlalchemy import create_engine
engine = create_engine('sqlite://', echo=False)

In [None]:
# Loading Data: read the raw Reddit CSV into a dataframe and preview the first rows.
data = Path('Resources/reddit.csv')
reddit_df = pd.read_csv(data)
reddit_df.head()

In [None]:
# Checking DTypes: show the dtype pandas assigned to each column when reading the CSV.
reddit_df.dtypes

In [None]:
# Count how many posts come from each subreddit
subreddit = reddit_df["subreddit"].value_counts()
subreddit

In [None]:
# Count how often each distinct post body occurs
body = reddit_df["body"].value_counts()
body

In [None]:
# Write the raw reddit_df dataframe to a new SQL table. The table is replaced on each
# run until we decide how new CSV data should be appended in the future.
from sqlalchemy import text

reddit_df.to_sql('reddit_raw_db', con=engine, if_exists='replace')

# Engine.execute() was removed in SQLAlchemy 2.0, so the preview query runs on an
# explicit connection with a text() statement (this form also works on 1.x).
with engine.connect() as conn:
    raw_preview = conn.execute(text("SELECT * FROM reddit_raw_db")).fetchmany(size=20)

# Show the first 20 stored rows to confirm the write.
raw_preview

In [None]:
# Discard the 'body' column and preview what is left.
reddit_df = reddit_df.drop(columns=['body'])
reddit_df.head()

In [None]:
# Remove rows that have at least 1 null value.
# dropna() returns a new dataframe rather than modifying reddit_df, so the result has
# to be assigned back; otherwise the rows with nulls stay in the data used below.
reddit_df = reddit_df.dropna()
reddit_df

In [None]:
# Checking Sentiment Scores
def sentiment_analyzer_scores(sentence):
    """Print `sentence`, dash-padded to 40 characters, followed by its VADER polarity scores."""
    polarity = analyzer.polarity_scores(sentence)
    print(f"{sentence:-<40} {polarity}")


# Quick sanity check on a single example title.
sentiment_analyzer_scores('UPVOTE so everyone sees we got SUPPORT')

In [None]:
# Add VADER metrics to dataframe.
# Each title is scored once and the result is fanned out into the four columns,
# instead of re-running the analyzer separately for every metric.
title_scores = [analyzer.polarity_scores(title) for title in reddit_df['title']]

# Columns are added in the same order as before: compound, neg, neu, pos.
for metric in ('compound', 'neg', 'neu', 'pos'):
    reddit_df[metric] = [scores[metric] for scores in title_scores]

reddit_df.head()

In [None]:
# Save the VADER-scored dataframe to CSV (with default options the index is written as well).
reddit_df.to_csv('Resources/reddit2.csv')

In [None]:
# Group the rows by subreddit so per-subreddit statistics can be computed.
reddit_groups = reddit_df.groupby("subreddit")

In [None]:
# Grouping Vader Scores for each Subreddit
# numeric_only=True limits the mean to numeric columns. Text columns such as 'title'
# cannot be averaged, and recent pandas versions raise a TypeError instead of
# silently dropping them.
reddit_groups.mean(numeric_only=True)

In [None]:
# Drop every post from the "stocks" subreddit.
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# later assignment to reddit_df['subreddit'] does not raise a SettingWithCopyWarning.
reddit_df = reddit_df[reddit_df["subreddit"] != "stocks"].copy()
reddit_df.tail()

In [None]:
# Encode the subreddit as a binary label: 1 for wallstreetbets, 0 for anything else.
is_wsb = reddit_df['subreddit'].eq("wallstreetbets")
reddit_df['subreddit'] = is_wsb.astype('int64')

In [None]:
# Preview the first rows to check the 0/1 subreddit encoding.
reddit_df.head()

In [None]:
# Preview the last rows as well.
reddit_df.tail()

In [None]:
# Write the cleaned, VADER-scored reddit_df dataframe to a new SQL table. The table is
# replaced on each run until we decide how new CSV data should be appended in the future.
from sqlalchemy import text

reddit_df.to_sql('reddit_cleaned_with_VADER_db', con=engine, if_exists='replace')

# Engine.execute() was removed in SQLAlchemy 2.0, so the preview query runs on an
# explicit connection with a text() statement (this form also works on 1.x).
with engine.connect() as conn:
    cleaned_preview = conn.execute(
        text("SELECT * FROM reddit_cleaned_with_VADER_db")
    ).fetchmany(size=20)

# Show the first 20 stored rows to confirm the write.
cleaned_preview

In [None]:
# Load the cleaned, VADER-scored table back out of the database for the ML model
cleaned_table = "reddit_cleaned_with_VADER_db"
reddit_cleaned_with_VADER_df = pd.read_sql(cleaned_table, con=engine)

In [None]:
# Preview the dataframe that was read back from the database.
reddit_cleaned_with_VADER_df.head()

In [None]:
# Create combined_db with JOIN via SQL
from sqlalchemy import text

# Engine.execute() was removed in SQLAlchemy 2.0. engine.begin() opens a connection
# and commits the DROP/CREATE statements when the block exits (also valid on 1.x).
with engine.begin() as conn:
    # Rebuild the table from scratch on every run.
    conn.execute(text("DROP TABLE IF EXISTS combined_db"))
    # Join the raw and the cleaned tables on the post title.
    conn.execute(text(
        "CREATE TABLE combined_db AS SELECT * FROM reddit_raw_db JOIN reddit_cleaned_with_VADER_db ON reddit_raw_db.title = reddit_cleaned_with_VADER_db.title"
    ))
    combined_preview = conn.execute(text("SELECT * FROM combined_db")).fetchmany(size=20)

# Show the first 20 joined rows.
combined_preview

In [None]:
# Build the target (the binary subreddit label) and the feature matrix (everything
# that is left once the non-feature columns are removed)
non_feature_columns = ["index", "subreddit", "title", "url", "date", "score", "num_comments"]
y = reddit_cleaned_with_VADER_df["subreddit"]
X = reddit_cleaned_with_VADER_df.drop(columns=non_feature_columns)

In [None]:
# Split into train and test sets: seeded, and stratified on the label
from sklearn.model_selection import train_test_split

split_options = {"random_state": 1, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train.shape

In [None]:
# Class balance of the target (1 = wallstreetbets, 0 = any other subreddit).
y.value_counts()

In [None]:
# Feature scaling is not needed while `score` and `num_comments` are excluded from X, so the scaler below is left commented out.
# create scaler instances
#scaler = StandardScaler()

# fit scaler
#X_scaler = scaler.fit(X_train)

# scale data
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Logistic regression classifier with a fixed seed
from sklearn.linear_model import LogisticRegression

logreg_params = dict(solver='lbfgs', max_iter=200, random_state=1)
classifier = LogisticRegression(**logreg_params)

In [None]:
# Fit the logistic regression on the unscaled training features.
classifier.fit(X_train, y_train)
#classifier.fit(X_train_scaled, y_train)

In [None]:
# Predict on the test set and put predictions next to the true labels
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

In [None]:
# Accuracy of the logistic regression on the test set
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

# Optimization Attempt #1 - Add Additional Hidden Layers

In [None]:
# Number of feature columns in the training set (used as the network's input size below).
#X_train_scaled.shape[1]
X_train.shape[1]


In [None]:
# Network 1: three ReLU hidden layers (80 -> 30 -> 20) feeding one sigmoid output unit
number_input_features = X_train.shape[1]
hidden_nodes_layer1, hidden_nodes_layer2, hidden_nodes_layer3 = 80, 30, 20

nn = tf.keras.models.Sequential([
    # first hidden layer (also declares the input size)
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'),
    # second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'),
    # third hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='relu'),
    # output layer
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

# print the layer-by-layer structure
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the network on the unscaled training data for 100 epochs
fit_model = nn.fit(x=X_train, y=y_train, epochs=100)

In [None]:
# Score the trained network on the held-out test data
test_metrics = nn.evaluate(X_test, y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Optimization Attempt #2: Adding Neurons in Layers & tanh for relu

In [None]:
# Network 2: wider hidden layers (90 -> 40 -> 30) with tanh, one sigmoid output unit
number_input_features = X_train.shape[1]
hidden_nodes_layer1, hidden_nodes_layer2, hidden_nodes_layer3 = 90, 40, 30

nn = tf.keras.models.Sequential([
    # first hidden layer (also declares the input size)
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='tanh'),
    # second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='tanh'),
    # third hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='tanh'),
    # output layer
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

# print the layer-by-layer structure
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the network on the training data for 63 epochs
fit_model = nn.fit(x=X_train, y=y_train, epochs=63)

In [None]:
# Score the trained network on the held-out test data
test_metrics = nn.evaluate(X_test, y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

### Optimization Attempt #3: Change Activation Function w/ Callback

In [None]:
# define checkpoint path and filenames
import os
# Create the checkpoint directory; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs("checkpoints/",exist_ok=True)
# "{epoch:02d}" is a placeholder; this template is handed to ModelCheckpoint as `filepath`,
# which substitutes the epoch number when it writes a file.
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [None]:
# define model: two ReLU hidden layers (80 -> 30) and a single output unit
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30
hidden_nodes_layer3 = 20  # not used by this architecture; kept so the name stays defined


nn = tf.keras.models.Sequential()

# first hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))

# second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# output layer
# The model is trained with binary cross-entropy, which expects a probability in
# [0, 1]. tanh emits values in [-1, 1], so negative outputs make the loss and the
# 0.5-threshold accuracy meaningless; sigmoid keeps the output a valid probability.
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# check structure of model
nn.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# compile model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# An integer save_freq is counted in training batches, not epochs. To really save
# every 5 epochs, convert the epoch interval into the matching number of batches.
batch_size = 32  # Keras' default, made explicit so the arithmetic below matches fit()
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))

# create callback that saves weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5 * steps_per_epoch)

# train model
fit_model = nn.fit(X_train, y_train, batch_size=batch_size, epochs=100, callbacks=[cp_callback])

# evaluate model using test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")