# **WallStreetBets DD Recommender**
## Notebook for Naive Bayes classifier with CountVectorizer.

In [None]:
# Install the third-party packages with pip (run as notebook shell commands).
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 68kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 39.9MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=25dfc14800e4a1c2764ec88677cddd257b921a676ea5cf4def1e0a3170702a2f
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


### Importing libraries for data processing

In [None]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf,col
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType

# tools
import random

### Importing libraries for Machine learning models

In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
"""
Initialize Spark session object
"""
def init_spark():
    """Build (or fetch the already-running) SparkSession for this notebook.

    Returns:
        The SparkSession produced by ``getOrCreate()`` for the application
        name "Python Spark Naive Bayes CountVectorizer".
    """
    builder = SparkSession.builder.appName("Python Spark Naive Bayes CountVectorizer")
    return builder.getOrCreate()


# Module-level session shared by every cell below.
spark = init_spark()

### Data preprocessing: stop-word removal and feature extraction

In [None]:
# Load the lemmatized dataset produced by WallStreetBets-CreateLemmas.ipynb.
# The first line of the CSV is treated as the header row.
data = spark.read.csv("lemma.csv", header=True)

# Example of the expected file layout:
#   id,label,lemmas
#   ks1tzw,1,all|right|artist|.....
# NOTE(review): the sample header names the third column "lemmas", but the
# code below reads a column called 'text' -- confirm which one is correct.

# Splits a "|"-separated string into an array of lemma strings.
function_array = udf(lambda r: r.split("|"), ArrayType(StringType()))
# Converts the label read from the CSV into an integer.
function_toNumerical = udf(lambda r: int(r), IntegerType())

text_lemmas = (
    data
    .withColumn('finished_lemmas', function_array(col('text')))
    .drop('text')
    .withColumn('label', function_toNumerical(col('label')))
)
print("Number of rows: ", text_lemmas.count())

Number of rows:  1144


In [None]:
# Build the corpus: strip stop words from the lemma arrays.
# No stop-word list is passed, so the remover's default list is used.
# Input column:  'finished_lemmas' (array of lemmas)
# Output column: 'text'            (array of lemmas without stop words)
remover = StopWordsRemover(
    inputCol="finished_lemmas",
    outputCol="text",
)
filtered_df = remover.transform(text_lemmas)

In [None]:
# Create the document-term matrix by count-vectorizing the filtered text.
# The resulting 'features' column holds, per document:
#   (total nb of words, indices of each word in total vocab, count of each word)
to_vectorize = filtered_df.select(['id', 'label', 'text'])

cv = CountVectorizer(
    inputCol="text",
    outputCol="features",
)
model_vec = cv.fit(to_vectorize)
result_vec = model_vec.transform(to_vectorize)

print("Total count of vocabulary:", len(model_vec.vocabulary))

# Columns kept for the classifier cells below.
selectedData = result_vec.select(['id', 'label', 'features', 'text'])

Total count of vocabulary: 10933


### Building Naive Bayes classifier

In [None]:
"""
Define TruePositive, FalsePositive and FalseNegative
x = prediction, y = label
"""
def _confusion_flag(expected_prediction, expected_label):
  # Build a udf(prediction, label) that yields 1 when both values match the
  # expected pair and 0 otherwise. No return type is given, so the udf's
  # default return type applies.
  return udf(lambda x, y: int(x == expected_prediction and y == expected_label))


TP = _confusion_flag(1, 1)  # true positive:  predicted 1, labelled 1
FP = _confusion_flag(1, 0)  # false positive: predicted 1, labelled 0
FN = _confusion_flag(0, 1)  # false negative: predicted 0, labelled 1

In [None]:
def NAIVEBAYES_CV(smooth=1, model_type="multinomial"):
  """Train and evaluate a Naive Bayes classifier on the CountVectorizer features.

  The module-level ``selectedData`` DataFrame is split 70/30 into train and
  test sets separately for label 0 and label 1, and the two halves are then
  unioned. The split is random and unseeded, so every call evaluates a
  different partition.

  Args:
    smooth: value passed as ``smoothing`` to ``NaiveBayes``.
    model_type: value passed as ``modelType`` to ``NaiveBayes``
      (the notebook uses "multinomial" and "complement").

  Returns:
    Tuple ``(accuracy, precision, recall, F1, model_NB)`` where label 1 is
    the positive class. Precision, recall and F1 are reported as 0.0 when
    their denominator is zero (e.g. the model predicted no positives),
    instead of raising ZeroDivisionError.
  """

  def _ratio(numerator, denominator):
    # Safe division: an empty denominator means the metric is undefined,
    # which is reported as 0.0.
    return numerator / denominator if denominator else 0.0

  # separating train/test data, per class
  split_weights = [0.7, 0.3]
  training_zero, test_zero = selectedData.where(selectedData.label == 0).randomSplit(split_weights)
  training_one, test_one = selectedData.where(selectedData.label == 1).randomSplit(split_weights)

  training = training_zero.union(training_one)
  test = test_zero.union(test_one)

  # create trainer with parameters then train
  nb = NaiveBayes(smoothing=smooth, modelType=model_type)
  model_NB = nb.fit(training)

  # score the test set: appends a prediction column
  predictions = model_NB.transform(test)

  # diagnostic testing: count TP / FP / FN in a single aggregation pass
  # using built-in column expressions (no Python UDF round trip).
  predicted_one = F.col("prediction") == 1
  predicted_zero = F.col("prediction") == 0
  labelled_one = F.col("label") == 1
  labelled_zero = F.col("label") == 0

  def _count_where(condition, name):
    return F.sum(F.when(condition, 1).otherwise(0)).alias(name)

  counts = predictions.agg(
      _count_where(predicted_one & labelled_one, "TP"),
      _count_where(predicted_one & labelled_zero, "FP"),
      _count_where(predicted_zero & labelled_one, "FN"),
  ).first()

  # sum() over an empty test set yields null, hence the `or 0`
  TP_ = counts["TP"] or 0
  FP_ = counts["FP"] or 0
  FN_ = counts["FN"] or 0

  precision = _ratio(TP_, TP_ + FP_)
  recall = _ratio(TP_, TP_ + FN_)
  F1 = _ratio(2 * (precision * recall), precision + recall)

  # compute accuracy on the test set: compares labelCol and predictionCol
  evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
  accuracy = evaluator.evaluate(predictions)

  # return test results and model object
  return (accuracy, precision, recall, F1, model_NB)

In [None]:
"""
Examples
"""
def _show_metrics(accuracy_value, precision_value, recall_value, f1_value):
  # Print the four evaluation metrics, one per line.
  for caption, value in (
      ("Accuracy: ", accuracy_value),
      ("Precision: ", precision_value),
      ("Recall: ", recall_value),
      ("F1 Score: ", f1_value),
  ):
    print(caption, value)


# Same smoothing value for both runs so only the model type differs.
example_smoothing = 0.2684835187532758

acc, precision, recall, F1, modelNB = NAIVEBAYES_CV(example_smoothing, "complement")
_show_metrics(acc, precision, recall, F1)
print()
acc, precision, recall, F1, modelNB2 = NAIVEBAYES_CV(example_smoothing, "multinomial")
_show_metrics(acc, precision, recall, F1)

Accuracy:  0.6130790190735694
Precision:  0.589041095890411
Recall:  0.712707182320442
F1 Score:  0.645

Accuracy:  0.625748502994012
Precision:  0.5959595959595959
Recall:  0.7239263803680982
F1 Score:  0.6537396121883657


### Testing different model types with random smoothing values

In [None]:
'''
Iteration tests on Naive-Bayes

iter_total: iterations for different smoothing nb
iter_each: iterations for the same smoothing nb

For every model type, iter_total smoothing values are drawn uniformly from
[0.01, 0.8]; each one is evaluated iter_each times (NAIVEBAYES_CV re-splits
the data randomly on every call) and the mean accuracy / F1 over those
iter_each runs is printed and appended to `means`.
'''
import statistics

extract_method = "CountVectorizer"
iter_each = 10
iter_total = 50
m_types = ["complement", "multinomial"]
accs = []   # every accuracy measured, across all smoothing values and models
f1s = []    # every F1 score measured, across all smoothing values and models
means = []  # one (mean_acc, mean_f1, smoothing, model_type, extract_method) per smoothing value
for model_type in m_types:
  for k in range(iter_total):
    smoothing = random.uniform(0.01, 0.8)
    # Per-smoothing results: reset for each smoothing value so the means
    # below describe this smoothing value only, not all runs so far.
    accuracies = []
    f1_scores = []
    for i in range(iter_each):
      acc, precision, recall, F1, modelNB = NAIVEBAYES_CV(smoothing, model_type)
      accuracies.append(acc)
      f1_scores.append(F1)
    # Keep the global history available for later inspection.
    accs.extend(accuracies)
    f1s.extend(f1_scores)
    mean_acc = statistics.mean(accuracies)
    mean_f1 = statistics.mean(f1_scores)
    print("=> Mean_acc: ", mean_acc," => Mean_f1: ",mean_f1, "- Smoothing:", smoothing, "- Model:", model_type)
    means.append((mean_acc, mean_f1, smoothing, model_type, extract_method))

=> Mean_acc:  0.6079048522164284  => Mean_f1:  0.6393361797046069 - Smoothing: 0.08412955728123993 - Model: complement
=> Mean_acc:  0.6064216016547912  => Mean_f1:  0.6328262146033021 - Smoothing: 0.06817815498039603 - Model: complement
=> Mean_acc:  0.6070998909594324  => Mean_f1:  0.6348180274709192 - Smoothing: 0.5746608805970586 - Model: complement
=> Mean_acc:  0.6061228937081395  => Mean_f1:  0.6364310647236601 - Smoothing: 0.6579209137657336 - Model: complement
=> Mean_acc:  0.6072693347031953  => Mean_f1:  0.6365346408691858 - Smoothing: 0.7500617358310381 - Model: complement
=> Mean_acc:  0.6042366473040159  => Mean_f1:  0.6355815049626126 - Smoothing: 0.32451570157883275 - Model: complement
=> Mean_acc:  0.6049446181875157  => Mean_f1:  0.636565527559283 - Smoothing: 0.3458478280482886 - Model: complement
=> Mean_acc:  0.6070204516748807  => Mean_f1:  0.6380786694401938 - Smoothing: 0.7567512379372225 - Model: complement
=> Mean_acc:  0.6065041827443748  => Mean_f1:  0.63796