In [1]:
import os
src_path = os.getcwd()

In [2]:
from pyspark.context import SparkContext
sc = SparkContext('local', 'test')

In [3]:
from pyspark.sql import SQLContext
sql = SQLContext(sc)

In [4]:
from pyspark.sql.functions import lit
dem_df = sql.read.text('file://'+src_path+'/dem.txt')
gop_df = sql.read.text('file://'+src_path+'/gop.txt')
corpus_df = dem_df.select('value', lit(1).alias('label')).union(gop_df.select('value', lit(0).alias('label')))

In [5]:
train_df, test_df = corpus_df.randomSplit([0.75, 0.25])

In [6]:
corpus_df.select("*").limit(3).show()

+--------------------+-----+
|               value|label|
+--------------------+-----+
|A very merry Chri...|    1|
|Stay safe and pro...|    1|
|RT @ossoff: We ca...|    1|
+--------------------+-----+



In [7]:
from pyspark.ml.feature import StopWordsRemover
stop_words = [word.strip() for word in open(src_path + '/stop_words.txt').readlines()]

In [9]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import CountVectorizer, Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol='value', outputCol='words')
stop_words_remover = StopWordsRemover(inputCol='words', outputCol='words_cleaned')
vectorizer = CountVectorizer(inputCol='words_cleaned', outputCol='features')
logistic_regression = LogisticRegression(featuresCol='features', labelCol='label')

In [10]:
from pyspark.ml import Pipeline
cleaning_pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer])
cleaning_pipeline_model = cleaning_pipeline.fit(corpus_df)

cleaned_training_df = cleaning_pipeline_model.transform(train_df)
cleaned_testing_df = cleaning_pipeline_model.transform(test_df)

cleaned_training_df.show(n=5)

+--------------------+-----+--------------------+--------------------+--------------------+
|               value|label|               words|       words_cleaned|            features|
+--------------------+-----+--------------------+--------------------+--------------------+
|.@DebHaalandNM ha...|    1|[.@debhaalandnm, ...|[.@debhaalandnm, ...|(2616,[22,64,94,2...|
|.@DenisMcDonough ...|    1|[.@denismcdonough...|[.@denismcdonough...|(2616,[11,21,50,8...|
|.@JanetYellen is ...|    1|[.@janetyellen, i...|[.@janetyellen, o...|(2616,[22,33,61,9...|
|.@JoeBiden and @K...|    1|[.@joebiden, and,...|[.@joebiden, @kam...|(2616,[7,15,30,31...|
|.@JoeBiden and @K...|    1|[.@joebiden, and,...|[.@joebiden, @kam...|(2616,[4,7,10,15,...|
+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
logistic_model = logistic_regression.fit(cleaned_training_df)
predictions_df = logistic_model.transform(cleaned_testing_df)
predictions_df.select('features', 'label', 'prediction').limit(3).show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(2616,[4,7,8,29,3...|    1|       1.0|
|(2616,[7,8,30,50,...|    1|       1.0|
|(2616,[7,30,42,44...|    1|       1.0|
+--------------------+-----+----------+



In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')
evaluator.evaluate(predictions_df)

0.9247311827956989