In [3]:
#Installing pyspark
!pip install pyspark

#Initializing important libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, when
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

#Initializing spark session
spark = SparkSession.builder \
    .appName("Fake News Classification") \
    .getOrCreate()

#Loading the data
true_df = spark.read.csv('true.csv', header=True, inferSchema=True)
fake_df = spark.read.csv('fake.csv', header=True, inferSchema=True)

#Adding labels
true_df = true_df.withColumn("label", pyspark.sql.functions.lit(1))
fake_df = fake_df.withColumn("label", pyspark.sql.functions.lit(0))

#Combining datasets
data_df = true_df.union(fake_df)

#Droping duplicates
data_df = data_df.dropDuplicates()

#Droping unnecessary columns
data_df = data_df.drop('date')

#Converting to lowercase
data_df = data_df.withColumn('title', lower(col('title')))
data_df = data_df.withColumn('text', lower(col('text')))

#Cleaning thr text
data_df = data_df.withColumn('text', regexp_replace(col('text'), r"http\S+|www.\S+", ""))
data_df = data_df.withColumn('text', regexp_replace(col('text'), r"[^a-zA-Z0-9\s]+", ""))
data_df = data_df.withColumn('text', regexp_replace(col('text'), r'\w*\d\w*', ''))
data_df = data_df.withColumn('text', when(col('text').isNull(), '').otherwise(col('text')))

#Tokenizing the text
tokenizer = Tokenizer(inputCol="text", outputCol="words")
words_data = tokenizer.transform(data_df)

#Removing stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cleaned_data = remover.transform(words_data)

#Computing term frequencies
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
featurized_data = hashingTF.transform(cleaned_data)

#Computing inverse document frequencies
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

#Indexing labels
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaled_data)
data_indexed = label_indexer.transform(rescaled_data)

#Spliting the data
(training_data, test_data) = data_indexed.randomSplit([0.8, 0.2], seed=42)

#Creating and training the RandomForest model
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=100)
model = rf.fit(training_data)

#Making predictions
predictions = model.transform(test_data)

#Evaluating the model
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f'Accuracy: {accuracy}')

#Showing confusion matrix
predictions.groupBy("indexedLabel", "prediction").count().show()

#Calculating results of the method and interpretation of success
evaluator_precision = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictions)
print(f'Precision: {precision}')

evaluator_recall = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictions)
print(f'Recall: {recall}')

evaluator_f1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(predictions)
print(f'F1 Score: {f1}')

#Stop the Spark session
spark.stop()


[31mERROR: Operation cancelled by user[0m[31m
[0mAccuracy: 0.9707462004342361
+------------+----------+-----+
|indexedLabel|prediction|count|
+------------+----------+-----+
|         1.0|       1.0| 4020|
|         0.0|       1.0|  114|
|         1.0|       0.0|  142|
|         0.0|       0.0| 4475|
+------------+----------+-----+

Precision: 0.9707563743265885
Recall: 0.9707462004342361
F1 Score: 0.9707413205221784
