In [None]:
####  Install Java e PySpark (run it only once)

!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark -q

In [None]:
#### Java environment

import os

# Point JAVA_HOME at the OpenJDK 11 directory for the processes started from this kernel.
os.environ.update({"JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64"})

In [None]:
#### Starting Spark Session

from pyspark.sql import SparkSession

# getOrCreate() returns the already-active session when there is one,
# otherwise it starts a new session registered under this application name.
session_builder = SparkSession.builder.appName("FakeNewsFromDrive")
spark = session_builder.getOrCreate()

# Last expression of the cell: renders the session summary as the cell output.
spark

In [None]:
#### Google Drive as the project repository (Colab development only).
#### Ask Antonio if you don't know how to run this locally. This cell will be adapted for the Databricks delivery.

from google.colab import drive

# Folder under which the Drive contents become visible to the notebook.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#### Project folder (input CSVs are read from here)

import os

# Defaults to the shared Google Drive folder used on Colab. Set the
# FAKE_NEWS_PROJECT_PATH environment variable to point the notebook at another
# location (local run, Databricks) without editing any cell.
project_path = os.environ.get(
    "FAKE_NEWS_PROJECT_PATH", "/content/drive/MyDrive/fake_news_project"
)
print(f"Using path: {project_path}")

Using path: /content/drive/MyDrive/fake_news_project


In [None]:
#### Reading and joining CSVs & setting fake or real label

from pyspark.sql.functions import lit

# Both files are read with the same settings: first row is the header, column types are inferred.
# NOTE(review): the reader otherwise uses Spark's default CSV parsing; if article bodies
# contain embedded newlines or doubled quotes, confirm the rows are parsed intact.
csv_options = dict(header=True, inferSchema=True)

# label = 0 marks rows from fake.csv, label = 1 marks rows from real.csv.
df_fake = spark.read.csv(f"{project_path}/fake.csv", **csv_options).withColumn("label", lit(0))
df_real = spark.read.csv(f"{project_path}/real.csv", **csv_options).withColumn("label", lit(1))

# Stack the two sources by column name, keep only the model inputs,
# and drop any row that has a null in either of them.
df = (
    df_fake.unionByName(df_real)
    .select("text", "label")
    .na.drop()
)

df.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Donald Trump just...|    0|
|House Intelligenc...|    0|
|On Friday, it was...|    0|
|On Christmas day,...|    0|
|Pope Francis used...|    0|
+--------------------+-----+
only showing top 5 rows



In [None]:
#### Preprocessing and split (train, test)

from pyspark.sql.functions import lower, regexp_replace

## Remove every character that is not an ASCII letter or whitespace, then lowercase the text.
## The pattern is a raw string so that `\s` reaches the regex engine untouched: in a plain
## literal, `\s` is an invalid Python escape sequence (a warning today, an error in a
## future Python release). The resulting pattern value is unchanged.
df = df.withColumn("text", lower(regexp_replace("text", r"[^a-zA-Z\s]", "")))

## Random split with weights 0.8 / 0.2; the fixed seed keeps the split repeatable.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
#### RandomForest with Cross Validation Pipeline

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## Seed for the stochastic steps in this cell (forest training and fold assignment).
## Without an explicit seed these are not guaranteed to repeat across kernel restarts,
## so the reported metrics could not be reproduced.
SEED = 42

## Text Mining
tokenizer = Tokenizer(inputCol="text", outputCol="words") ## Tokenizing (text to words list)
remover = StopWordsRemover(inputCol="words", outputCol="filtered") ## Removing Stop Words from list
tf = HashingTF(inputCol="filtered", outputCol="rawFeatures") ## Hashing vector
idf = IDF(inputCol="rawFeatures", outputCol="features") ## weight adjustment

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20, seed=SEED)

## All feature stages live inside the pipeline, so they are re-fitted on each training fold.
pipeline = Pipeline(stages=[tokenizer, remover, tf, idf, rf])

## 2 x 2 grid: hashing dimension x tree depth -> 4 candidate configurations.
paramGrid = ParamGridBuilder() \
    .addGrid(tf.numFeatures, [1000, 5000]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

## 3-fold cross validation over the grid, scored by area under the ROC curve.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=SEED)

## Fit on the training split only; the held-out test split is used just for the final score.
cv_model = cv.fit(train_data)
predictions = cv_model.transform(test_data)

auc = evaluator.evaluate(predictions)
print(f"Random Forest AUC + Cross Validation: {auc:.4f}")

Random Forest AUC + Cross Validation: 0.9954


In [None]:
#### Accuracy evaluation

## Count the scored rows first so that an empty prediction set is reported
## explicitly instead of failing with ZeroDivisionError.
total_preds = predictions.count()

if total_preds == 0:
    ## Keep both names defined for any later cell that reads them.
    correct_preds = 0
    accuracy = float("nan")
    print("No predictions to evaluate: the prediction set is empty.")
else:
    ## Rows where the predicted class equals the true label.
    correct_preds = predictions.filter("label = prediction").count()
    accuracy = correct_preds / total_preds
    print(f"✅ Accuracy: {accuracy:.4f}")

✅ Acuraccy: 0.9724


In [None]:
#### Saving best model

## Model location inside the project folder. It is derived from project_path
## rather than repeating the Drive folder literal, so the two cannot drift apart.
model_path = f"{project_path}/best_model"

## Persist only the best pipeline selected by cross validation (cv_model.bestModel),
## overwriting any model previously saved at this path.
cv_model.bestModel.write().overwrite().save(model_path)

print("Best Model saved:", model_path)

Best Model saved: /content/drive/MyDrive/fake_news_project/best_model
