In [None]:
#install required dependencies
!pip install -q pyspark  # Installs PySpark library
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null  # Installs Java 8, required for Spark (Google Colab only, remove if not in Colab)

#download and extract Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz  #downloads Spark 3.3.0 (Google Colab only, remove if not in Colab)
!tar xf spark-3.3.0-bin-hadoop3.tgz  #extracts Spark tarball (Google Colab only, remove if not in Colab)

#set environment variables for Java and Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"  #sets Java path (Google Colab only, adjust or remove if not in Colab)
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"  #sets Spark path (Google Colab only, adjust or remove if not in Colab)

#initialize Spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Colab Spark App") \
    .getOrCreate()

#verify Spark session
spark

In [None]:
# Mount Google Drive to access files.
# If google.colab cannot be imported (i.e. the notebook is not running in
# Google Colab), the mount is skipped instead of aborting the run, so this
# cell no longer has to be removed by hand for local environments.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available; skipping Google Drive mount")
else:
    drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

# Directory that holds train.csv / test.csv: the mounted Google Drive folder
# when it exists (Google Colab), otherwise the current working directory
# (local environments). This replaces editing the paths by hand.
DRIVE_DATA_DIR = "/content/drive/MyDrive"
DATA_DIR = DRIVE_DATA_DIR if os.path.isdir(DRIVE_DATA_DIR) else "."


def load_csv(file_name):
    """Read ``file_name`` from DATA_DIR into a Spark DataFrame.

    The first row is used as the header and column types are inferred.
    """
    # NOTE(review): if the text column contains doubled quotes (""), the CSV
    # reader may need escape='"' — confirm against the raw file.
    return spark.read.csv(os.path.join(DATA_DIR, file_name), header=True, inferSchema=True)


df_train = load_csv("train.csv")
print("\nTrain")
df_train.show()

df_test = load_csv("test.csv")
print("\nTest")
df_test.show()


Train
+-------+--------------------+------+
|     id|                text|author|
+-------+--------------------+------+
|id26305|This process, how...|   EAP|
|id17569|It never once occ...|   HPL|
|id11008|In his left hand ...|   EAP|
|id27763|How lovely is spr...|   MWS|
|id12958|Finding nothing e...|   HPL|
|id22965|A youth passed in...|   MWS|
|id09674|The astronomer, p...|   EAP|
|id13515|The surcingle hun...|   EAP|
|id19322|I knew that you c...|   EAP|
|id00912|I confess that ne...|   MWS|
|id16737|"He shall find th...|   MWS|
|id16607|Here we barricade...|   EAP|
|id19764|Herbert West need...|   HPL|
|id18886|The farm like gro...|   HPL|
|id17189|But a glance will...|   EAP|
|id12799|He had escaped me...|   MWS|
|id08441|To these speeches...|   EAP|
|id13117|Her native sprigh...|   MWS|
|id14862|I even went so fa...|   EAP|
|id20836|His facial aspect...|   HPL|
+-------+--------------------+------+
only showing top 20 rows


Test
+-------+--------------------+
|     id|         

In [None]:
# Check for missing values: nulls in every column, plus NaN in float/double columns.
from pyspark.sql.functions import col, isnan, when, count

# isnan() is a floating-point test. Applying it to every column (as before)
# depends on implicitly casting non-numeric columns to double, so it is now
# limited to the columns whose dtype is actually float or double.
float_columns = {name for name, dtype in df_train.dtypes if dtype in ("float", "double")}


def missing_condition(column_name):
    """Return a boolean Column that is true where ``column_name`` is null, or NaN for float columns."""
    is_missing = col(column_name).isNull()
    if column_name in float_columns:
        is_missing = is_missing | isnan(col(column_name))
    return is_missing


# One row with, per column, the number of missing entries.
df_train.select([count(when(missing_condition(c), c)).alias(c) for c in df_train.columns]).show()


+---+----+------+
| id|text|author|
+---+----+------+
|  0|   0|     0|
+---+----+------+



In [None]:
from pyspark.ml.feature import StopWordsRemover, Tokenizer, RegexTokenizer
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.feature import Normalizer
from pyspark.ml import Pipeline

# Step 1: Tokenization
# The whitespace-only Tokenizer left punctuation attached to words (tokens such
# as "process," and "astronomer,"), which fragments the vocabulary and lets
# stop words followed by punctuation slip through. RegexTokenizer splits on any
# run of characters that are neither letters nor digits and lowercases tokens.
# NOTE(review): contractions are split at the apostrophe (e.g. "don't" ->
# "don", "t"); confirm the stop word list covers the fragments that matter.
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    pattern="[^\\p{L}\\p{N}]+",
    gaps=True,
    toLowercase=True,
)
df_train_tokenized = tokenizer.transform(df_train)

# Step 2: Stop word removal
# Default English stop words plus explicit pronouns; dict.fromkeys drops any
# duplicates while keeping the original order.
extra_stopwords = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her']
stopwords = list(dict.fromkeys(StopWordsRemover.loadDefaultStopWords("english") + extra_stopwords))
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens", stopWords=stopwords)
df_train_filtered = remover.transform(df_train_tokenized)

# Preview the tokens before and after stop word removal.
df_train_filtered.show()

+-------+--------------------+------+--------------------+--------------------+
|     id|                text|author|              tokens|     filtered_tokens|
+-------+--------------------+------+--------------------+--------------------+
|id26305|This process, how...|   EAP|[this, process,, ...|[process,, howeve...|
|id17569|It never once occ...|   HPL|[it, never, once,...|[never, occurred,...|
|id11008|In his left hand ...|   EAP|[in, his, left, h...|[left, hand, gold...|
|id27763|How lovely is spr...|   MWS|[how, lovely, is,...|[lovely, spring, ...|
|id12958|Finding nothing e...|   HPL|[finding, nothing...|[finding, nothing...|
|id22965|A youth passed in...|   MWS|[a, youth, passed...|[youth, passed, s...|
|id09674|The astronomer, p...|   EAP|[the, astronomer,...|[astronomer,, per...|
|id13515|The surcingle hun...|   EAP|[the, surcingle, ...|[surcingle, hung,...|
|id19322|I knew that you c...|   EAP|[i, knew, that, y...|[knew, say, 'ster...|
|id00912|I confess that ne...|   MWS|[i,

In [None]:
# Stage 2: Feature Extraction (TF-IDF)
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, Normalizer

# Term-frequency stage: turns the filtered tokens into count vectors, with the
# vocabulary capped at 10000 terms and a minimum document frequency of 5.
count_vectorizer = CountVectorizer(
    inputCol="filtered_tokens",
    outputCol="raw_features",
    vocabSize=10000,
    minDF=5,
)

# Inverse-document-frequency stage: rescales the raw term counts.
idf = IDF(inputCol="raw_features", outputCol="tfidf_features")

# Normalization stage: scales each TF-IDF vector using the L2 norm (p=2.0).
normalizer = Normalizer(
    inputCol="tfidf_features",
    outputCol="normalized_features",
    p=2.0,
)

# Full pipeline: the existing tokenizer and stop word remover followed by the
# three feature stages defined above.
feature_stages = [tokenizer, remover, count_vectorizer, idf, normalizer]
pipeline = Pipeline(stages=feature_stages)

# Fit the pipeline on the training data.
pipeline_model = pipeline.fit(df_train)

# Feature columns shown for both data sets.
feature_columns = ["filtered_tokens", "raw_features", "tfidf_features", "normalized_features"]

# Transform the training data and show it with the added feature columns.
df_train_transformed = pipeline_model.transform(df_train)
df_train_transformed.select("id", "text", "author", *feature_columns).show()

# Apply the fitted pipeline to the test data (no author column there) and show it.
df_test_transformed = pipeline_model.transform(df_test)
df_test_transformed.select("id", "text", *feature_columns).show()

+-------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|     id|                text|author|     filtered_tokens|        raw_features|      tfidf_features| normalized_features|
+-------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|id26305|This process, how...|   EAP|[process,, howeve...|(9079,[4,9,33,47,...|(9079,[4,9,33,47,...|(9079,[4,9,33,47,...|
|id17569|It never once occ...|   HPL|[never, occurred,...|(9079,[4,10,231,7...|(9079,[4,10,231,7...|(9079,[4,10,231,7...|
|id11008|In his left hand ...|   EAP|[left, hand, gold...|(9079,[48,87,135,...|(9079,[48,87,135,...|(9079,[48,87,135,...|
|id27763|How lovely is spr...|   MWS|[lovely, spring, ...|(9079,[85,88,420,...|(9079,[85,88,420,...|(9079,[85,88,420,...|
|id12958|Finding nothing e...|   HPL|[finding, nothing...|(9079,[2,67,143,3...|(9079,[2,67,143,3...|(9079,[2,67,143,3...|
|id22965|A youth passed 