In [15]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("SpookyAuthorIdentification").getOrCreate()

# Load the CSV data into a Spark DataFrame
file_path = "test.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and the first few rows of the Spark DataFrame
df.printSchema()
df.show(5, truncate=False)


root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id     |text                                                                                                                                                                                                                                                                                                                                      |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
from pyspark.ml.feature import Tokenizer

# Tokenize the text column
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
df_tokens = tokenizer.transform(df)

# Show the tokenized text
df_tokens.select("tokens").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tokens                                                                                                                                                                                                                                                                                                                                                                                                   |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
from pyspark.ml.feature import StopWordsRemover

# Customize stopwords list to include pronouns
stopwords = StopWordsRemover.loadDefaultStopWords("english") + ['I', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her']

# Remove stopwords from the tokenized text
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens", stopWords=stopwords)
df_cleaned = remover.transform(df_tokens)

# Show the cleaned tokens
df_cleaned.select("filtered_tokens").show(5, truncate=False)


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filtered_tokens                                                                                                                                                                                                                                  |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[still,, urged, leaving, ireland, inquietude, impatience,, father, thought, best, yield.]                                                                                                                                                        |
|[fire, wanted, fanning,