# Spooky Authorship
## Team 9
### Corey Lovette, Jason Lerner, Jason Perkins, Savitha Samudrala

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (reuses an already-running one if
# present, otherwise starts a new one) under the application name below.
spark = (
    SparkSession.builder
    .appName("CSV Loader")
    .getOrCreate()
)



In [3]:
# Load the training and test sets from CSV files in the working directory.
#
# The `text` column holds free-form sentences that contain commas and literal
# double quotes (e.g. row id16737 starts with a `"`). Standard CSV escapes an
# embedded quote by doubling it (`""`), but Spark's CSV reader defaults to a
# backslash escape character, so such fields can be split at an inner comma
# and spill into the wrong column. Setting escape to `"` makes the reader
# treat doubled quotes as an escaped quote.
# NOTE(review): assumes the files use RFC 4180-style quote doubling -- confirm
# against the raw CSV.
csv_options = dict(header=True, inferSchema=True, quote='"', escape='"')

df_train = spark.read.csv("train.csv", **csv_options)
print("\nTrain")
df_train.show()

df_test = spark.read.csv("test.csv", **csv_options)
print("\nTest")
df_test.show()


Train
+-------+--------------------+------+
|     id|                text|author|
+-------+--------------------+------+
|id26305|This process, how...|   EAP|
|id17569|It never once occ...|   HPL|
|id11008|In his left hand ...|   EAP|
|id27763|How lovely is spr...|   MWS|
|id12958|Finding nothing e...|   HPL|
|id22965|A youth passed in...|   MWS|
|id09674|The astronomer, p...|   EAP|
|id13515|The surcingle hun...|   EAP|
|id19322|I knew that you c...|   EAP|
|id00912|I confess that ne...|   MWS|
|id16737|"He shall find th...|   MWS|
|id16607|Here we barricade...|   EAP|
|id19764|Herbert West need...|   HPL|
|id18886|The farm like gro...|   HPL|
|id17189|But a glance will...|   EAP|
|id12799|He had escaped me...|   MWS|
|id08441|To these speeches...|   EAP|
|id13117|Her native sprigh...|   MWS|
|id14862|I even went so fa...|   EAP|
|id20836|His facial aspect...|   HPL|
+-------+--------------------+------+
only showing top 20 rows


Test
+-------+--------------------+
|     id|         

In [7]:
# check for null
from pyspark.sql.functions import col, isnan, when, count


def missing_count(name, dtype):
    """Build an aggregate column counting the missing values of one column.

    A value is counted as missing when it is NULL, or -- for float/double
    columns only -- when it is NaN. `isnan` is not applied to other types:
    on a string column it forces an implicit string-to-double cast, which
    cannot yield NaN for ordinary text and is expected to raise under ANSI
    SQL mode.

    Parameters:
        name:  column name.
        dtype: the column's type string as reported by `DataFrame.dtypes`.

    Returns:
        A Column aliased to `name` holding the number of missing rows.
    """
    is_missing = col(name).isNull()
    if dtype in ("float", "double"):
        is_missing = is_missing | isnan(col(name))
    return count(when(is_missing, name)).alias(name)


# One row, one count per column of the training frame.
df_train.select([missing_count(name, dtype) for name, dtype in df_train.dtypes]).show()


+---+----+------+
| id|text|author|
+---+----+------+
|  0|   0|     0|
+---+----+------+



In [12]:
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.feature import Normalizer
from pyspark.ml import Pipeline

# Step 1: Tokenization -- derive a "tokens" array column from "text".
# NOTE(review): the displayed output contains tokens such as "process," with
# punctuation still attached, so stop words next to punctuation survive
# Step 2. Consider a regex-based tokenizer if that is not intended.
tokenizer = Tokenizer().setInputCol("text").setOutputCol("tokens")
df_train_tokenized = tokenizer.transform(df_train)

# Step 2: Stop word removal -- Spark's default English list plus a handful of
# personal pronouns, applied to "tokens" to produce "filtered_tokens".
stopwords = [
    *StopWordsRemover.loadDefaultStopWords("english"),
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her',
]
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered_tokens",
    stopWords=stopwords,
)
df_train_filtered = remover.transform(df_train_tokenized)

# Preview the tokenized and filtered columns side by side.
df_train_filtered.show()

+-------+--------------------+------+--------------------+--------------------+
|     id|                text|author|              tokens|     filtered_tokens|
+-------+--------------------+------+--------------------+--------------------+
|id26305|This process, how...|   EAP|[this, process,, ...|[process,, howeve...|
|id17569|It never once occ...|   HPL|[it, never, once,...|[never, occurred,...|
|id11008|In his left hand ...|   EAP|[in, his, left, h...|[left, hand, gold...|
|id27763|How lovely is spr...|   MWS|[how, lovely, is,...|[lovely, spring, ...|
|id12958|Finding nothing e...|   HPL|[finding, nothing...|[finding, nothing...|
|id22965|A youth passed in...|   MWS|[a, youth, passed...|[youth, passed, s...|
|id09674|The astronomer, p...|   EAP|[the, astronomer,...|[astronomer,, per...|
|id13515|The surcingle hun...|   EAP|[the, surcingle, ...|[surcingle, hung,...|
|id19322|I knew that you c...|   EAP|[i, knew, that, y...|[knew, say, 'ster...|
|id00912|I confess that ne...|   MWS|[i,