In [6]:
!pip install pyspark




In [28]:
# Open Colab's upload widget so the Kaggle credentials file (kaggle.json) can be
# provided interactively; the result of the upload call is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()  # Select the kaggle.json file from your computer


Saving kaggle.json to kaggle.json


In [29]:
import os

# Point the Kaggle CLI at the directory that holds kaggle.json
# (presumably /content/ is the working directory the upload cell saved it to).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/"

# Kaggle's API docs recommend that the credentials file is readable only by its
# owner. Guarded so the cell still runs when the file has not been uploaded yet.
_kaggle_token_path = os.path.join(os.environ['KAGGLE_CONFIG_DIR'], 'kaggle.json')
if os.path.exists(_kaggle_token_path):
    os.chmod(_kaggle_token_path, 0o600)


In [30]:
!pip install kaggle
!kaggle competitions download -c spooky-author-identification


Downloading spooky-author-identification.zip to /content
  0% 0.00/1.81M [00:00<?, ?B/s]
100% 1.81M/1.81M [00:00<00:00, 54.6MB/s]


In [32]:
import zipfile

# Unpack the downloaded competition archive into the Datasets/ folder.
with zipfile.ZipFile('spooky-author-identification.zip', mode='r') as competition_archive:
    competition_archive.extractall(path='Datasets')


In [34]:
# Libraries needed for the manual-upload path
import os
import shutil

from google.colab import files

# Ask the user for the dataset files through Colab's upload widget.
# (The captured run uploaded zip archives such as test.zip.)
uploaded = files.upload()

# Ensure the destination folder is present; no error if it already exists
os.makedirs('Datasets', exist_ok=True)

# Relocate every uploaded file from the working directory into Datasets/
for filename in uploaded:
    shutil.move(filename, f'Datasets/{filename}')

# Show what the Datasets folder now contains
print("Files moved to 'Datasets' folder:")
print(os.listdir('Datasets'))


Saving test.zip to test (2).zip
Files moved to 'Datasets' folder:
['sample_submission.csv', 'test.zip', 'train.zip', 'test (2).zip', 'sample_submission.zip']


In [35]:
import os
import zipfile

from pyspark.sql import SparkSession

# Steps 1-2: Extract train.zip and test.zip, each into its own sub-folder of Datasets/
for _split in ('train', 'test'):
    with zipfile.ZipFile(f'Datasets/{_split}.zip', 'r') as zip_ref:
        zip_ref.extractall(f'Datasets/{_split}')

# Step 3: Check the extracted files
print("Train Files:", os.listdir('Datasets/train'))
print("Test Files:", os.listdir('Datasets/test'))

# Step 4: Create (or reuse) a Spark session
spark = SparkSession.builder.appName("SpookyAuthorIdentification").getOrCreate()

# CSV reader options shared by both splits.
# NOTE(review): the files presumably follow RFC 4180, where a quote inside a
# quoted field is written as "" -- confirm against the raw CSV. Spark's default
# escape character is a backslash, which would end such a field early and spill
# the remaining text into the next column, so quote/escape are set explicitly.
_csv_options = dict(header=True, inferSchema=True, quote='"', escape='"')

# Step 5: Load training data into a DataFrame.
# The error is still printed, but then re-raised: every later cell needs
# train_df, so continuing would only fail further down with a NameError.
try:
    train_df = spark.read.csv('Datasets/train/train.csv', **_csv_options)
    train_df.printSchema()
    train_df.show(5)
except Exception as e:
    print("Error loading the CSV file:", e)
    raise

# Step 6: Load test data into a DataFrame (same error policy as above)
try:
    test_df = spark.read.csv('Datasets/test/test.csv', **_csv_options)
    test_df.printSchema()
    test_df.show(5)
except Exception as e:
    print("Error loading the CSV file:", e)
    raise


Train Files: ['train.csv']
Test Files: ['test.csv']
root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- author: string (nullable = true)

+-------+--------------------+------+
|     id|                text|author|
+-------+--------------------+------+
|id26305|This process, how...|   EAP|
|id17569|It never once occ...|   HPL|
|id11008|In his left hand ...|   EAP|
|id27763|How lovely is spr...|   MWS|
|id12958|Finding nothing e...|   HPL|
+-------+--------------------+------+
only showing top 5 rows

root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)

+-------+--------------------+
|     id|                text|
+-------+--------------------+
|id02310|Still, as I urged...|
|id24541|If a fire wanted ...|
|id00134|And when they had...|
|id27757|While I was think...|
|id04081|I am not sure to ...|
+-------+--------------------+
only showing top 5 rows



In [36]:
# Report the shape of the training set: row count via a Spark action,
# column count from the DataFrame's column list.
train_row_count, train_column_count = train_df.count(), len(train_df.columns)
print(f"Training Data - Rows: {train_row_count}, Columns: {train_column_count}")

Training Data - Rows: 19579, Columns: 3


In [37]:
# Report the shape of the test set: row count via a Spark action,
# column count from the DataFrame's column list.
test_row_count, test_column_count = test_df.count(), len(test_df.columns)
print(f"Test Data - Rows: {test_row_count}, Columns: {test_column_count}")

Test Data - Rows: 8392, Columns: 2


In [38]:
# `F` was not imported by any earlier cell in this notebook; import it here so
# the cell works on a fresh kernel (Restart & Run All).
from pyspark.sql import functions as F

# Check for missing values: one row per DataFrame holding the number of NULLs
# in each column (the captured run shows none). Only NULLs are counted; empty
# strings are not treated as missing.
for _df in (train_df, test_df):
    _df.select(
        [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in _df.columns]
    ).show()

+---+----+------+
| id|text|author|
+---+----+------+
|  0|   0|     0|
+---+----+------+

+---+----+
| id|text|
+---+----+
|  0|   0|
+---+----+



In [39]:
# `F` is not imported by any earlier cell in this notebook; import it here so
# the cell works on a fresh kernel (Restart & Run All).
from pyspark.sql import functions as F

# Create the sentence length column: the length of the `text` string as
# computed by F.length. Re-running is safe because withColumn replaces an
# existing column of the same name.
train_df = train_df.withColumn("sentence_length", F.length(F.col("text")))
train_df.select("sentence_length").describe().show()

+-------+------------------+
|summary|   sentence_length|
+-------+------------------+
|  count|             19579|
|   mean|139.99765054395016|
| stddev|101.25452331007808|
|    min|                 5|
|    max|              3682|
+-------+------------------+



In [40]:
# `Tokenizer` is not imported by any earlier cell in this notebook; import it
# here so the cell works on a fresh kernel (Restart & Run All).
from pyspark.ml.feature import Tokenizer

# Use the tokenizer to split the `text` column into a `words` array column
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized_train_df = tokenizer.transform(train_df)

# Print tokenized output
tokenized_train_df.select("text", "words").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                   |words                                                                                                                                                                                                                                                              

In [41]:
# `StopWordsRemover` is not imported by any earlier cell in this notebook;
# import it here so the cell works on a fresh kernel (Restart & Run All).
from pyspark.ml.feature import StopWordsRemover

# Create an instance of StopWordsRemover and use it to filter the tokens and remove stop words.
# No custom list is passed, so the remover's default stop-word list applies.
# The filtered tokens are placed in a new column "filtered_words".
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
cleaned_train_df = remover.transform(tokenized_train_df)

# Print the cleaned output
cleaned_train_df.select("words", "filtered_words").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                                                                                                            |filtered_words                                                                                                                                                                              |
+---------------------------------------------------------------------

# Stage 2: Feature Extraction
We will convert the cleaned text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).

In [42]:
from pyspark.ml.feature import HashingTF, IDF  # import the TF-IDF feature transformers


In [43]:
# Term-frequency step: hash each row's filtered tokens into a sparse count
# vector with 10000 slots, stored in "rawFeatures".
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="filtered_words",
    outputCol="rawFeatures",
)
featurized_train_df = hashingTF.transform(cleaned_train_df)

# Preview the first five hashed vectors without truncating them
featurized_train_df.select("id", "rawFeatures").show(n=5, truncate=False)


+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id     |rawFeatures                                                                                                                                                                                                 |
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id26305|(10000,[92,635,683,894,1717,2178,2254,3072,3117,3509,3525,3924,4043,4086,4416,5342,5724,5875,6159,7927,8694,9460],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|id17569|(10000,[281,3924,3996,5819,6611,8713],[1.0,1.0,1.0,1.0,1.0,1.0])                                                                   

In [44]:
# Compute the IDF: fit the inverse-document-frequency weights on the training
# term-frequency vectors, then rescale those vectors into the "features" column.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurized_train_df)
rescaled_train_df = idfModel.transform(featurized_train_df)

# Show the resulting DataFrame with features. Limited to 5 rows, like the other
# previews in this notebook: untruncated 10000-dimension vectors are very wide,
# and the default of 20 rows floods the cell output.
rescaled_train_df.select("id", "features").show(5, truncate=False)


+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id     |features                                            

In [45]:
# Keep only the TF-IDF vector column and the author column for the next stage.
_model_columns = ["features", "author"]
training_data = rescaled_train_df.select(*_model_columns)
