In [51]:
!pip install pyspark




In [50]:
# Upload the Kaggle API token (kaggle.json) from the local machine.
import os

from google.colab import files

uploaded = files.upload()  # Select the kaggle.json file from your computer

# Colab stores a re-uploaded file under a numbered name such as
# "kaggle (2).json", but the Kaggle CLI only reads a file named exactly
# "kaggle.json" inside its config directory. Persist the uploaded bytes under
# that fixed name so the freshly uploaded token is the one that gets used.
KAGGLE_TOKEN_PATH = os.path.join("/content", "kaggle.json")
for _uploaded_name, _uploaded_bytes in uploaded.items():
    # Ignore anything that is not a kaggle*.json file (e.g. an accidental extra upload).
    if not (_uploaded_name.startswith("kaggle") and _uploaded_name.endswith(".json")):
        continue
    with open(KAGGLE_TOKEN_PATH, "wb") as _token_file:
        _token_file.write(_uploaded_bytes)
    # The token is a secret: restrict it to owner read/write.
    os.chmod(KAGGLE_TOKEN_PATH, 0o600)


Saving kaggle.json to kaggle (2).json


In [52]:
# Tell the Kaggle CLI to look for its configuration in /content/.
import os

os.environ.update(KAGGLE_CONFIG_DIR="/content/")


In [53]:
!pip install kaggle
!kaggle competitions download -c spooky-author-identification


spooky-author-identification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [54]:
import zipfile

# Unpack every member of the downloaded competition archive into Datasets/.
with zipfile.ZipFile('spooky-author-identification.zip') as competition_archive:
    competition_archive.extractall(path='Datasets')


In [55]:
# Upload files through the Colab widget and collect them in the 'Datasets' folder.
from google.colab import files
import os
import shutil

# Blocks until the user has picked the files; maps each file name to its content.
uploaded = files.upload()  # Choose train.csv and test.csv when prompted

# Make sure the destination folder exists (no error if it is already there).
os.makedirs('Datasets', exist_ok=True)

# Relocate every uploaded file from the working directory into 'Datasets'.
for uploaded_name in uploaded:
    destination = f'Datasets/{uploaded_name}'
    shutil.move(uploaded_name, destination)

# Show the resulting folder contents as a sanity check.
print("Files moved to 'Datasets' folder:")
datasets_listing = os.listdir('Datasets')
print(datasets_listing)


Saving spooky-author-identification.zip to spooky-author-identification (5).zip
Files moved to 'Datasets' folder:
['test.zip', 'spooky-author-identification (4).zip', 'sample_submission.zip', 'test', 'train.zip', 'spooky-author-identification (5).zip', 'train']


In [56]:
import os
import zipfile

from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Steps 1-2: Extract train.zip and test.zip, each into its own sub-folder.
for split_name in ('train', 'test'):
    with zipfile.ZipFile(f'Datasets/{split_name}.zip', 'r') as zip_ref:
        zip_ref.extractall(f'Datasets/{split_name}')

# Step 3: Check the extracted files
print("Train Files:", os.listdir('Datasets/train'))
print("Test Files:", os.listdir('Datasets/test'))

# Step 4: Create a Spark session
spark = SparkSession.builder.appName("SpookyAuthorIdentification").getOrCreate()


def load_csv(path):
    """Read a CSV file with a header row into a Spark DataFrame and preview it.

    Prints the inferred schema and the first five rows of the loaded data.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Whatever Spark raised while reading or previewing the file.
            The error is reported and then re-raised so the notebook stops here
            instead of carrying on with an undefined DataFrame (or a stale one
            left over from an earlier run of this cell).
    """
    try:
        loaded_df = spark.read.csv(path, header=True, inferSchema=True)
        loaded_df.printSchema()
        loaded_df.show(5)
    except Exception as e:
        print("Error loading the CSV file:", e)
        raise
    return loaded_df


# Step 5: Load training data into a DataFrame
train_df = load_csv('Datasets/train/train.csv')

# Step 6: Load test data into a DataFrame
test_df = load_csv('Datasets/test/test.csv')


Train Files: ['train.csv']
Test Files: ['test.csv']
root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- author: string (nullable = true)

+-------+--------------------+------+
|     id|                text|author|
+-------+--------------------+------+
|id26305|This process, how...|   EAP|
|id17569|It never once occ...|   HPL|
|id11008|In his left hand ...|   EAP|
|id27763|How lovely is spr...|   MWS|
|id12958|Finding nothing e...|   HPL|
+-------+--------------------+------+
only showing top 5 rows

root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)

+-------+--------------------+
|     id|                text|
+-------+--------------------+
|id02310|Still, as I urged...|
|id24541|If a fire wanted ...|
|id00134|And when they had...|
|id27757|While I was think...|
|id04081|I am not sure to ...|
+-------+--------------------+
only showing top 5 rows



In [36]:
# Report the dimensions of the training set: number of rows and number of columns.
train_row_count, train_column_count = train_df.count(), len(train_df.columns)
print("Training Data - Rows: {}, Columns: {}".format(train_row_count, train_column_count))

Training Data - Rows: 19579, Columns: 3


In [57]:
# Report the dimensions of the test set: number of rows and number of columns.
test_row_count, test_column_count = test_df.count(), len(test_df.columns)
print("Test Data - Rows: {}, Columns: {}".format(test_row_count, test_column_count))

Test Data - Rows: 8392, Columns: 2


In [41]:
# Check for missing values: count the null entries of every column, first for
# the training set and then for the test set (There are none).
for split_df in (train_df, test_df):
    null_count_columns = [
        F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in split_df.columns
    ]
    split_df.select(null_count_columns).show()

+---+----+------+
| id|text|author|
+---+----+------+
|  0|   0|     0|
+---+----+------+

+---+----+
| id|text|
+---+----+
|  0|   0|
+---+----+



In [42]:
# Add a "sentence_length" column holding the character length of each text,
# then print summary statistics (count, mean, stddev, min, max) for it.
text_length = F.length(F.col("text"))
train_df = train_df.withColumn("sentence_length", text_length)
sentence_length_summary = train_df.select("sentence_length").describe()
sentence_length_summary.show()

+-------+------------------+
|summary|   sentence_length|
+-------+------------------+
|  count|             19579|
|   mean|139.99765054395016|
| stddev|101.25452331007808|
|    min|                 5|
|    max|              3682|
+-------+------------------+



In [43]:
# Split the "text" column into tokens, stored in a new "words" column.
tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
tokenized_train_df = tokenizer.transform(train_df)

# Preview the first five rows, untruncated, next to the original text.
tokenized_train_df.select(["text", "words"]).show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                   |words                                                                                                                                                                                                                                                              

In [44]:
# Filter stop words out of the "words" token lists; the surviving tokens are
# written to a new column named "filtered_words".
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered_words")
cleaned_train_df = remover.transform(tokenized_train_df)

# Preview the first five rows, untruncated, before and after filtering.
cleaned_train_df.select(["words", "filtered_words"]).show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                                                                                                            |filtered_words                                                                                                                                                                              |
+---------------------------------------------------------------------

# Stage 2: Feature Extraction
We will convert the cleaned text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).

In [73]:
# Step 1: Install necessary libraries
!pip install --upgrade pyspark

# Step 2: Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, Normalizer
from pyspark.ml import Pipeline
from google.colab import files

# Step 3: Upload your CSV file
uploaded = files.upload()

# Step 4: Initialize Spark session
spark = SparkSession.builder.appName("TextProcessing").getOrCreate()

# Step 5: Load training data from the uploaded file
train_df = spark.read.csv("train.csv", header=True, inferSchema=True)

# Step 6: Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tokens_df = tokenizer.transform(train_df)

# Step 7: Stop words removal
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
filtered_df = remover.transform(tokens_df)

# Step 8: TF-IDF calculation using CountVectorizer
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorized_tokens")
idf = IDF(inputCol="vectorized_tokens", outputCol="tfidf")

# Step 9: Normalization
normalizer = Normalizer(inputCol="tfidf", outputCol="normalized_features")

# Step 10: Create and apply pipeline
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, normalizer])
processed_data = pipeline.fit(train_df).transform(train_df)

# Step 11: Show the final processed DataFrame with normalized features
processed_data.select("normalized_features").show(truncate=False)




Saving train.csv to train (5).csv
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------