# Notebook:02_preprocession.ipynb

In [1]:
import os
import sys

import joblib
import pandas as pd
import swifter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [2]:
# Ensure src module can be found.
# The path is relative to the notebook's working directory. The membership
# check keeps this cell idempotent: re-running it no longer appends a
# duplicate '../src' entry to sys.path each time.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Project-local helpers (resolved through the path added above).
from data_loader import load_data
from preprocess import clean_text_parallel


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Location of the raw training file, relative to the notebook directory
train_path = "../data/raw/train_data.txt"
print(f"Loading training data from: {train_path}")

# Read the training split through the project loader (is_train=True)
train_df = load_data(train_path, is_train=True)


Loading training data from: ../data/raw/train_data.txt
Data loaded successfully from ../data/raw/train_data.txt
Shape of dataset: (54214, 4)


In [4]:
# Run the project's parallel text cleaner over the 'Description' column;
# the cleaned text is expected in 'Cleaned_Description' of the returned frame.
cleaned_df = clean_text_parallel(
    train_df,
    'Description',
    'Cleaned_Description',
)

# Quick sanity check on the resulting dimensions
cleaned_df.shape


Pandas Apply:   0%|          | 0/54214 [00:00<?, ?it/s]

(54214, 5)

In [5]:
# Preview the first five rows of the cleaned frame
cleaned_df.head(n=5)


Unnamed: 0,ID,Title,Genre,Description,Cleaned_Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,listening conversation doctor parent yearold o...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,bus empty student field trip museum natural hi...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,help unemployed father make end meet edith twi...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,film title refers unrecovered body ground zero...


In [6]:
# Save cleaned text data
cleaned_path = "../data/processed/cleaned_train_data.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)

cleaned_df.to_csv(cleaned_path, index=False)
print(f"Cleaned data saved to: {cleaned_path}")

# Display the dimensions of what was written
cleaned_df.shape


Cleaned data saved to: ../data/processed/cleaned_train_data.csv


(54214, 5)

In [7]:
# Class balance of the target: number of rows per genre, most frequent first
cleaned_df['Genre'].value_counts(sort=True, ascending=False)


Genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64

In [8]:
# Turn the cleaned descriptions into TF-IDF features.
# max_features=5000 caps the vocabulary size for efficiency.
# NOTE(review): the vectorizer is fitted on every row of cleaned_df, i.e.
# before the train/test split performed later in this notebook.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_df['Cleaned_Description'])

# Densify the sparse matrix into a DataFrame with one column per vocabulary
# term (this materialises the full rows x 5000 array in memory).
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)


In [9]:
# Preview the first five rows of the TF-IDF feature frame
tfidf_df.head(n=5)


Unnamed: 0,aaron,abandon,abandoned,abc,abducted,ability,able,aboard,abortion,abroad,...,youngster,youre,youth,youtube,youve,zealand,zero,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.21397,0.0,0.0,0.0


In [10]:
# Save TF-IDF features
processed_path = "../data/processed/train_data_tfidf.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# NOTE(review): tfidf_df is the dense feature matrix, so this CSV is large;
# consider a binary or sparse format if downstream readers allow it.
tfidf_df.to_csv(processed_path, index=False)
print(f"Processed TF-IDF data saved to: {processed_path}")


Processed TF-IDF data saved to: ../data/processed/train_data_tfidf.csv


In [11]:
# Save TF-IDF vectorizer for later use, so the same fitted vocabulary and
# IDF weights can be applied to new text.
# (joblib is imported in the notebook's top import cell.)
vectorizer_path = "../models/tfidf_vectorizer.pkl"

# joblib.dump does not create missing parent directories, so make sure the
# models folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)

joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"TF-IDF vectorizer saved to: {vectorizer_path}")


TF-IDF vectorizer saved to: ../models/tfidf_vectorizer.pkl


In [12]:
# Splitting data into train and test sets.
# Labels are taken from cleaned_df, the frame the TF-IDF features were built
# from, so features and labels are guaranteed to describe the same rows in
# the same order (train_test_split pairs them positionally).
y = cleaned_df['Genre']

# The genre distribution is heavily imbalanced (largest class 13613 rows,
# smallest 132), so stratify on the label to keep per-genre proportions the
# same in both splits. random_state keeps the split reproducible.
# NOTE(review): tfidf_df comes from a vectorizer fitted on all rows before
# this split, so the held-out rows have influenced the vocabulary/IDF weights.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [13]:
# Report the dimensions of each split: features first, then labels
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)


(43371, 5000)
(10843, 5000)
(43371,)
(10843,)


In [14]:
# Save train-test split
# Map each destination file to the object written there; one loop replaces
# four copy-pasted to_csv calls while keeping the output paths unchanged.
split_outputs = {
    "../data/processed/X_train.csv": X_train,
    "../data/processed/X_test.csv": X_test,
    "../data/processed/y_train.csv": y_train,
    "../data/processed/y_test.csv": y_test,
}

for out_path, split in split_outputs.items():
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists (e.g. on a fresh checkout) before writing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    split.to_csv(out_path, index=False)

print("Train-test split data saved.")


Train-test split data saved.
