# Notebook:02_preprocession.ipynb

In [41]:
import os
import sys

import joblib
import pandas as pd
import swifter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [42]:
# Make the project's src/ directory importable so the local helper modules below resolve.
# NOTE: the path is relative, so this assumes the kernel's working directory is the
# folder that contains this notebook.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:  # guard keeps re-runs of this cell from stacking duplicate entries
    sys.path.append(SRC_DIR)
from data_loader import load_data
from preprocess import clean_text_parallel

In [43]:
# Raw training file, addressed relative to the notebook's folder.
train_path = "../data/raw/train_data.txt"
print(f"Loading training data from: {train_path}")

# Delegate parsing to the project loader; is_train=True is passed through unchanged.
train_df = load_data(
    train_path,
    is_train=True,
)

Loading training data from: ../data/raw/train_data.txt
Data loaded successfully from ../data/raw/train_data.txt
Shape of dataset: (54214, 4)


In [44]:
# Run the project's parallel cleaner over every description and store the result
# next to the raw text as a new 'Cleaned_Description' column.
# NOTE(review): assumes the helper returns one cleaned string per input, in the
# same order - confirm against src/preprocess.py.
raw_descriptions = train_df['Description'].tolist()
train_df['Cleaned_Description'] = clean_text_parallel(raw_descriptions)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date![nltk_data]   Package punkt is already up-to-date!

[nltk_data]   Package punkt is already up-to-date![nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityakumar/nltk_data...

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data] Downloading package stopwo

In [52]:
# Save cleaned text data
cleaned_path = "../data/processed/cleaned_train_data.csv"
# to_csv does not create missing folders, so make sure the output directory
# exists; this keeps the cell working on a fresh checkout.
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
train_df.to_csv(cleaned_path, index=False)  # index=False: the row index is not written
print(f"Cleaned data saved to: {cleaned_path}")
# Last expression: display the frame's shape as a quick sanity check.
train_df.shape

Cleaned data saved to: ../data/processed/cleaned_train_data.csv


(54214, 5)

In [55]:
# Class balance of the target column, most frequent genre first.
genre_counts = train_df['Genre'].value_counts()
genre_counts

Genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64

In [47]:
# Build TF-IDF features from the cleaned descriptions.
MAX_TFIDF_FEATURES = 5000  # vocabulary cap, limiting vocab size for efficiency
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_tfidf = tfidf_vectorizer.fit_transform(train_df['Cleaned_Description'])

# Turn the sparse matrix into a labelled dense frame, one column per vocabulary term.
# NOTE(review): .toarray() materialises every cell; at 54214 rows x 5000 float64
# columns that is roughly 2 GB of RAM - consider keeping X_tfidf sparse downstream.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# made later in this notebook, so its IDF weights also see the held-out rows.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

In [49]:
# Peek at the first five rows of the feature table.
tfidf_df.head(5)

Unnamed: 0,aaron,abandon,abandoned,abc,abducted,ability,able,aboard,abortion,abroad,...,youngster,youre,youth,youtube,youve,zealand,zero,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.21397,0.0,0.0,0.0


In [50]:
# Persist the TF-IDF feature table as CSV; the row index is left out of the file.
processed_path = "../data/processed/train_data_tfidf.csv"
tfidf_df.to_csv(path_or_buf=processed_path, index=False)
print(f"Processed TF-IDF data saved to: {processed_path}")

Processed TF-IDF data saved to: ../data/processed/train_data_tfidf.csv


In [56]:
# Save TF-IDF vectorizer for later use
# (joblib is imported in the notebook's top import cell.)
vectorizer_path = "../models/saved_models/tfidf_vectorizer.pkl"
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"TF-IDF vectorizer saved to: {vectorizer_path}")

TF-IDF vectorizer saved to: ../models/saved_models/tfidf_vectorizer.pkl


In [58]:
# Splitting data into train and test sets
y = train_df['Genre']  # target labels; rows line up positionally with tfidf_df
# Stratify on the label: genre counts are heavily skewed (13613 'drama' vs 132 'war'),
# and an unstratified shuffle can leave rare genres under- or over-represented in
# the 20% hold-out. random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Last expression: display the source frame's shape for reference.
train_df.shape

(54214, 5)

In [59]:
# Report the shape of each split, one per line: features first, then labels.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(43371, 5000)
(10843, 5000)
(43371,)
(10843,)


In [60]:
# Write each piece of the train/test split to its own CSV file (row index omitted).
split_outputs = (
    (X_train, "../data/processed/X_train.csv"),
    (X_test, "../data/processed/X_test.csv"),
    (y_train, "../data/processed/y_train.csv"),
    (y_test, "../data/processed/y_test.csv"),
)
for split_part, csv_path in split_outputs:
    split_part.to_csv(csv_path, index=False)
print("Train-test split data saved.")


Train-test split data saved.
