# Notebook: 03_feature_engineering.ipynb

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib


In [2]:
# Load processed TF-IDF data
tfidf_path = "../data/processed/train_data_tfidf.csv"
print(f"Loading TF-IDF features from: {tfidf_path}")
X_tfidf = pd.read_csv(tfidf_path)


Loading TF-IDF features from: ../data/processed/train_data_tfidf.csv


In [3]:
X_tfidf.head()


Unnamed: 0,aaron,abandon,abandoned,abc,abducted,ability,able,aboard,abortion,abroad,...,youngster,youre,youth,youtube,youve,zealand,zero,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.21397,0.0,0.0,0.0


In [None]:
# Load the target labels
y_train_path = "../data/processed/y_train.csv"
y_train = pd.read_csv(y_train_path)


In [6]:
y_train.head()


Unnamed: 0,Genre
0,drama
1,drama
2,documentary
3,documentary
4,comedy


In [5]:

print(f"TF-IDF feature shape: {X_tfidf.shape}")
print(f"Target labels shape: {y_train.shape}")


TF-IDF feature shape: (54214, 5000)
Target labels shape: (43371, 1)


In [8]:
# Load the cleaned descriptions
cleaned_path = "../data/processed/cleaned_train_data.csv"
train_df = pd.read_csv(cleaned_path)


In [9]:
train_df.shape


(54214, 5)

In [10]:
train_df.head()


Unnamed: 0,ID,Title,Genre,Description,Cleaned_Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...,listening conversation doctor parent yearold o...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...,brother sister past incestuous relationship cu...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...,bus empty student field trip museum natural hi...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...,help unemployed father make end meet edith twi...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...,film title refers unrecovered body ground zero...


In [11]:
# TF-IDF Vectorization with N-grams
tfidf_ngram_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),  # Uni-grams, Bi-grams, and Tri-grams
    max_features=5000
)

X_ngrams = tfidf_ngram_vectorizer.fit_transform(train_df['Cleaned_Description'])


In [13]:
# Convert to DataFrame
ngram_df = pd.DataFrame(X_ngrams.toarray(), columns=tfidf_ngram_vectorizer.get_feature_names_out())


In [14]:
ngram_df.head()


Unnamed: 0,aaron,abandon,abandoned,abducted,ability,able,aboard,abortion,abroad,absence,...,youngster,youre,youth,youtube,youve,zealand,zero,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.210697,0.0,0.0,0.0


In [15]:
# Save N-gram features
ngram_path = "../data/processed/train_data_ngrams.csv"
ngram_df.to_csv(ngram_path, index=False)
print(f"N-gram features saved to: {ngram_path}")


N-gram features saved to: ../data/processed/train_data_ngrams.csv


In [16]:
# Save the vectorizer
ngram_vectorizer_path = "../models/ngram_vectorizer.pkl"
joblib.dump(tfidf_ngram_vectorizer, ngram_vectorizer_path)
print(f"N-gram vectorizer saved to: {ngram_vectorizer_path}")


N-gram vectorizer saved to: ../models/ngram_vectorizer.pkl
