# Notebook: 03_feature_engineering.ipynb

In [2]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the previously exported TF-IDF feature table from disk,
# announcing the source path first.
tfidf_path = "../data/processed/train_data_tfidf.csv"
print(f"Loading TF-IDF features from: {tfidf_path}")
X_tfidf = pd.read_csv(filepath_or_buffer=tfidf_path)

Loading TF-IDF features from: ../data/processed/train_data_tfidf.csv


In [4]:
# Read the target-label CSV into a DataFrame.
y_train_path = "../data/processed/y_train.csv"
y_train = pd.read_csv(filepath_or_buffer=y_train_path)


In [5]:

# Report the dimensions of the loaded feature table and label frame.
# NOTE(review): the recorded output shows 54214 feature rows but only 43371
# label rows -- confirm the two files are meant to differ in length.
print(
    f"TF-IDF feature shape: {X_tfidf.shape}",
    f"Target labels shape: {y_train.shape}",
    sep="\n",
)


TF-IDF feature shape: (54214, 5000)
Target labels shape: (43371, 1)


In [6]:
# Read the cleaned training data (including the description text) into a DataFrame.
cleaned_path = "../data/processed/cleaned_train_data.csv"
train_df = pd.read_csv(filepath_or_buffer=cleaned_path)

In [7]:
# Display the (rows, columns) tuple of the cleaned training frame.
train_df.shape

(54214, 5)

In [8]:
# TF-IDF vectorization over word n-grams.
# ngram_range=(1, 3) extracts uni-grams, bi-grams, and tri-grams;
# max_features=5000 caps the size of the learned vocabulary.
tfidf_ngram_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=5000
)

# The text column was read back from CSV, where blank cells come back as NaN
# by default, and the vectorizer rejects NaN documents. Substitute empty
# strings so every row keeps its position in the resulting matrix.
ngram_documents = train_df['Cleaned_Description'].fillna("")
X_ngrams = tfidf_ngram_vectorizer.fit_transform(ngram_documents)


In [9]:
# Densify the n-gram matrix and wrap it in a DataFrame whose columns are
# labelled with the vectorizer's feature names.
ngram_feature_names = tfidf_ngram_vectorizer.get_feature_names_out()
ngram_df = pd.DataFrame(
    data=X_ngrams.toarray(),
    columns=ngram_feature_names,
)


In [10]:
# Write the n-gram feature table to CSV (row index omitted) and confirm
# the destination.
ngram_path = "../data/processed/train_data_ngrams.csv"
ngram_df.to_csv(path_or_buf=ngram_path, index=False)
print(f"N-gram features saved to: {ngram_path}")


N-gram features saved to: ../data/processed/train_data_ngrams.csv


In [11]:
# Save the fitted n-gram vectorizer to disk with joblib.
ngram_vectorizer_path = "../models/saved_models/ngram_vectorizer.pkl"
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing (e.g. on a fresh checkout).
Path(ngram_vectorizer_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(tfidf_ngram_vectorizer, ngram_vectorizer_path)
print(f"N-gram vectorizer saved to: {ngram_vectorizer_path}")


N-gram vectorizer saved to: ../models/saved_models/ngram_vectorizer.pkl
