# Feature Analysis

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from feature_engine.outliers import Winsorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Create the directory that will hold the generated feature data

# Build the (placeholder) path from its components so the same nested folders
# are created on every OS; a backslash-separated literal is only split into
# sub-directories on Windows.
feature_data_dir = os.path.join('your', 'path', 'here', 'features')
# exist_ok=True keeps re-running this cell from raising if the folder exists.
os.makedirs(feature_data_dir, exist_ok=True)

In [None]:
# TF-IDF vectorization of the Title column

# Vectorizer settings; the 3000-term cap is an arbitrary starting point that
# can be tuned.
title_tfidf_params = {"max_features": 3000}
tfidf_vectorizer = TfidfVectorizer(**title_tfidf_params)

train_titles, test_titles = df_train['Title'], df_test['Title']

# Fit on the training titles only, then reuse the fitted vectorizer on the
# test titles.
train_title_features = tfidf_vectorizer.fit_transform(train_titles)
test_title_features = tfidf_vectorizer.transform(test_titles)

In [None]:
# Look at one title and the TF-IDF row the vectorizer produced for it
# (left commented out; uncomment to inspect).
# NOTE(review): `df_train['Title'][98]` looks up the index *label* 98, whereas
# `toarray()[98]` takes row *position* 98. These only refer to the same title
# if df_train has a default 0..n-1 index -- confirm, or switch to `.iloc[98]`.

# print(df_train['Title'][98])
# print(train_title_features.toarray()[98])

In [None]:
# Print our features

# Feature names are returned in column order of the TF-IDF matrix.
features = tfidf_vectorizer.get_feature_names_out()

# Show a bounded preview of the feature names themselves. The previous version
# printed the whole `vocabulary_` dict (term -> column index), which is not the
# feature list and floods the cell output with thousands of entries.
print(features[:50])

In [None]:
# Confirm how many features the vectorizer kept

feature_count = len(features)
print(feature_count)

In [None]:
# Get the TF-IDF values as a DataFrame: one row per training title, one column
# per vocabulary term.

# Label the columns with the terms so the frame is readable; without `columns`
# they would just be the integers 0..n_features-1.
# NOTE: .toarray() materialises the full dense matrix, which can be large.
# NOTE: the following cell reassigns `tfidf_df` to a flat array of scores.
tfidf_df = pd.DataFrame(
    train_title_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [None]:
# Collect the non-zero TF-IDF scores as a flat 1-D array

# Read the stored entries of the sparse matrix instead of densifying it first:
# .toarray().flatten() allocates n_titles * n_features floats only to discard
# the zeros on the next line.
scores_csr = train_title_features.tocsr(copy=True)  # copy: the next call is in-place
scores_csr.sum_duplicates()  # canonical CSR: duplicates merged, indices sorted

# `.data` now lists the stored values row by row, i.e. in the same order the
# dense flatten-then-filter produced. The name `tfidf_df` is kept because the
# plotting cell below reads it, even though this is an array, not a DataFrame.
tfidf_df = scores_csr.data
tfidf_df = tfidf_df[tfidf_df != 0]  # drop any explicitly stored zeros

In [None]:
# Histogram (with KDE overlay) of the non-zero TF-IDF scores

fig, ax = plt.subplots()
sns.histplot(tfidf_df, bins=10, kde=True, ax=ax)
ax.set(
    xlabel="TF-IDF Score",
    ylabel="Number of Words",
    title="Distribution of TF-IDF Scores in the Corpus",
)
ax.tick_params(axis="x", labelrotation=45)  # tilt the numeric tick labels
plt.show()

In [None]:
# Horizontal boxplot of the training title lengths, to look for skew/outliers

fig, ax = plt.subplots()
sns.boxplot(df_train['Title Length'], orient='h', ax=ax)
ax.set(xlabel="Title Length", title="Boxplot of Title Length")
plt.show()

Note that our TF-IDF distribution is right-skewed.

In [None]:
# Use the Winsorizer to cap outliers in the right tail of Title Length

# Limit capping to the column inspected in the boxplots. If `variables` is left
# unset, feature_engine selects every numerical column of the frame, so ids,
# targets or other numeric fields in df_train would be capped as well.
# NOTE(review): assumes 'Title Length' is the only column meant to be capped --
# add further column names to the list if that is not the case.
capper = Winsorizer(
    capping_method='gaussian',  # cap derived from the column's mean and std
    tail='right',               # only the upper tail is capped
    fold=2,                     # number of standard deviations above the mean
    variables=['Title Length'],
)
# Learn the cap from the training data only.
capper.fit(df_train)

In [None]:
# Check where the right tail will be capped: display the caps the fitted
# Winsorizer stored in `right_tail_caps_`.
# Kept as a bare last-line expression so the notebook renders it as cell output.

capper.right_tail_caps_

In [None]:
# Apply the fitted capper to the training and validation data frames

# NOTE(review): the second result comes from `df_val` but is stored as
# `test_t`, while the TF-IDF cell above uses `df_test` -- confirm which split
# is intended here.
train_t, test_t = (
    capper.transform(df_train),
    capper.transform(df_val),
)

In [None]:
# Re-draw the Title Length boxplot on the capped training data to verify the
# transform

fig, ax = plt.subplots()
sns.boxplot(train_t['Title Length'], orient='h', ax=ax)
ax.set(xlabel="Title Length", title="Boxplot of Title Length")
plt.show()

Note that after capping, the boxplot no longer shows any outliers in the `Title Length` column of the training data.