# Finding and filtering more significant features

In [14]:
# we conduct forward selection using sklearn on our saved dataset to find the most significant features for predicting clickability.

# --- DATA MANIPULATION ---
import pandas as pd
import numpy as np

# --- STATS ---
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor

# --- SYSTEM ---
import sys
import warnings
warnings.filterwarnings("ignore")
sys.path.append('..')
RANDOM_SEED = np.random.randint(0, 10000)

# --- SET RANDOM SEED ---
np.random.seed(RANDOM_SEED)

# --- PATHS ---
from pathlib import Path
base = Path.cwd().parent
processed_path = base / "data" / "processed"

# --- Load datasets ---
X_structured = pd.read_parquet(processed_path / "youtube_features_structured.parquet")
X_text = pd.read_parquet(processed_path / "youtube_features_text.parquet")
X_image = pd.read_parquet(processed_path / "youtube_features_image.parquet")
y_reg = pd.read_parquet(processed_path / "youtube_target_regression.parquet")["views_per_subscriber"]
y_clf = pd.read_parquet(processed_path / "youtube_target_classification.parquet")["high_clickability"]

print("Structured:", X_structured.shape)
print("Text:", X_text.shape)
print("Image:", X_image.shape)
print("Regression target:", y_reg.shape)
print("Classification target:", y_clf.shape)

Structured: (5742, 9)
Text: (5742, 50)
Image: (5742, 50)
Regression target: (5742,)
Classification target: (5742,)


# Feature Selection for Text Features

In [15]:
# --- ML / NLP --- for text features
# perform feature selection using forward selection on text features for regression task as well as classification task

model = RandomForestRegressor(n_estimators=20, random_state=RANDOM_SEED)
sfs = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward', scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
sfs.fit(X_text, y_reg)
selected_features_text = X_text.columns[sfs.get_support()]
print("Selected text features for regression task:")
print(selected_features_text)

sfs_clf = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward', scoring='accuracy', cv=5, n_jobs=-1)
sfs_clf.fit(X_text, y_clf)
selected_features_text_clf = X_text.columns[sfs_clf.get_support()]
print("Selected text features for classification task:")
print(selected_features_text_clf)

Selected text features for regression task:
Index(['tfidf_comp_2', 'tfidf_comp_7', 'tfidf_comp_9', 'tfidf_comp_12',
       'tfidf_comp_13', 'tfidf_comp_22', 'tfidf_comp_34', 'tfidf_comp_37',
       'tfidf_comp_40', 'tfidf_comp_50'],
      dtype='object')
Selected text features for classification task:
Index(['tfidf_comp_1', 'tfidf_comp_2', 'tfidf_comp_3', 'tfidf_comp_4',
       'tfidf_comp_5', 'tfidf_comp_6', 'tfidf_comp_7', 'tfidf_comp_8',
       'tfidf_comp_9', 'tfidf_comp_10'],
      dtype='object')


# Feature Selection for Image Features

In [17]:
# --- ML / NLP --- for image features
# perform feature selection using forward selection on image features for regression task as well as classification task

model = RandomForestRegressor(n_estimators=20, random_state=RANDOM_SEED)
sfs = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward', scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
sfs.fit(X_image, y_reg)
selected_features_image = X_image.columns[sfs.get_support()]
print("Selected image features for regression task:")
print(selected_features_image)

sfs_clf = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward', scoring='accuracy', cv=5, n_jobs=-1)
sfs_clf.fit(X_image, y_clf)
selected_features_image_clf = X_image.columns[sfs_clf.get_support()]
print("Selected image features for classification task:")
print(selected_features_image_clf)

Selected image features for regression task:
Index(['pca_3', 'pca_6', 'pca_16', 'pca_17', 'pca_20', 'pca_30', 'pca_33',
       'pca_42', 'pca_45', 'pca_49'],
      dtype='object')
Selected image features for classification task:
Index(['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8',
       'pca_9', 'pca_10'],
      dtype='object')


# Feature Selection for Structured Features

In [19]:
# --- ML / NLP --- for text features
# perform feature selection using forward selection on text features for regression task as well as classification task

model = RandomForestRegressor(n_estimators=20, random_state=RANDOM_SEED)
sfs = SequentialFeatureSelector(model, n_features_to_select=5, direction='forward', scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
sfs.fit(X_structured, y_reg)
selected_features_structured = X_structured.columns[sfs.get_support()]
print("Selected structured features for regression task:")
print(selected_features_structured)

sfs_clf = SequentialFeatureSelector(model, n_features_to_select=5, direction='forward', scoring='accuracy', cv=5, n_jobs=-1)
sfs_clf.fit(X_structured, y_clf)
selected_features_structured_clf = X_structured.columns[sfs_clf.get_support()]
print("Selected structured features for classification task:")
print(selected_features_structured_clf)

Selected structured features for regression task:
Index(['title_length', 'has_question', 'has_exclamation', 'sentiment_vader',
       'subscribers'],
      dtype='object')
Selected structured features for classification task:
Index(['title_length', 'word_count', 'caps_ratio', 'has_question',
       'has_exclamation'],
      dtype='object')


# Making a New Combined Datasets with Selected Features

In [21]:
# Generate combined feature set from the selected features for regression task
X_combined = pd.concat([X_structured[selected_features_structured].reset_index(drop=True),
                        X_text[selected_features_text].reset_index(drop=True),
                        X_image[selected_features_image].reset_index(drop=True)], axis=1)
print("Combined feature matrix:", X_combined)

# Generate combined feature set from the selected features for classification task
X_combined_clf = pd.concat([X_structured[selected_features_structured_clf].reset_index(drop=True),
                             X_text[selected_features_text_clf].reset_index(drop=True),
                             X_image[selected_features_image_clf].reset_index(drop=True)], axis=1)
print("Combined feature matrix for classification task:", X_combined_clf)

Combined feature matrix:       title_length  has_question  has_exclamation  sentiment_vader  \
0               74             0                0           0.5719   
1               75             0                0          -0.3089   
2               53             0                0          -0.5994   
3               51             0                1           0.5147   
4               30             0                0           0.5267   
...            ...           ...              ...              ...   
5737            29             0                0           0.0000   
5738            47             0                0           0.0000   
5739            57             0                0           0.0000   
5740            52             0                0           0.0000   
5741            38             0                0           0.0000   

       subscribers  tfidf_comp_2  tfidf_comp_7  tfidf_comp_9  tfidf_comp_12  \
0     2.376002e+07      0.000299      0.006493      0.0