# Personalization ad_position

## Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

In [None]:
df = pd.read_csv("cleaned_ads.csv")

In [None]:
# Drop non-informative features
df = df.drop(columns=['id', 'full_name'], errors='ignore')

In [None]:
# Define features (X) and target variable (y)
X = df.drop(columns=['ad_position'])  # Use all other features except the target
y = df['ad_position']

In [None]:
# Convert categorical variables to numerical using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

In [None]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Compute class weights for loss reweighting
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [None]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train Logistic Regression with class weights
log_reg = LogisticRegression(class_weight=class_weight_dict, max_iter=1000, solver='liblinear')
log_reg.fit(X_train_scaled, y_train)

In [None]:
# Make predictions
y_pred = log_reg.predict(X_test_scaled)
y_pred

array(['Top', 'Bottom', 'Top', 'Top', 'Top', 'Top', 'Top', 'Top', 'Top',
       'Side', 'Top', 'Side', 'Top', 'Bottom', 'Side', 'Top', 'Top',
       'Bottom', 'Bottom', 'Bottom', 'Bottom', 'Bottom', 'Bottom',
       'Bottom', 'Side', 'Top', 'Top', 'Bottom', 'Top', 'Bottom', 'Top',
       'Bottom', 'Bottom', 'Bottom', 'Top', 'Top', 'Top', 'Top', 'Top',
       'Top', 'Bottom', 'Top', 'Top', 'Top', 'Top', 'Bottom', 'Bottom',
       'Top', 'Bottom', 'Top', 'Top', 'Side', 'Side', 'Top', 'Bottom',
       'Bottom', 'Bottom', 'Top', 'Side', 'Top', 'Side', 'Top', 'Side',
       'Bottom', 'Top', 'Top', 'Side', 'Bottom', 'Top', 'Top', 'Top',
       'Side', 'Top', 'Side', 'Bottom', 'Bottom', 'Top', 'Top', 'Top',
       'Bottom', 'Top', 'Top', 'Bottom', 'Top', 'Top', 'Bottom', 'Bottom',
       'Top', 'Bottom', 'Bottom', 'Bottom', 'Top', 'Bottom', 'Top',
       'Bottom', 'Bottom', 'Top', 'Bottom', 'Bottom', 'Top', 'Bottom',
       'Side', 'Top', 'Top', 'Bottom', 'Side', 'Bottom', 'Top', 'Top',
     

In [None]:
# Display classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

      Bottom       0.56      0.68      0.61       286
        Side       0.36      0.18      0.24       222
         Top       0.52      0.61      0.56       292

    accuracy                           0.51       800
   macro avg       0.48      0.49      0.47       800
weighted avg       0.49      0.51      0.49       800



### Add Feature Selection

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Train Logistic Regression model with L1 penalty for feature selection
log_reg_l1 = LogisticRegression(class_weight=class_weight_dict, max_iter=1000, solver='liblinear', penalty='l1')
log_reg_l1.fit(X_train_scaled, y_train)

In [None]:
# Use SelectFromModel to keep only important features
selector_lg = SelectFromModel(log_reg_l1, prefit=True)
X_train_lg_selected = selector_lg.transform(X_train_scaled)
X_test_lg_selected = selector_lg.transform(X_test_scaled)

# Get selected feature names
selected_lg_features = X_train.columns[selector_lg.get_support()]
selected_lg_features

Index(['age', 'click', 'gender_Male', 'gender_Non-Binary',
       'device_type_Mobile', 'device_type_Tablet',
       'browsing_history_Entertainment', 'browsing_history_News',
       'browsing_history_Other', 'browsing_history_Shopping',
       'browsing_history_Social Media', 'time_of_day_Evening',
       'time_of_day_Morning', 'time_of_day_Night', 'age_group_25-44',
       'age_group_45-59', 'age_group_60+'],
      dtype='object')

In [None]:
# Retrain Logistic Regression with selected features
log_reg_selected = LogisticRegression(class_weight=class_weight_dict, max_iter=1000, solver='liblinear')
log_reg_selected.fit(X_train_lg_selected, y_train)

In [None]:
# Make predictions
y_pred_lg_selected = log_reg_selected.predict(X_test_lg_selected)

In [None]:
feature_importance_lg = pd.DataFrame({"Feature": X_train.columns, "Importance": np.abs(log_reg_selected.coef_).sum(axis=0)})
print(feature_importance_lg.sort_values(by="Importance", ascending=False))

                           Feature  Importance
5               device_type_Tablet    1.088856
4               device_type_Mobile    0.553741
8           browsing_history_Other    0.523143
14                 age_group_25-44    0.429916
10   browsing_history_Social Media    0.415057
3                gender_Non-Binary    0.318657
9        browsing_history_Shopping    0.291562
15                 age_group_45-59    0.288393
16                   age_group_60+    0.265599
2                      gender_Male    0.229418
6   browsing_history_Entertainment    0.210480
0                              age    0.167406
12             time_of_day_Morning    0.117160
7            browsing_history_News    0.102651
1                            click    0.085014
11             time_of_day_Evening    0.081519
13               time_of_day_Night    0.025088


In [None]:
# Display classification report for the model with selected features
report_lg_selected = classification_report(y_test, y_pred_lg_selected)
print("Classification Report:\n", report_lg_selected)

Classification Report:
               precision    recall  f1-score   support

      Bottom       0.56      0.68      0.61       286
        Side       0.36      0.18      0.24       222
         Top       0.52      0.61      0.56       292

    accuracy                           0.51       800
   macro avg       0.48      0.49      0.47       800
weighted avg       0.49      0.51      0.49       800



### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Define the hyperparameter grid for Logistic Regression
param_grid_lg = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear']  # Ensures compatibility with L1 penalty
}

# Perform Grid Search with cross-validation
grid_search_lg = GridSearchCV(
    LogisticRegression(class_weight=class_weight_dict, max_iter=1000),
    param_grid_lg,
    cv=5,
    scoring='f1_weighted',  # Optimizing for a balanced f1-score
    n_jobs=-1
)

# Fit Grid Search on the selected feature set
grid_search_lg.fit(X_train_lg_selected, y_train)

# Best parameters found
best_params_lg = grid_search_lg.best_params_
best_params_lg

{'C': 10, 'solver': 'liblinear'}

In [None]:
# Train Logistic Regression with the best hyperparameters
log_reg_tuned = LogisticRegression(class_weight=class_weight_dict, max_iter=1000, **best_params_lg)
log_reg_tuned.fit(X_train_lg_selected, y_train)

In [None]:
# Make predictions with the tuned model
y_pred_lg_tuned = log_reg_tuned.predict(X_test_lg_selected)

# Generate classification report for the tuned model
report_tuned_lg = classification_report(y_test, y_pred_lg_tuned, output_dict=True)
report_tuned_lg_df = pd.DataFrame(report_tuned_lg).transpose()

display(report_tuned_lg_df)

Unnamed: 0,precision,recall,f1-score,support
Bottom,0.558739,0.681818,0.614173,286.0
Side,0.364486,0.175676,0.237082,222.0
Top,0.517442,0.609589,0.559748,292.0
accuracy,0.515,0.515,0.515,0.515
macro avg,0.480222,0.489028,0.470335,800.0
weighted avg,0.48976,0.515,0.489665,800.0


## XGBoost

In [None]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [None]:
# Encode target variable (ad_position) as integer labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
# Compute class weights for loss reweighting
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [None]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 1. Pass 'eval_metric' and 'early_stopping_rounds' in the constructor
xgb_model_optimized = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",      # <--- Set here
    early_stopping_rounds=5,     # <--- And here
    n_estimators=20,
    learning_rate=0.1,
    max_depth=3,
    verbosity=0
)

# 2. Provide 'eval_set' to .fit() for early stopping to work
xgb_model_optimized.fit(
    X_train_scaled,
    y_train,
    eval_set=[(X_test_scaled, y_test)],  # Must be present for early stopping
    verbose=False
)

In [None]:
# Make predictions
y_pred_xgb_opt = xgb_model_optimized.predict(X_test_scaled)

In [None]:
# Convert predictions back to original labels
y_pred_xgb_labels_opt = label_encoder.inverse_transform(y_pred_xgb_opt)
y_test_labels = label_encoder.inverse_transform(y_test)

In [None]:
# Generate classification report
report_xgb_opt = classification_report(y_test_labels, y_pred_xgb_labels_opt, output_dict=True)
report_xgb_df_opt = pd.DataFrame(report_xgb_opt).transpose()

In [None]:
display(report_xgb_df_opt)

Unnamed: 0,precision,recall,f1-score,support
Bottom,0.56,0.685315,0.616352,286.0
Side,0.413043,0.085586,0.141791,222.0
Top,0.497525,0.688356,0.577586,292.0
accuracy,0.52,0.52,0.52,0.52
macro avg,0.490189,0.486419,0.445243,800.0
weighted avg,0.496416,0.52,0.470512,800.0


### Add Feature Selection

In [None]:
# Train an initial XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_model.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Feature selection using SelectFromModel (selects important features based on feature importance)
selector_xgb = SelectFromModel(xgb_model, threshold="median", prefit=True)  # Selects features above the median importance
X_train_xgb_selected = selector_xgb.transform(X_train_scaled)
X_test_xgb_selected = selector_xgb.transform(X_test_scaled)

# Get the names of selected features
selected_features_xgb = X.columns[selector_xgb.get_support()]
selected_features_xgb

Index(['click', 'device_type_Mobile', 'device_type_Tablet',
       'browsing_history_News', 'browsing_history_Other',
       'browsing_history_Social Media', 'time_of_day_Evening',
       'time_of_day_Night', 'age_group_45-59'],
      dtype='object')

In [None]:
# Retrain XGBoost model with selected features
xgb_selected = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_selected.fit(X_train_xgb_selected, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Make predictions with the new model
y_pred_xgb_selected = xgb_selected.predict(X_test_xgb_selected)

# Convert predictions back to original labels
y_pred_xgb_selected_labels = label_encoder.inverse_transform(y_pred_xgb_selected)
y_test_xgb_labels = label_encoder.inverse_transform(y_test)

In [None]:
# Generate classification report
report_xgb_selected = classification_report(y_test_xgb_labels, y_pred_xgb_selected_labels, output_dict=True)
report_xgb_selected_df = pd.DataFrame(report_xgb_selected).transpose()

display(report_xgb_selected_df)

Unnamed: 0,precision,recall,f1-score,support
Bottom,0.550409,0.706294,0.618683,286.0
Side,0.3125,0.045045,0.07874,222.0
Top,0.486284,0.667808,0.562771,292.0
accuracy,0.50875,0.50875,0.50875,0.50875
macro avg,0.449731,0.473049,0.420065,800.0
weighted avg,0.460984,0.50875,0.448441,800.0


### Hyperparameter Tuning

In [None]:
# Define the hyperparameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'max_depth': [3, 5, 7],  # Maximum depth of a tree
    'min_child_weight': [1, 3, 5],  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.8, 1.0],  # Subsample ratio of training instances
    'colsample_bytree': [0.8, 1.0]  # Subsample ratio of columns when constructing each tree
}

In [None]:
# Initialize XGBoost model
xgb_model_tune = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")

# Perform Grid Search with cross-validation
grid_search_xgb = GridSearchCV(
    estimator=xgb_model_tune,
    param_grid=param_grid_xgb,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2
)

# Fit Grid Search on the selected feature set
grid_search_xgb.fit(X_train_xgb_selected, y_train)

# Best parameters found
best_params_xgb = grid_search_xgb.best_params_
best_params_xgb

Fitting 3 folds for each of 324 candidates, totalling 972 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

{'colsample_bytree': 1.0,
 'learning_rate': 0.01,
 'max_depth': 5,
 'min_child_weight': 3,
 'n_estimators': 50,
 'subsample': 1.0}

In [None]:
# Train XGBoost with the best hyperparameters
xgb_best = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", **best_params_xgb)
xgb_best.fit(X_train_xgb_selected, y_train)

# Make predictions with the tuned model
y_pred_xgb_best = xgb_best.predict(X_test_xgb_selected)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Convert predictions back to original labels
y_pred_xgb_best_labels = label_encoder.inverse_transform(y_pred_xgb_best)
y_test_labels = label_encoder.inverse_transform(y_test)

# Generate classification report for the tuned model
report_xgb_best = classification_report(y_test_labels, y_pred_xgb_best_labels, output_dict=True)
report_xgb_best_df = pd.DataFrame(report_xgb_best).transpose()

display(report_xgb_best_df)

Unnamed: 0,precision,recall,f1-score,support
Bottom,0.585443,0.646853,0.614618,286.0
Side,0.317647,0.121622,0.175896,222.0
Top,0.486216,0.664384,0.561505,292.0
accuracy,0.5075,0.5075,0.5075,0.5075
macro avg,0.463102,0.477619,0.450673,800.0
weighted avg,0.474912,0.5075,0.473486,800.0
