In [None]:
""" Import Package """
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import random
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt

""" Global variables """
# Directory that holds the input CSV files; the prediction file is written here too.
DATA = "./data"
# Training table: read for the feature columns and the `relevance` label.
TRAIN_DATA_PATH = f"{DATA}/training.csv"
# Test table: read for the feature columns and the query_id / url_id used in the output ids.
TEST_DATA_PATH = f"{DATA}/test.csv"
# Destination CSV for the final predictions (columns: id, relevance).
OUTPUT_PREDICTION = f"{DATA}/traditional_method_predictions.csv"

First, we want to explore the relationships among the features in the dataset, which will help us decide which combination of features to use.

In [None]:
""" Load dataset """
# Columns used as model inputs; `relevance` is the target column.
features = [
    'query_length', 'is_homepage',
    'sig1', 'sig2', 'sig3', 'sig4',
    'sig5', 'sig6', 'sig7', 'sig8',
]

# Raw tables are kept around: later cells plot `train_data` and read the
# id columns of `test_data` when writing the predictions.
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

# Split the training table into inputs / target; the test table only has inputs.
train_features = train_data.loc[:, features]
test_features = test_data.loc[:, features]
train_labels = train_data.loc[:, 'relevance']

print(f"We have {train_features.shape[1]} features")
print(f"We have {train_features.shape[0]} total samples")

In [None]:
""" Standardize both dataset """
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both splits so the test set never influences them.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

In [None]:
""" Remove the outlier inside dataset """
# Per-column Tukey fences: a value is an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
features_df = pd.DataFrame(train_features)
original_sample_num = features_df.shape[0]

Q1 = features_df.quantile(0.25)
Q3 = features_df.quantile(0.75)
IQR = Q3 - Q1
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

# Count, for every row, how many of its columns fall outside the fences.
outlier_mask = features_df.lt(lower_bound) | features_df.gt(upper_bound)
outlier_count = outlier_mask.sum(axis=1)

# A row is kept as long as at most half of its columns are outliers.
outlier_threshold = train_features.shape[1] // 2
mask = outlier_count <= outlier_threshold
keep_rows = mask.to_numpy()

# Apply the same boolean row filter to features and labels so they stay aligned.
train_features = train_features[keep_rows]
train_labels = train_labels[keep_rows]

washed_sample_num = train_features.shape[0]
print(f"We remove {original_sample_num - washed_sample_num} outliers from dataset")
print(f"Currently we have {washed_sample_num} total samples")

In [None]:
""" Visualize the relationships among features """
# Pairwise plots over the raw feature columns of the training table
# (this uses `train_data`, not the scaled / filtered `train_features` array).
raw_feature_frame = train_data[features]
sns.pairplot(raw_feature_frame)
plt.show()

In [None]:
""" Apply forward stepwise feature selection """
from sklearn.preprocessing import SplineTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

all_features = ['query_length', 'is_homepage', 'sig1', 'sig2', 
                'sig3', 'sig4', 'sig5', 'sig6', 'sig7', 'sig8']

# Everything below is loop-invariant, so it is set up once instead of once per
# candidate: the seeds, the raw training table (previously re-read from disk on
# every inner iteration) and the CV splitter. The estimators and the splitter
# carry their own random_state=1, so results do not depend on the global RNGs.
random.seed(1)
np.random.seed(1)
# NOTE: selection runs on the raw CSV columns, not on the scaled /
# outlier-filtered `train_features` array built in the earlier cells.
train_data = pd.read_csv(TRAIN_DATA_PATH)
selection_labels = train_data['relevance']
kf = KFold(n_splits=10, shuffle=True, random_state=1)


def _cv_accuracy(columns):
    """Return the mean 10-fold CV accuracy of a spline + logistic-regression
    pipeline trained on the given list of column names."""
    pipeline = make_pipeline(
        SplineTransformer(n_knots=100, degree=3),
        LogisticRegression(random_state=1),
    )
    cv_scores = cross_val_score(pipeline, train_data[columns], selection_labels,
                                cv=kf, scoring='accuracy')
    return np.mean(cv_scores)


optimal_features = []       # best subset seen over all rounds
optimal_cur_features = []   # best subset of the current round
optimal_pre_features = []   # subset carried over from the previous round
best_acc = 0
# Each round greedily adds the single feature that yields the highest CV
# accuracy; after len(all_features) rounds every feature has been added once.
for _ in range(len(all_features)):
    best_cur_acc = 0
    for new_feature in all_features:
        if new_feature in optimal_pre_features:
            continue
        cur_features = optimal_pre_features + [new_feature]
        cur_acc = _cv_accuracy(cur_features)
        # `<=` keeps the original tie-breaking: a later candidate wins a tie.
        if best_cur_acc <= cur_acc:
            optimal_cur_features = cur_features
            best_cur_acc = cur_acc
    optimal_pre_features = optimal_cur_features
    # `<=` here means the larger subset wins when two rounds tie.
    if best_acc <= best_cur_acc:
        optimal_features = optimal_cur_features
        best_acc = best_cur_acc
print(f"Optimal features combination: {optimal_features} with best acc: {best_acc}")

We learn that sig5 is not in the optimal features and we decide to remove it from the dataset.

In [None]:
""" Remove sig5 from the dataset """
# Look the column position up by name instead of hard-coding slice offsets;
# the arrays still have the column order of the `features` list at this point.
dropped_feature = 'sig5'
dropped_idx = features.index(dropped_feature)

# Guard against running this cell twice: a second run would otherwise
# silently delete whichever column has moved into that position.
assert train_features.shape[1] == len(features) and test_features.shape[1] == len(features), \
    f"expected {len(features)} columns before dropping {dropped_feature}; was this cell already run?"

train_features = np.delete(train_features, dropped_idx, axis=1)
test_features = np.delete(test_features, dropped_idx, axis=1)

In [None]:
""" Apply kernel method to increase features """
# Explicit degree-2 polynomial expansion with no constant (bias) column.
# The expansion is fitted on the training array and reused unchanged for
# the test array so both end up with the same output columns.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(train_features)
train_features = poly.transform(train_features)
test_features = poly.transform(test_features)

In [None]:
""" Apply PCA to remove unnecessary features """
# Fraction of total variance the retained components must explain. The value
# is named once so the selection and the printed message cannot disagree
# (the message used to say 99% while the code used 0.999).
VARIANCE_THRESHOLD = 0.999

# First pass: fit a full PCA only to inspect the explained-variance curve.
pca = PCA()
pca.fit(train_features)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, marker='o', linestyle='--')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.set_title('Cumulative Explained Variance Ratio by PCA')
ax.grid(True)
plt.show()

# Smallest number of components whose cumulative ratio reaches the threshold.
# searchsorted gives the first index with cumulative >= threshold (the curve is
# non-decreasing). If the threshold is never reached it returns len(...), so
# the result is clamped to "keep everything" instead of collapsing to 1.
n_components = min(
    int(np.searchsorted(cumulative_variance_ratio, VARIANCE_THRESHOLD)) + 1,
    len(cumulative_variance_ratio),
)
print(f"Number of principal components to retain ({VARIANCE_THRESHOLD:.1%} variance): {n_components}")

# Second pass: refit with the chosen size on the training data and project
# both splits with that same fitted transform.
pca = PCA(n_components=n_components)
train_features = pca.fit_transform(train_features)
test_features = pca.transform(test_features)

In [None]:
""" Modeling using logistic regression  """
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold

model = LogisticRegression(
    random_state=1,
)

# One sub-grid per penalty type, each paired with its own solver list.
param_grid = [
    {
        'penalty': ['l2'], 
        'C': [0.01, 0.1, 1, 10, 100], 
        'solver': ['newton-cg', 'lbfgs', 'sag'], 
        'max_iter': [100, 300, 500]
    },
    {
        'penalty': ['l1'], 
        'C': [0.01, 0.1, 1, 10, 100], 
        'solver': ['liblinear'], 
        'max_iter': [100, 300, 500]
    },
    {
        'penalty': ['elasticnet'], 
        'C': [0.01, 0.1, 1, 10, 100], 
        'solver': ['saga'], 
        'l1_ratio': [0.2, 0.5, 0.8],  # Adding l1_ratio for elasticnet
        'max_iter': [100, 300, 500]
    },
    {
        'penalty': [None], 
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 
        'max_iter': [100, 300, 500]
    }
]

# 5-fold shuffled CV with a fixed seed, scored by accuracy. refit=True (the
# default, spelled out here) retrains the best configuration on the full
# training set once the search has finished.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=1), 
    refit=True,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_labels)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

# best_estimator_ is already fitted on all of train_features (refit=True), so
# the extra best_model.fit(...) that used to follow only repeated that training.
best_model = grid_search.best_estimator_
# NOTE: the RandomForest and LightGBM cells below also assign `best_model`, so
# on a top-to-bottom run the prediction cell does not use this estimator.
# A dedicated alias keeps it reachable for comparison.
logreg_best_model = best_model

In [None]:
""" Modeling using RandomForest Classifier  """
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold

model = RandomForestClassifier(
    random_state=1,
)

# Forest size, tree depth and split / leaf size limits to search over.
param_grid = {
    'n_estimators': [50, 100, 300],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
}

# 5-fold shuffled CV with a fixed seed, scored by accuracy. refit=True (the
# default, spelled out here) retrains the best configuration on the full
# training set once the search has finished.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=1), 
    refit=True,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_labels)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

# best_estimator_ is already fitted on all of train_features (refit=True), so
# the extra best_model.fit(...) that used to follow only repeated that training.
best_model = grid_search.best_estimator_
# NOTE: the LightGBM cell below also assigns `best_model`, so on a
# top-to-bottom run the prediction cell does not use this estimator.
# A dedicated alias keeps it reachable for comparison.
rf_best_model = best_model

In [None]:
""" Modeling using Light Gradient Boosting  """
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV

model = lgb.LGBMClassifier(
    random_state=1,
    verbosity=-1,
)

param_grid = {
    'num_leaves': [20, 31, 40],
    'max_depth': [-1],
    'learning_rate': [0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 150],
    'min_child_samples': [10, 20],
    'subsample': [0.8],
    # Row subsampling in LightGBM is only active when subsample_freq > 0; with
    # the default of 0 the subsample=0.8 setting above was silently ignored.
    'subsample_freq': [1],
    'colsample_bytree': [0.8],
}

# 5-fold shuffled CV with a fixed seed, scored by accuracy. refit=True (the
# default, spelled out here) retrains the best configuration on the full
# training set once the search has finished.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=1), 
    refit=True,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_labels)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

# best_estimator_ is already fitted on all of train_features (refit=True), so
# the extra best_model.fit(...) that used to follow only repeated that training.
best_model = grid_search.best_estimator_
# NOTE: this is the last modeling cell to assign `best_model`, so on a
# top-to-bottom run the prediction cell uses this LightGBM estimator whether
# or not it had the highest CV accuracy. The alias makes that explicit.
lgbm_best_model = best_model

In [None]:
""" Predict on test dataset using the best model """
# NOTE: `best_model` is whatever the most recently executed modeling cell
# assigned (the LightGBM estimator on a top-to-bottom run). Compare the printed
# CV accuracies and assign the intended model before running this cell.
predictions = best_model.predict(test_features)

# Build every id in one vectorised pass instead of calling test_data.iloc[idx]
# per row. An id is the integer query_id immediately followed by the integer
# url_id, with no separator (same format as before).
query_ids = test_data['query_id'].astype('int64').astype(str)
url_ids = test_data['url_id'].astype('int64').astype(str)

results_df = pd.DataFrame({
    'id': (query_ids + url_ids).to_numpy(),
    'relevance': np.asarray(predictions).astype('int64'),
})
results_df.to_csv(OUTPUT_PREDICTION, index=False)
print(f"Predictions saved at {OUTPUT_PREDICTION}")