In [None]:
""" Import Package """
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import random
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import SplineTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import warnings

# Suppress every warning category for the rest of the session; this also
# hides convergence and deprecation warnings raised by the modeling cells.
warnings.filterwarnings('ignore')

""" Global variables """
# Directory that holds the input CSV files and receives the prediction file.
DATA = "./data"
# Labeled training table and unlabeled test table, read by the loading cell.
TRAIN_DATA_PATH = f"{DATA}/training.csv"
TEST_DATA_PATH = f"{DATA}/test.csv"
# Destination CSV written by the final prediction cell.
OUTPUT_PREDICTION = f"{DATA}/traditional_method_predictions.csv"
# When True, the IQR-based outlier filter is applied to the training rows.
REMOVE_OUTLIER = True
# When True, `used_features` are rescaled with a StandardScaler fitted on the training rows.
STANDARDIZE_DATA = False
# When True, degree-2 polynomial columns are appended to both tables.
APPLY_KERNEL_METHOD = False
# Selects the feature-selection cell that runs; the notebook checks for
# "Forward_stepwise_selection" and "PCA". Any other value leaves
# train_features / test_features undefined for the modeling cells.
FEATURE_SELECTION = "Forward_stepwise_selection"

In [None]:
""" Load dataset """
# Read the training and test tables from the configured CSV paths.
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

# Input columns consumed by the standardization, polynomial-expansion and PCA cells.
used_features = ['query_length', 'is_homepage', 'sig1', 'sig2', 
                 'sig3', 'sig4', 'sig5', 'sig6', 'sig7', 'sig8']
# Columns excluded from the candidate pool of the forward stepwise selection cell.
unused_features = ['query_id', 'url_id', 'relevance', 'id']
# Columns screened by the IQR outlier filter.
# NOTE(review): sig4, query_length and is_homepage are not screened — presumably deliberate; confirm.
detect_features = ['sig1', 'sig2', 'sig3', 
                   'sig5', 'sig6', 'sig7', 'sig8']
# Name of the target column used as the label in every model fit.
label = 'relevance'

In [None]:
""" Remove outlier from training dataset """
if REMOVE_OUTLIER:
    # Row count before filtering, kept for the summary printed at the end.
    original_sample_num = len(train_data)

    # Keep-mask over the training rows: a row survives only if every screened
    # feature lies inside that feature's own IQR fences.
    mask = pd.Series(True, index=train_data.index, dtype=bool)

    for feature in detect_features:
        if feature not in train_data:
            continue

        # Fences sit 1.5 * IQR beyond the first and third quartiles.
        Q1, Q3 = (train_data[feature].quantile(q) for q in (0.25, 0.75))
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Inclusive on both ends; NaN values compare False and so count as outliers.
        feature_mask = train_data[feature].between(lower_bound, upper_bound)
        outliers_num = (~feature_mask).sum()

        print(feature_mask.sum())
        print(f"Feature '{feature}': {outliers_num} outliers")

        mask = mask & feature_mask

    # Rebinds train_data to the filtered rows; the original index labels are kept.
    train_data = train_data.loc[mask]

    washed_sample_num = len(train_data)
    print(f"Removed {original_sample_num - washed_sample_num} outliers from dataset")
    print(f"Remaining samples: {washed_sample_num}")

In [None]:
""" Standardize both dataset """
if STANDARDIZE_DATA:
    # Learn the per-column mean and variance from the training rows only, then
    # apply that same transform to both tables so no test statistics leak in.
    scaler = StandardScaler().fit(train_data[used_features])
    train_data[used_features] = scaler.transform(train_data[used_features])
    test_data[used_features] = scaler.transform(test_data[used_features])

In [None]:
""" Apply kernel methods to increase available features without dropping original features """
if APPLY_KERNEL_METHOD:
    # Degree-2 expansion of `used_features` (squares and pairwise products).
    poly = PolynomialFeatures(degree=2, include_bias=False)

    train_expanded = poly.fit_transform(train_data[used_features])
    test_expanded = poly.transform(test_data[used_features])

    # With include_bias=False the output still starts with the degree-1 terms,
    # i.e. an exact copy of every original column. The originals are kept in
    # train_data / test_data already, so only terms whose total degree exceeds
    # 1 are appended; otherwise each original feature would appear twice.
    # `powers_` has one row per output term holding the exponent of each input.
    higher_order = poly.powers_.sum(axis=1) > 1
    poly_columns = [f"poly_{i}" for i in range(int(higher_order.sum()))]

    train_poly = pd.DataFrame(train_expanded[:, higher_order],
                              index=train_data.index, columns=poly_columns)
    test_poly = pd.DataFrame(test_expanded[:, higher_order],
                             index=test_data.index, columns=poly_columns)

    original_features = len(used_features)
    extended_features = original_features + train_poly.shape[1]

    # Rebinds both tables with the new columns appended on the right.
    train_data = pd.concat([train_data, train_poly], axis=1)
    test_data = pd.concat([test_data, test_poly], axis=1)

    print(f"Added {extended_features - original_features} features")
    # Counts every column of train_data, including id and label columns.
    print(f"Total features after expansion: {train_data.shape[1]}")

In [None]:
""" Apply forward stepwise feature selection """
if FEATURE_SELECTION == "Forward_stepwise_selection":
    # Greedy forward selection: each step adds the single remaining column that
    # gives the highest 10-fold CV accuracy for a spline + logistic-regression
    # pipeline, and the best-scoring subset seen over all steps is kept.
    all_features = list(train_data.columns)
    print(all_features)
    # Filter instead of list.remove(): remove() raises ValueError as soon as a
    # name listed in unused_features is missing from the training table.
    all_features = [name for name in all_features if name not in unused_features]
    print(all_features)

    # Loop-invariant pieces: the label column and the deterministic fold splitter.
    cur_train_label = train_data[label]
    kf = KFold(n_splits=10, shuffle=True, random_state=1)

    optimal_features = []       # best subset over all steps
    optimal_cur_features = []   # best subset found within the current step
    optimal_pre_features = []   # subset carried over from the previous step
    best_acc = 0
    for idx in range(len(all_features)):
        print(f"Starting Step {idx} ...")
        best_cur_acc = 0
        for new_feature in all_features:
            if new_feature in optimal_pre_features:
                continue
            cur_features = optimal_pre_features + [new_feature]

            # Reset the global RNGs before every evaluation so each candidate
            # is scored from the same random state.
            random.seed(1)
            np.random.seed(1)

            cur_train_features = train_data[cur_features]

            spline_transformer = SplineTransformer(n_knots=100, degree=3)
            model = LogisticRegression(random_state=1)
            pipeline = make_pipeline(spline_transformer, model)

            cv_scores = cross_val_score(pipeline, cur_train_features, cur_train_label,
                                        cv=kf, scoring='accuracy')
            cur_acc = np.mean(cv_scores)

            # `<=` means that on a tie the later candidate wins.
            if best_cur_acc <= cur_acc:
                optimal_cur_features = cur_features
                best_cur_acc = cur_acc
                print(f"Update acc: {best_cur_acc} with features: {optimal_cur_features}")

        # The step winner seeds the next step even when it did not improve on
        # the overall best, so the search always runs to the full feature set.
        optimal_pre_features = optimal_cur_features
        # `<=` means that on a tie the larger (later) subset is preferred.
        if best_acc <= best_cur_acc:
            optimal_features = optimal_cur_features
            best_acc = best_cur_acc
    print(f"Optimal features combination: {optimal_features} with best acc: {best_acc}")

    # Inputs consumed by the visualization, modeling and prediction cells.
    train_features = train_data[optimal_features]
    train_label = train_data[label]
    test_features = test_data[optimal_features]

In [None]:
""" Apply PCA to remove unnecessary features """
if FEATURE_SELECTION == "PCA":
    # Minimum share of total variance the retained components must explain.
    # Named once so the selection rule and the printed message cannot drift
    # apart (the message previously said 99% while the rule used 0.999).
    variance_threshold = 0.999

    # First pass: fit with all components to obtain the variance spectrum.
    # Only `used_features` enter the PCA; any other column is ignored here.
    pca = PCA()
    pca.fit(train_data[used_features])
    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, marker='o', linestyle='--')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('Cumulative Explained Variance Ratio by PCA')
    plt.grid(True)
    plt.show()

    # argmax on a boolean array returns the first True position, i.e. the
    # smallest component count whose cumulative ratio reaches the threshold.
    n_components = int(np.argmax(cumulative_variance_ratio >= variance_threshold)) + 1
    print(f"Number of principal components to retain ({variance_threshold:.1%} variance): {n_components}")

    # Second pass: refit with the chosen count; the test rows are projected
    # with the transform learned from the training rows.
    pca = PCA(n_components=n_components)
    train_features = pca.fit_transform(train_data[used_features])
    train_label = train_data[label]
    test_features = pca.transform(test_data[used_features])

In [None]:
""" Visualize the relationships among features """
# Wrap the selected features in a DataFrame first: the PCA path produces a
# bare array, which pairplot cannot take directly.
feature_frame = pd.DataFrame(train_features)
sns.pairplot(feature_frame)
plt.show()

In [None]:
""" Modeling using logistic regression  """
# Base estimator; its hyper-parameters are filled in by the grid search.
model = LogisticRegression(
    random_state=1,
)

# Two sub-grids because the l1 penalty is only paired with liblinear here,
# while the l2 penalty is tried with three other solvers.
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.1, 1, 10],
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'max_iter': [500]
    },
    {
        'penalty': ['l1'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear'],
        'max_iter': [500]
    }
]

# 5-fold shuffled CV with a fixed seed, scored on accuracy, using all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_label)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time would only repeat that work.
best_model = grid_search.best_estimator_
best_model

In [None]:
""" Modeling using RandomForest Classifier  """
# Base estimator; its hyper-parameters are filled in by the grid search.
model = RandomForestClassifier(
    random_state=1,
)

# 2 x 2 x 2 x 2 x 1 = 16 candidate configurations.
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [5, 10],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
}

# 5-fold shuffled CV with a fixed seed, scored on accuracy, using all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_label)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time would only repeat that work.
best_model = grid_search.best_estimator_
best_model

In [None]:
""" Modeling using Light Gradient Boosting  """
# Base estimator; verbosity=-1 silences LightGBM's own log output.
model = lgb.LGBMClassifier(
    random_state=1,
    verbosity=-1,
)

# 3 x 2 x 3 = 18 candidate configurations; the remaining entries are fixed.
param_grid = {
    'num_leaves': [20, 30, 40],
    'max_depth': [-1],
    'learning_rate': [0.1, 1],
    'n_estimators': [50, 100, 150],
    'min_child_samples': [20],
    'subsample': [0.8],
    # LightGBM only applies `subsample` when bagging is switched on through a
    # positive subsample_freq; with the default of 0 the 0.8 above is ignored.
    'subsample_freq': [1],
    'colsample_bytree': [0.8],
}

# 5-fold shuffled CV with a fixed seed, scored on accuracy, using all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=1),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_label)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: ", grid_search.best_score_)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time would only repeat that work.
best_model = grid_search.best_estimator_
best_model

In [None]:
""" Predict on test dataset using the best model """
# Predict with whichever estimator is currently bound to `best_model`, i.e.
# the one produced by the most recently executed modeling cell.
predictions = best_model.predict(test_features)

# Reload the raw test table so the id columns are read exactly as stored.
test_data = pd.read_csv(TEST_DATA_PATH)

# Build the ids column-wise. Indexing row by row with iloc materializes a
# mixed-dtype row Series, which pushes integer ids through float64 before
# int() and is far slower than vectorized string concatenation.
# The id is the two integers concatenated with no separator.
sample_ids = (test_data['query_id'].astype('int64').astype(str)
              + test_data['url_id'].astype('int64').astype(str))

# The DataFrame constructor raises ValueError if the number of predictions
# does not match the number of test rows, instead of silently truncating.
results_df = pd.DataFrame({
    'id': sample_ids.to_numpy(),
    'relevance': np.asarray(predictions).astype(int),
})
results_df.to_csv(OUTPUT_PREDICTION, index=False)
print(f"Predictions saved at {OUTPUT_PREDICTION}")