In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, auc, classification_report, roc_auc_score, roc_curve
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pprint
import pandas as pd
import xgboost as xgb

In [5]:
# Labelling function
def labelling(df, col, num_bins):
    """
    Apply quantile-based binning to a column in the DataFrame.
    """
    df[col] = pd.qcut(df[col], q=num_bins, labels=False, duplicates='drop')
    return df

# Binning function for age groups
def bin_age_groups(train_df, test_df):
    """
    Bin the 'Age_y' column into defined age groups.
    """
    age_bin_edges = [0, 20, 25, 30, 35, 40, 45, 50, 55, 60, float('inf')]
    labels = ['Duoi 20', '20 toi 24', '25 toi 29', '30 toi 34', '35 toi 39', 
              '40 toi 44', '45 toi 49', '50 toi 54', '55 toi 59', 'Tren 60']

    train_df['Age_group'] = pd.cut(train_df['Age_y'], bins=age_bin_edges, labels=labels, right=False)
    test_df['Age_group'] = pd.cut(test_df['Age_y'], bins=age_bin_edges, labels=labels, right=False)
    return train_df, test_df

def preprocess_data(train_path, test_path):
    """
    Preprocess train and test datasets by aligning numeric columns, 
    binning age groups, and dropping unnecessary columns.
    """
    train = pd.read_csv(train_path)
    test = pd.read_parquet(test_path)

    # Drop specified columns
    cols_to_drop = ['Age_x', 'CIF_CLSCUS', 'COB_DATE', 'DATE_TIME', 
                    'BRN_OPN_CIF', 'MA_PHONG_GIAO_DICH_VCB', 
                    'CIF_MASK', 'IS_TM', 'Unnamed: 0', 
                    'SUM_CBALQ_LH_6m', 'SUM_CBALQ_LH_3m', 'AVG_GR_SUM_CBALQ_LH']
    train = train.drop(columns=[col for col in cols_to_drop if col in train.columns], errors='ignore')
    test = test.drop(columns=[col for col in cols_to_drop if col in test.columns], errors='ignore')

    # Bin age groups
    train, test = bin_age_groups(train, test)

    # Align numeric columns
    numeric_cols = train.select_dtypes(include=[np.number]).columns.intersection(
        test.select_dtypes(include=[np.number]).columns
    )
    train = train[numeric_cols]
    test = test[numeric_cols]

    print(f"Numeric columns aligned: {numeric_cols.tolist()}")
    return train, test



# Model functions (same as previous)
def save_model(train, model_path):
    """
    Save the trained model (normalized data and parameters) to a pickle file.
    """
    # Select only numeric columns
    numeric_cols = train.select_dtypes(include=[np.number]).columns
    X_train = train[numeric_cols].drop(columns=["IS_BANCAS"], errors='ignore').to_numpy()
    y_train = train["IS_BANCAS"].to_numpy()

    # Calculate mean and standard deviation
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)

    # Normalize X_train
    X_train_normalized = (X_train - mean) / (std +1e-8)

    # Save the model
    model = {
        "X_train": X_train_normalized,
        "y_train": y_train,
        "mean": mean,
        "std": std
    }
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    print(f"Model saved to {model_path}")

def predict_new_observation(new_observation, model_path, top_k=20):
    """
    Load the model and predict the expected IS_BANCAS value for a new observation.
    """
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    X_train = model["X_train"]
    y_train = model["y_train"]
    mean = model["mean"]
    std = model["std"]

    new_observation = (new_observation - mean) / std
    similarity = np.dot(X_train, new_observation)
    top_k_indices = np.argsort(-similarity)[:top_k]
    top_k_bancas = y_train[top_k_indices]
    predicted_bancas = round(top_k_bancas.mean())
    return predicted_bancas

def predict_first_n_observations(test, model_path, n=10, top_k=20):
    """
    Predict the "IS_BANCAS" value for the first N observations in the test dataset
    and print the actual and predicted values.
    """
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    X_train = model["X_train"]
    y_train = model["y_train"]
    mean = model["mean"]
    std = model["std"]

    # Get the first N observations and their actual IS_BANCAS values
    X_test = test.drop(columns=["IS_BANCAS"]).iloc[:n].to_numpy()
    actual_values = test["IS_BANCAS"].iloc[:n].to_numpy()

    print(f"X_train shape: {X_train.shape}")
    print(f"mean shape: {mean.shape}")
    print(f"std shape: {std.shape}")
    print(f"X_test shape: {X_test.shape}")

    # Normalize test data
    X_test = (X_test - mean) / (std + 1e-8)

    predictions = []
    for i in range(X_test.shape[0]):
        # Calculate similarity and predict IS_BANCAS
        similarity = np.dot(X_train, X_test[i])
        top_k_indices = np.argsort(-similarity)[:top_k]
        top_k_bancas = y_train[top_k_indices]
        predicted_bancas = top_k_bancas.mean()
        predictions.append(predicted_bancas)

        # Print the expected and actual values
        print(f"Observation {i + 1}: Predicted IS_BANCAS = {predicted_bancas}, Actual IS_BANCAS = {actual_values[i]}")

    return predictions
    
def predict_all_observations(test, model_path, top_k=20, print_n=10):
    """
    Predict the "IS_BANCAS" value for all observations in the test dataset
    and add the predictions as a new column. Print out predictions for the first `print_n` rows.
    """
    with open(model_path, "rb") as f:
        model = pickle.load(f)

    X_train = model["X_train"]
    y_train = model["y_train"]
    mean = model["mean"]
    std = model["std"]

    # Get all observations from the test set (excluding IS_BANCAS column)
    X_test = test.drop(columns=["IS_BANCAS"]).to_numpy()

    # Normalize test data
    X_test = (X_test - mean) / (std + 1e-8)

    predictions = []
    for i in range(X_test.shape[0]):
        # Calculate similarity and predict IS_BANCAS
        similarity = np.dot(X_train, X_test[i])
        top_k_indices = np.argsort(-similarity)[:top_k]
        top_k_bancas = y_train[top_k_indices]
        predicted_bancas = top_k_bancas.modeinter()
        predictions.append(predicted_bancas)

    # Add the predictions to the test DataFrame
    test["predicted_IS_BANCAS"] = predictions

    # Optionally, print out the first few predictions for inspection
    print(f"Showing the first {min(print_n, len(test))} predictions:")
    print(test[['IS_BANCAS', 'predicted_IS_BANCAS']].head(print_n))

    return test


# File paths
train_path = "data.csv"
test_path = "data-val.parquet"
model_path = "results/collaborative_search_dot_product_model.pkl"

# Step 1: Preprocess data
print("Preprocessing data...")
train, test = preprocess_data(train_path, test_path)

# Step 2: Train and save the model
print("Training and saving the model...")
save_model(train, model_path)




Preprocessing data...
Numeric columns aligned: ['CBALQ_3m', 'AVG_SL_SP_BOSUNG', 'NO_TREN_CO_6m', 'SUM_CBALQ_LH', 'BHNT_flag', 'MEDIAN_GR_SUM_AMT', 'BHNT_after21', 'Sum_PPC', 'MEDIAN_GR_THGCO', 'BHSK_remain', 'IS_BANCAS', 'AVG_GR_CBALQ', 'CBALQ_6m', 'AVG_CBALQ_6m', 'BHNT_remain', 'AVG_GR_THGCO', 'IS_TM.1', 'Age_y', 'THGCO_3m', 'CNT_TGCCKH', 'THGNO_6m', 'IS_TA', 'TONGTHUNHAPHANGTHANG', 'Snapshot', 'BHSK_flag', 'THGCO_6m', 'MEDIAN_GR_CBALQ', 'AVG_CBALQ_TGCCKH', 'THGNO_3m', 'AVG_AMT_3M', 'NO_TREN_CO_3m', 'AVG_CBALQ_3m', 'BHSK_after21', 'Payroll_Flag', 'AVG_GR_THGNO', 'MEDIAN_GR_THGNO']
Training and saving the model...


  x = asanyarray(arr - arrmean)
  X_train_normalized = (X_train - mean) / (std +1e-8)


Model saved to results/collaborative_search_dot_product_model.pkl


In [6]:

# Step 3: Predict all observations in the test dataset and show the first 10 predictions
print("Predicting all observations in the test dataset...")
# predicted_test_data = predict_all_observations(test, model_path, top_k=20, print_n=10)
predict_first_n_observations(test, model_path, n=10, top_k=20)


Predicting all observations in the test dataset...
X_train shape: (1088451, 35)
mean shape: (35,)
std shape: (35,)
X_test shape: (1000, 35)


  X_test = (X_test - mean) / (std + 1e-8)


Observation 1: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 2: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 3: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 4: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 5: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 6: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 7: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 8: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 9: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 10: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 11: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 12: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 13: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 14: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 15: Predicted IS_BANCAS = 1, Actual IS_BANCAS = 1.0
Observation 16: Predicted IS_BANCAS = 1, Actual I

KeyboardInterrupt: 