In [None]:
import faiss
import numpy as np
import dill
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from concurrent.futures import ThreadPoolExecutor

# Load the training set (CSV) and the validation set (Parquet).
train_df = pd.read_csv('data.csv')
test_df = pd.read_parquet('data-val.parquet')

# Columns treated as irrelevant; they are removed from both frames below.
drop_columns = [
    'Age_x', 'CIF_CLSCUS', 'COB_DATE', 'DATE_TIME', 'BRN_OPN_CIF',
    'MA_PHONG_GIAO_DICH_VCB', 'CIF_MASK', 'IS_TM', 'Unnamed: 0',
    'SUM_CBALQ_LH_6m', 'SUM_CBALQ_LH_3m', 'AVG_GR_SUM_CBALQ_LH',
]

# Columns shared by both frames, and the ones that appear in only one of them.
common_columns = train_df.columns.intersection(test_df.columns)
removed_columns = set(train_df.columns) ^ set(test_df.columns)

# Restrict each frame to the shared columns, then drop the irrelevant ones.
train_df_filtered = train_df[common_columns].drop(columns=drop_columns)
test_df_filtered = test_df[common_columns].drop(columns=drop_columns)

# Step 1: Handle missing values and ensure consistent data types
def handle_missing_values(df):
    """Return ``df`` with every cell converted to ``str`` and missing cells set to "None".

    The missing-value mask is taken *before* the string conversion: once a
    column has been cast with ``astype(str)`` a NaN has already become the
    literal text ``'nan'`` and is no longer detectable with ``fillna``.

    Args:
        df: DataFrame whose columns may have mixed dtypes and missing values.

    Returns:
        A new DataFrame of the same shape holding strings only, where every
        originally-missing cell contains the string ``"None"``.
    """
    missing = df.isna()
    as_text = df.astype(str)
    return as_text.mask(missing, "None")

# Step 2: Preprocess - Encoding categorical variables
def preprocess_data(df, target_column, transformer=None):
    """Split ``df`` into one-hot encoded features and a float label series.

    Args:
        df: DataFrame containing the feature columns and ``target_column``.
        target_column: Name of the label column.
        transformer: An already-fitted ColumnTransformer to reuse. When it is
            ``None`` a new one is created and fitted on the features of ``df``.

    Returns:
        A tuple ``(features, labels, transformer)``: the transformed features
        cast to ``float32``, the labels cast to ``float``, and the transformer
        that produced the features.
    """
    # A fresh 0..n-1 index keeps row positions aligned with FAISS ids.
    features = df.drop(columns=[target_column]).reset_index(drop=True)
    labels = df[target_column].reset_index(drop=True)

    # Everything is turned into strings so each column can be one-hot encoded.
    features = handle_missing_values(features)

    if transformer is None:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        transformer = ColumnTransformer(
            transformers=[('cat', encoder, features.columns.tolist())],
            remainder='passthrough',
        )
        transformer.fit(features)

    encoded = transformer.transform(features)
    return encoded.astype(np.float32), labels.astype(float), transformer

# Step 3: Create FAISS Index with Product Quantization
def create_faiss_index(df, target_column, n_clusters=256):
    """Build, train and populate an IVF-PQ FAISS index over the encoded rows of ``df``.

    Args:
        df: DataFrame containing the feature columns and ``target_column``.
        target_column: Name of the label column; it is not indexed.
        n_clusters: Upper bound on the number of product-quantization
            sub-quantizers. Despite its name, this value is what gets passed
            to ``faiss.IndexIVFPQ`` as the sub-quantizer count ``M``. FAISS
            requires the vector dimension to be a multiple of ``M``, so the
            largest divisor of the dimension that does not exceed
            ``n_clusters`` is used; when the dimension is already a multiple
            of ``n_clusters`` the value is used unchanged.

    Returns:
        A tuple ``(faiss_index, X, y, transformer)``: the populated index, the
        ``float32`` feature matrix that was indexed, the label series and the
        fitted transformer.
    """
    X, y, transformer = preprocess_data(df, target_column)

    d = X.shape[1]  # dimensionality of the encoded vectors
    nlist = 100  # number of coarse centroids (inverted lists)

    # The one-hot width is data dependent, so it is rarely a multiple of the
    # requested sub-quantizer count; IndexIVFPQ rejects any M with d % M != 0.
    upper = max(1, min(n_clusters, d))
    n_subquantizers = next(m for m in range(upper, 0, -1) if d % m == 0)

    quantizer = faiss.IndexFlatL2(d)  # coarse quantizer
    faiss_index = faiss.IndexIVFPQ(quantizer, d, nlist, n_subquantizers, 8)  # 8 bits per sub-quantizer code

    # Train on all encoded rows, then add those same rows; ids are row positions.
    faiss_index.train(X)
    faiss_index.add(X)

    return faiss_index, X, y, transformer

# Step 4: Predict using FAISS with PQ
def predict_faiss_parallel(df, target_column, faiss_index, transformer, y_train, n_neighbors=20, n_jobs=4):
    """Score each row of ``df`` as the mean training label of its nearest neighbours.

    Args:
        df: DataFrame containing the feature columns and ``target_column``.
        target_column: Name of the label column (removed before encoding).
        faiss_index: FAISS index whose ids are row positions into ``y_train``.
        transformer: Fitted transformer used to encode ``df``.
        y_train: Training labels, positionally aligned with the index ids.
        n_neighbors: Number of neighbours requested per row.
        n_jobs: Number of worker threads; the rows are split into that many
            batches and each batch is searched with a single FAISS call.

    Returns:
        1-D ``numpy`` array with one score per row of ``df``. Rows for which
        FAISS finds no neighbour at all receive the overall mean of
        ``y_train``.
    """
    X, _, _ = preprocess_data(df, target_column, transformer)
    if len(X) == 0:
        return np.array([])

    labels = np.asarray(y_train, dtype=float)
    known = ~np.isnan(labels)
    fallback = labels[known].mean() if known.any() else np.nan

    def process_chunk(chunk):
        _, neighbor_ids = faiss_index.search(np.ascontiguousarray(chunk), n_neighbors)
        # FAISS pads the result with id -1 when fewer than n_neighbors vectors
        # are found; those slots must not be used as (negative) positions.
        found = neighbor_ids >= 0
        neighbor_labels = labels[np.where(found, neighbor_ids, 0)]
        usable = found & ~np.isnan(neighbor_labels)
        counts = usable.sum(axis=1)
        sums = np.where(usable, neighbor_labels, 0.0).sum(axis=1)
        return np.where(counts > 0, sums / np.maximum(counts, 1), fallback)

    n_chunks = max(1, min(n_jobs or 1, len(X)))
    with ThreadPoolExecutor(max_workers=n_chunks) as executor:
        parts = list(executor.map(process_chunk, np.array_split(X, n_chunks)))

    return np.concatenate(parts)

# Step 5: Train the model
target_column = 'IS_BANCAS'
# Use the filtered frames: the unfiltered ones still contain every column
# listed in drop_columns, which would otherwise be one-hot encoded and indexed.
faiss_index, _, y_train, transformer = create_faiss_index(train_df_filtered, target_column)

# Save the FAISS index
faiss.write_index(faiss_index, 'results/faiss_index.index')

# Save the transformer, y_train, and prediction function using dill
with open('results/collaborative_search_model_with_pq.pkl', 'wb') as f:
    dill.dump({
        'transformer': transformer,
        'y_train': y_train,
        'predict_faiss_parallel': predict_faiss_parallel  # Save the function
    }, f)

# Step 6: Testing

# On training data
y_pred_col_train = predict_faiss_parallel(train_df_filtered, target_column, faiss_index, transformer, y_train)
y_pred_col_train_round = y_pred_col_train.round()

# On testing data
y_pred_col_test = predict_faiss_parallel(test_df_filtered, target_column, faiss_index, transformer, y_train)
y_pred_col_test_round = y_pred_col_test.round()
y_test = test_df_filtered[target_column]

# Reporting
# Accuracy is computed on the rounded scores; ROC AUC and Gini use the raw scores.
# The built-in round() is used instead of the .round() method so the code works
# whether the metric functions return numpy scalars or plain Python floats.

# On training data
print('\nTESTING ON TRAINING DATA:\n')

accuracy_col_train = round(accuracy_score(y_train, y_pred_col_train_round), 4)
roc_auc_score_col_train = roc_auc_score(y_train, y_pred_col_train)
gini_col_train = 2 * roc_auc_score_col_train - 1

# Formatted with two decimals to avoid float artefacts such as 97.61999999999999.
print('Model Accuracy:', f'{accuracy_col_train * 100:.2f}%')
print('\nClassification Report:')
print(classification_report(y_train, y_pred_col_train_round))
print("ROC AUC Score:", round(roc_auc_score_col_train, 2))
print("Gini Index:", round(gini_col_train, 2))

# On testing data
print('\nTESTING ON TESTING DATA:\n')

accuracy_col_test = round(accuracy_score(y_test, y_pred_col_test_round), 4)
roc_auc_score_col_test = roc_auc_score(y_test, y_pred_col_test)
gini_col_test = 2 * roc_auc_score_col_test - 1

print('Model Accuracy:', f'{accuracy_col_test * 100:.2f}%')
print('\nClassification Report:')
print(classification_report(y_test, y_pred_col_test_round))
print("ROC AUC Score:", round(roc_auc_score_col_test, 2))
print("Gini Index:", round(gini_col_test, 2))

# ROC Curves
# NOTE(review): `plt` is not imported anywhere in this cell; presumably
# `matplotlib.pyplot` is imported as `plt` elsewhere in the notebook — TODO confirm.

# Train data: curve points and area, with an extra leading 0 on each axis
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_pred_col_train)
roc_auc_train = auc(fpr_train, tpr_train)
fpr_train = [0, *fpr_train]
tpr_train = [0, *tpr_train]

# Test data: same treatment
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_pred_col_test)
roc_auc_test = auc(fpr_test, tpr_test)
fpr_test = [0, *fpr_test]
tpr_test = [0, *tpr_test]

# Colour palette for the dark-themed figure
background_color = '#181926'
text_colour = "#cad3f5"
axis_colour = "#b8c0e0"
guess_colour = '#8aadf4'
roc_train_colour = "#f0c6c6"
roc_test_colour = "#91d7e3"

# Figure and axes share the same background colour
plt.figure(figsize=(10, 8), facecolor=background_color)
ax = plt.gca()
ax.set_facecolor(background_color)

# Ticks and all four spines use the axis colour
for tick_axis in ('x', 'y'):
    ax.tick_params(axis=tick_axis, colors=axis_colour)
for side in ('bottom', 'left', 'top', 'right'):
    ax.spines[side].set_color(axis_colour)

# Train curve, test curve, then the diagonal of a random classifier
plt.plot(fpr_train, tpr_train, color=roc_train_colour, label=f'Train Data ROC Curve (AUC = {roc_auc_train:.2f})')
plt.plot(fpr_test, tpr_test, color=roc_test_colour, label=f'Test data ROC Curve (AUC = {roc_auc_test:.2f})')
plt.plot([0, 1], [0, 1], color=guess_colour, linestyle='--', label='Random Guess')

# Labels, title and legend are drawn in the text colour
plt.xlabel('False Positive Rate', color=text_colour)
plt.ylabel('True Positive Rate', color=text_colour)
plt.title('Next Best Action Receiver Operating Characteristic (ROC) Curve', color=text_colour)
plt.legend(loc='lower right', facecolor=background_color, edgecolor=text_colour, labelcolor=text_colour)
plt.grid(alpha=0.1, color=axis_colour)

# Both axes span exactly [0, 1]
plt.xlim(0, 1)
plt.ylim(0, 1)

plt.savefig('results/NBA_ROC.png')
plt.show()

In [None]:
import numpy as np
import dill
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.neighbors import NearestNeighbors
from datasketch import MinHash, MinHashLSH
from concurrent.futures import ThreadPoolExecutor

# Load the training set (CSV) and the validation set (Parquet).
train_df = pd.read_csv('data.csv')
test_df = pd.read_parquet('data-val.parquet')

# Columns treated as irrelevant; they are removed from both frames below.
drop_columns = [
    'Age_x', 'CIF_CLSCUS', 'COB_DATE', 'DATE_TIME', 'BRN_OPN_CIF',
    'MA_PHONG_GIAO_DICH_VCB', 'CIF_MASK', 'IS_TM', 'Unnamed: 0',
    'SUM_CBALQ_LH_6m', 'SUM_CBALQ_LH_3m', 'AVG_GR_SUM_CBALQ_LH',
]

# Columns shared by both frames, and the ones that appear in only one of them.
common_columns = train_df.columns.intersection(test_df.columns)
removed_columns = set(train_df.columns) ^ set(test_df.columns)

# Restrict each frame to the shared columns, then drop the irrelevant ones.
train_df_filtered = train_df[common_columns].drop(columns=drop_columns)
test_df_filtered = test_df[common_columns].drop(columns=drop_columns)

# Step 1: Handle missing values and ensure consistent data types
def handle_missing_values(df):
    """Return ``df`` with every cell converted to ``str`` and missing cells set to "None".

    The missing-value mask is taken *before* the string conversion: once a
    column has been cast with ``astype(str)`` a NaN has already become the
    literal text ``'nan'`` and is no longer detectable with ``fillna``.

    Args:
        df: DataFrame whose columns may have mixed dtypes and missing values.

    Returns:
        A new DataFrame of the same shape holding strings only, where every
        originally-missing cell contains the string ``"None"``.
    """
    missing = df.isna()
    as_text = df.astype(str)
    return as_text.mask(missing, "None")

# Step 2: Preprocess - Encoding categorical variables
def preprocess_data(df, target_column, transformer=None):
    """Split ``df`` into one-hot encoded features and a float label series.

    Args:
        df: DataFrame containing the feature columns and ``target_column``.
        target_column: Name of the label column.
        transformer: An already-fitted ColumnTransformer to reuse. When it is
            ``None`` a new one is created and fitted on the features of ``df``.

    Returns:
        A tuple ``(features, labels, transformer)``: the transformed features
        cast to ``float32``, the labels cast to ``float``, and the transformer
        that produced the features.
    """
    # A fresh 0..n-1 index keeps row positions aligned with the LSH keys.
    features = df.drop(columns=[target_column]).reset_index(drop=True)
    labels = df[target_column].reset_index(drop=True)

    # Everything is turned into strings so each column can be one-hot encoded.
    features = handle_missing_values(features)

    if transformer is None:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        transformer = ColumnTransformer(
            transformers=[('cat', encoder, features.columns.tolist())],
            remainder='passthrough',
        )
        transformer.fit(features)

    encoded = transformer.transform(features)
    return encoded.astype(np.float32), labels.astype(float), transformer

# Step 3: Create LSH Index using MinHash
def _row_minhash(vector, num_perm):
    """Return a MinHash sketch of the positions of the non-zero entries of ``vector``.

    A MinHash summarises a *set*, so feeding it the raw one-hot values would
    give every row the same two-element set {'0.0', '1.0'}. Hashing the
    positions of the active features instead makes the estimated Jaccard
    similarity reflect how many encoded categories two rows share.
    """
    sketch = MinHash(num_perm=num_perm)
    for position in np.flatnonzero(vector):
        sketch.update(str(position).encode('utf8'))
    return sketch


def create_lsh_index(df, target_column, num_hashes=200):
    """Build a MinHash LSH index over the one-hot encoded rows of ``df``.

    Args:
        df: DataFrame containing the feature columns and ``target_column``.
        target_column: Name of the label column; it is not indexed.
        num_hashes: Number of permutations used by every MinHash and by the
            LSH index.

    Returns:
        A tuple ``(lsh, X, y, transformer)``: the populated index keyed by row
        position, the ``float32`` feature matrix, the label series and the
        fitted transformer.
    """
    X, y, transformer = preprocess_data(df, target_column)

    # Step 3.1: MinHash LSH index (0.9 is the Jaccard similarity threshold)
    lsh = MinHashLSH(threshold=0.9, num_perm=num_hashes)

    # Keys are row positions so query results can index y positionally.
    for i, vector in enumerate(X):
        lsh.insert(i, _row_minhash(vector, num_hashes))

    return lsh, X, y, transformer

# Step 4: Predict using LSH
def predict_lsh_parallel(df, target_column, lsh, transformer, y_train, n_neighbors=20, n_jobs=4):
    """Score each row of ``df`` as the mean training label of its LSH candidates.

    Args:
        df: DataFrame containing the feature columns and ``target_column``.
        target_column: Name of the label column (removed before encoding).
        lsh: MinHashLSH index whose keys are row positions into ``y_train``.
        transformer: Fitted transformer used to encode ``df``.
        y_train: Training labels, positionally aligned with the index keys.
        n_neighbors: Unused; kept so existing callers keep working. The LSH
            query returns every candidate above the index threshold.
        n_jobs: Number of worker threads.

    Returns:
        1-D ``numpy`` array with one score per row of ``df``. Rows with no
        candidate in the index receive the overall mean of ``y_train``.
    """
    X, _, _ = preprocess_data(df, target_column, transformer)

    # Query sketches must use the same number of permutations as the index;
    # MinHashLSH keeps that number in its ``h`` attribute.
    num_perm = getattr(lsh, 'h', 200)
    fallback = y_train.mean()

    # Function to process a single observation
    def process_vector(test_vector):
        result = lsh.query(_row_minhash(test_vector, num_perm))
        if not result:
            # No similar training row: avoid the NaN an empty mean would give.
            return fallback
        return y_train.iloc[result].mean()

    # Parallelize using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=n_jobs) as executor:
        y_pred = list(executor.map(process_vector, X))

    return np.array(y_pred)

# Step 5: Train the model
target_column = 'IS_BANCAS'
# Use the filtered frames: the unfiltered ones still contain every column
# listed in drop_columns, which would otherwise be one-hot encoded and hashed.
lsh, _, y_train, transformer = create_lsh_index(train_df_filtered, target_column)

# Save the transformer, y_train, and prediction function using dill
with open('results/collaborative_search_model_with_lsh.pkl', 'wb') as f:
    dill.dump({
        'transformer': transformer,
        'y_train': y_train,
        'predict_lsh_parallel': predict_lsh_parallel  # Save the function
    }, f)

# Step 6: Testing

# On training data
y_pred_col_train = predict_lsh_parallel(train_df_filtered, target_column, lsh, transformer, y_train)
y_pred_col_train_round = y_pred_col_train.round()

# On testing data
y_pred_col_test = predict_lsh_parallel(test_df_filtered, target_column, lsh, transformer, y_train)
y_pred_col_test_round = y_pred_col_test.round()
y_test = test_df_filtered[target_column]

# Reporting
# Accuracy is computed on the rounded scores; ROC AUC and Gini use the raw scores.
# The built-in round() is used instead of the .round() method so the code works
# whether the metric functions return numpy scalars or plain Python floats.

# On training data
print('\nTESTING ON TRAINING DATA:\n')

accuracy_col_train = round(accuracy_score(y_train, y_pred_col_train_round), 4)
roc_auc_score_col_train = roc_auc_score(y_train, y_pred_col_train)
gini_col_train = 2 * roc_auc_score_col_train - 1

# Formatted with two decimals to avoid float artefacts such as 97.61999999999999.
print('Model Accuracy:', f'{accuracy_col_train * 100:.2f}%')
print('\nClassification Report:')
print(classification_report(y_train, y_pred_col_train_round))
print("ROC AUC Score:", round(roc_auc_score_col_train, 2))
print("Gini Index:", round(gini_col_train, 2))

# On testing data
print('\nTESTING ON TESTING DATA:\n')

accuracy_col_test = round(accuracy_score(y_test, y_pred_col_test_round), 4)
roc_auc_score_col_test = roc_auc_score(y_test, y_pred_col_test)
gini_col_test = 2 * roc_auc_score_col_test - 1

print('Model Accuracy:', f'{accuracy_col_test * 100:.2f}%')
print('\nClassification Report:')
print(classification_report(y_test, y_pred_col_test_round))
print("ROC AUC Score:", round(roc_auc_score_col_test, 2))
print("Gini Index:", round(gini_col_test, 2))

# ROC Curves
# NOTE(review): `plt` is not imported anywhere in this cell; presumably
# `matplotlib.pyplot` is imported as `plt` elsewhere in the notebook — TODO confirm.

# Train data: curve points and area, with an extra leading 0 on each axis
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_pred_col_train)
roc_auc_train = auc(fpr_train, tpr_train)
fpr_train = [0, *fpr_train]
tpr_train = [0, *tpr_train]

# Test data: same treatment
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_pred_col_test)
roc_auc_test = auc(fpr_test, tpr_test)
fpr_test = [0, *fpr_test]
tpr_test = [0, *tpr_test]

# Colour palette for the dark-themed figure
background_color = '#181926'
text_colour = "#cad3f5"
axis_colour = "#b8c0e0"
guess_colour = '#8aadf4'
roc_train_colour = "#f0c6c6"
roc_test_colour = "#91d7e3"

# Figure and axes share the same background colour
plt.figure(figsize=(10, 8), facecolor=background_color)
ax = plt.gca()
ax.set_facecolor(background_color)

# Ticks and all four spines use the axis colour
for tick_axis in ('x', 'y'):
    ax.tick_params(axis=tick_axis, colors=axis_colour)
for side in ('bottom', 'left', 'top', 'right'):
    ax.spines[side].set_color(axis_colour)

# Train curve, test curve, then the diagonal of a random classifier
plt.plot(fpr_train, tpr_train, color=roc_train_colour, label=f'Train Data ROC Curve (AUC = {roc_auc_train:.2f})')
plt.plot(fpr_test, tpr_test, color=roc_test_colour, label=f'Test data ROC Curve (AUC = {roc_auc_test:.2f})')
plt.plot([0, 1], [0, 1], color=guess_colour, linestyle='--', label='Random Guess')

# Labels, title and legend are drawn in the text colour
plt.xlabel('False Positive Rate', color=text_colour)
plt.ylabel('True Positive Rate', color=text_colour)
plt.title('Next Best Action Receiver Operating Characteristic (ROC) Curve', color=text_colour)
plt.legend(loc='lower right', facecolor=background_color, edgecolor=text_colour, labelcolor=text_colour)
plt.grid(alpha=0.1, color=axis_colour)

# Both axes span exactly [0, 1]
plt.xlim(0, 1)
plt.ylim(0, 1)

# NOTE(review): the FAISS cell saves its ROC plot to this same path, so running
# both cells overwrites the earlier image — confirm that is intended.
plt.savefig('results/NBA_ROC.png')
plt.show()