In [15]:
import pandas as pd
import numpy as np
import os

os.environ["OMP_NUM_THREADS"] = "3"
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score


In [72]:


# Evaluation-phase inputs. The block below is wrapped in a bare string literal, so it is
# evaluated and discarded rather than executed. It reads the train_data / pre_test CSVs
# and assigns the same three names as the active loads further down.
"""
# load the model accuracy results from the training data
model_accuracy_table = pd.read_csv('data/model_accuracy_training_results.csv')

# load the training and testing data

training_data = pd.read_csv('data/train_data.csv')
test_data = pd.read_csv('data/pre_test.csv')
"""

# Final-phase inputs (the active configuration). Later cells read these three
# module-level names: the accuracy table is filtered by its 'PassengerId' column and
# averaged per model column; training_data feeds the neighbour searches; test_data is
# the frame that gets predicted.
model_accuracy_table = pd.read_csv('data/model_accuracy_final_training_results.csv')

training_data = pd.read_csv('data/final_training_data.csv')
test_data = pd.read_csv('data/final_test_data.csv')




In [62]:
# function to find the k nearest neighbors of a test sample

def find_k_nearest_neighbors(test_sample, train_data, k):
    """
    Return the PassengerId values of the training rows closest to a test sample.

    Parameters:
    - test_sample: 1-D array-like of feature values; assumed to follow the column order
      of train_data once 'PassengerId' and 'Survived' are removed
    - train_data: pandas DataFrame with a 'PassengerId' column plus the feature columns
      (a 'Survived' column is dropped when present)
    - k: int, the number of neighbors requested

    Returns:
    - numpy array of PassengerId values ordered from nearest to farthest; it holds
      fewer than k entries when train_data has fewer than k rows
    """
    # errors='ignore' keeps this helper usable on frames without a 'Survived' column,
    # matching find_all_neighbors_in_cluster.
    features = train_data.drop(['PassengerId', 'Survived'], axis=1, errors='ignore')

    # kneighbors raises when asked for more neighbors than fitted samples, so cap the
    # request at the number of training rows.
    n_neighbors = min(k, len(features))

    # Initialize and fit the Nearest Neighbors model
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(features)

    # Only the row positions are needed, so skip computing the distance output
    indices = neigh.kneighbors([test_sample], return_distance=False)

    # Translate row positions back into PassengerId values
    return train_data.iloc[indices[0]]['PassengerId'].values



In [63]:
# function to find nearest neighbors based on threshold distance

def find_nearest_neighbors_by_distance(test_sample, train_data, dis):
    """
    Return the PassengerId values of training rows within a distance of a test sample.

    At most the 100 closest training rows are considered; of those, only rows whose
    Euclidean distance to the test sample is <= dis are kept.

    Parameters:
    - test_sample: 1-D array-like of feature values; assumed to follow the column order
      of train_data once 'PassengerId' and 'Survived' are removed
    - train_data: pandas DataFrame with a 'PassengerId' column plus the feature columns
      (a 'Survived' column is dropped when present)
    - dis: float, the distance threshold (inclusive)

    Returns:
    - numpy array of PassengerId values ordered from nearest to farthest; empty when no
      training row lies within the threshold
    """
    features = train_data.drop(['PassengerId', 'Survived'], axis=1, errors='ignore')

    # Initialize the Nearest Neighbors model
    neigh = NearestNeighbors(metric='euclidean')
    neigh.fit(features)

    # Look at a generous number of candidates, but never more than there are training
    # rows (kneighbors raises when n_neighbors exceeds the fitted sample count).
    n_candidates = min(100, len(features))
    distances, indices = neigh.kneighbors([test_sample], n_neighbors=n_candidates)

    # Both outputs have one row per query; there is a single query, so use row 0.
    # Filter the candidate row positions by the distance threshold, then map the
    # surviving positions back to PassengerId values.
    within_threshold = distances[0] <= dis
    threshold_indices = train_data.iloc[indices[0][within_threshold]]['PassengerId'].values

    return threshold_indices



In [64]:

def find_all_neighbors_in_cluster(test_sample, train_data, k = 22):
    """
    Cluster the training rows together with the test sample and report its cluster mates.

    The test sample is appended below the training features, K-means is fitted on the
    combined matrix, and every training row that received the same cluster label as the
    test sample is reported.

    Parameters:
    - test_sample: the feature vector of the test sample (a pandas Series is converted
      to a single-row array first)
    - train_data: pandas DataFrame with a 'PassengerId' column; 'PassengerId' and
      'Survived' are left out of the clustering features when present
    - k: int, the number of clusters to form

    Returns:
    - passenger_ids: list of the PassengerId values of the training rows that fall in
      the same cluster as the test sample
    """
    if isinstance(test_sample, pd.Series):
        test_sample = test_sample.values.reshape(1, -1)

    feature_frame = train_data.drop(['PassengerId', 'Survived'], axis=1, errors = 'ignore')

    # The test sample goes last so that its label sits at the final position
    stacked = np.vstack([feature_frame, test_sample])

    # Fixed random_state keeps the clustering repeatable between calls
    labels = KMeans(n_clusters = k, random_state = 42).fit(stacked).labels_

    # Split the labels into the training part and the appended test sample
    train_labels, sample_label = labels[:-1], labels[-1]

    # Row positions of the training samples that share the test sample's cluster
    same_cluster_positions = np.flatnonzero(train_labels == sample_label)

    return train_data.iloc[same_cluster_positions]['PassengerId'].tolist()







In [65]:
# function to calculate the weighted vote for a test sample

def weighted_vote(test_sample, model_dict, train_data, model_accuracy, k=20, n_clusters=9):
    """
    Classify a test sample with an accuracy-weighted vote across every model.

    The neighbourhood of the test sample is the set of training rows that K-means places
    in the same cluster (see find_all_neighbors_in_cluster). Each model's weight is its
    mean score in model_accuracy over that neighbourhood. Every model in model_dict
    votes for the class it predicts, and the class with the larger total weight wins.

    Parameters:
    - test_sample: 1-D feature vector of the test sample
    - model_dict: dict mapping a model_accuracy column name to a fitted model exposing
      predict()
    - train_data: pandas DataFrame handed to find_all_neighbors_in_cluster
    - model_accuracy: pandas DataFrame with a 'PassengerId' column and one score column
      per model
    - k: not used by this strategy; kept so existing calls that pass it keep working
    - n_clusters: int, number of K-means clusters (defaults to 9, the value that was
      previously hard-coded)

    Returns:
    - 1 when the weight behind class 1 is strictly greater than the weight behind
      class 0, otherwise 0 (ties go to 0)
    """
    # Find the IDs of the training samples clustered with the test sample
    nearest_ids = find_all_neighbors_in_cluster(test_sample, train_data, k=n_clusters)

    # Filter the model accuracy results for these IDs
    nearest_performance = model_accuracy[model_accuracy['PassengerId'].isin(nearest_ids)]

    # With no matching rows every mean below would be NaN, and NaN weights make the
    # final comparison False, i.e. a silent vote for class 0. Fall back to the
    # table-wide means so the vote stays meaningful.
    if nearest_performance.empty:
        nearest_performance = model_accuracy

    # Calculate the mean accuracy for each model for these samples
    mean_accuracies = nearest_performance.drop(['PassengerId'], axis=1).mean()

    # Initialize counters for weighted votes
    weighted_votes_0 = 0
    weighted_votes_1 = 0

    for model_name, model in model_dict.items():
        # Get the binary prediction for the test sample
        pred_class = model.predict([test_sample])[0]
        # Apply the weight based on the model's accuracy to the vote; any prediction
        # other than 0 counts toward class 1
        if pred_class == 0:
            weighted_votes_0 += mean_accuracies[model_name]
        else:
            weighted_votes_1 += mean_accuracies[model_name]

    # Determine the final classification based on the highest weighted vote
    final_vote = 1 if weighted_votes_1 > weighted_votes_0 else 0

    return final_vote

In [60]:
# Score the dynamic-selection strategy. Both test_model and select_top_model are defined
# in cells that appear further down in this file, so those cells must have been run in
# the kernel before this one.
test_model(select_top_model)

Model Accuracy:  0.8491620111731844


In [66]:
def select_top_model(test_sample, model_dict, train_data, model_accuracy, k=30):
    """
    This function uses a basic selection strategy to classify a test sample.
    First, it finds the k nearest neighbors of the test sample in the training data.
    Then, it selects the top model based on the mean accuracy of each model on those
    k nearest neighbors.
    It has that model classify the test sample, and returns the result.

    NOTE this is considered a Dynamic Classifier Selection (DCS) strategy rather than an ensemble method.

    Parameters:
    - test_sample: 1-D feature vector of the test sample
    - model_dict: dict mapping a model_accuracy column name to a fitted model exposing
      predict()
    - train_data: pandas DataFrame handed to find_k_nearest_neighbors
    - model_accuracy: pandas DataFrame with a 'PassengerId' column and one score column
      per model
    - k: int, number of nearest neighbors to consult (defaults to 30, the value that was
      previously hard-coded)

    Returns:
    - the class predicted for the test sample by the locally best model
    """
    # Find the IDs of the k nearest neighbors
    nearest_ids = find_k_nearest_neighbors(test_sample, train_data, k=k)

    # Filter the model accuracy results for these IDs
    nearest_performance = model_accuracy[model_accuracy['PassengerId'].isin(nearest_ids)]

    # Calculate the mean accuracy for each model for these nearest samples
    mean_accuracies = nearest_performance.drop(['PassengerId'], axis=1).mean()

    # Select the top model based on the mean accuracy (idxmax keeps the first on ties)
    top_model = mean_accuracies.idxmax()

    # Get the binary prediction for the test sample
    pred_class = model_dict[top_model].predict([test_sample])[0]
    return pred_class

In [34]:
def selective_weighted_vote(test_sample, model_dict, train_data, model_accuracy, k=20, thresh = 0.9):
    """
    Accuracy-weighted vote restricted to the locally strongest models.

    Each model is scored by its mean accuracy on the k nearest training neighbors of the
    test sample. Only models whose score is at least thresh times the best score are
    allowed to vote, and each vote is weighted by that model's score.

    With thresh set to zero every model with a non-negative score votes (a plain weighted
    vote over the neighborhood). With thresh set to one only the models tied for the best
    score vote, which resembles select_top_model except that ties all vote instead of
    .idxmax() picking the first model in the list.

    Parameters:
    - test_sample: 1-D feature vector of the test sample
    - model_dict: dict mapping a model_accuracy column name to a fitted model exposing
      predict()
    - train_data: pandas DataFrame handed to find_k_nearest_neighbors
    - model_accuracy: pandas DataFrame with a 'PassengerId' column and one score column
      per model
    - k: int, number of nearest neighbors to consult
    - thresh: float, fraction of the best score a model must reach to vote

    Returns:
    - 1 when the weight behind class 1 is strictly greater than the weight behind
      class 0, otherwise 0
    """
    # IDs of the k nearest training neighbors
    neighbor_ids = find_k_nearest_neighbors(test_sample, train_data, k)

    # Per-model mean accuracy restricted to those neighbors
    is_neighbor = model_accuracy['PassengerId'].isin(neighbor_ids)
    mean_accuracies = model_accuracy[is_neighbor].drop(['PassengerId'], axis=1).mean()

    # Keep the models that come within the threshold of the best local score
    cutoff = thresh * mean_accuracies.max()
    eligible_models = mean_accuracies[mean_accuracies >= cutoff]

    # Running totals of vote weight per class
    weighted_votes = {0: 0, 1: 0}

    for model_name, weight in eligible_models.items():
        pred_class = model_dict[model_name].predict([test_sample])[0]

        # Each vote counts as much as the model's local mean accuracy
        weighted_votes[pred_class] += weight

    # Class 1 needs a strictly larger total to win
    return 1 if weighted_votes[1] > weighted_votes[0] else 0

In [67]:
from joblib import load
import os


def load_models():
    """
    Load the five persisted classifiers from the 'models' directory.

    Returns:
    - dict mapping each accuracy-table column name ('ProportionCorrect...') to the object
      deserialized from the corresponding .pkl file

    NOTE: the files are deserialized with joblib's load, which can execute arbitrary code
    for a crafted file; only load model files from a trusted source.
    """
    models_dir = 'models'

    # Keys are the names the voting strategies use to look models up
    file_by_model = {
        'ProportionCorrectLogisticRegression': 'logistic_regression_model.pkl',
        'ProportionCorrectRandomForest': 'random_forest_model.pkl',
        'ProportionCorrectXGBoost': 'xgboost_model.pkl',
        'ProportionCorrectLightBoost': 'lightboost_model.pkl',
        'ProportionCorrectCatBoost': 'catboost_model.pkl'
    }

    loaded = {}
    for model_name, model_file in file_by_model.items():
        with open(os.path.join(models_dir, model_file), 'rb') as handle:
            loaded[model_name] = load(handle)

    return loaded



In [7]:
# Separate the test frame into model inputs (no ID, no label) and the true labels
X_test = test_data.drop(columns=['PassengerId', 'Survived'])
y_test = test_data['Survived']

In [36]:


# Inline evaluation of selective_weighted_vote over the whole test split. This performs
# the same steps as test_model(selective_weighted_vote), but at module level, so the
# names bound here (notably `models`) stay available to later cells.

# Initialize the predictions
ensemble_predictions = []

# Load the models
models = load_models()

# Iterate over the test data, classifying one row at a time
for index, row in X_test.iterrows():
    # The strategies expect a bare feature vector rather than a Series
    test_sample = row.values

    predicted_label = selective_weighted_vote(test_sample, models, training_data, model_accuracy_table)
    ensemble_predictions.append(predicted_label)

# Calculate the accuracy of the ensemble model
accuracy = accuracy_score(y_test, ensemble_predictions)

print('Model Accuracy: ', accuracy)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

In [11]:
def test_model(voting_strategy):
    """
    Score a voting strategy on the test split and print its accuracy.

    Reads the module-level X_test, y_test, training_data and model_accuracy_table, and
    reloads the persisted models on every call.

    Parameters:
    - voting_strategy: callable taking (test_sample, model_dict, train_data,
      model_accuracy) and returning the predicted label for that sample

    Returns:
    - None; the accuracy is printed
    """
    models = load_models()

    # One prediction per test row, in row order
    ensemble_predictions = [
        voting_strategy(row.values, models, training_data, model_accuracy_table)
        for _, row in X_test.iterrows()
    ]

    # Compare against the true labels
    accuracy = accuracy_score(y_test, ensemble_predictions)

    print('Model Accuracy: ', accuracy)

In [9]:
import warnings

# Silence warnings whose message starts with "X does not have valid feature names"
# (the `message` argument is matched against the beginning of the warning text).
# NOTE(review): presumably triggered because the strategies pass bare arrays to
# estimators that were fitted on DataFrames — confirm before relying on this filter.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [None]:
# Score the cluster-based weighted-vote strategy on the test split
test_model(weighted_vote)

In [61]:
# Score the thresholded weighted-vote strategy on the test split
test_model(selective_weighted_vote)

Model Accuracy:  0.8324022346368715


In [63]:
# Baselines: score every model by itself on the full test split, for comparison with
# the ensemble strategies.
# Logistic Regression

logistic_regression_model = models['ProportionCorrectLogisticRegression']
logistic_regression_predictions = logistic_regression_model.predict(X_test)
logistic_regression_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=logistic_regression_predictions,
)

print('Logistic Regression Model Accuracy: ', logistic_regression_accuracy)


Logistic Regression Model Accuracy:  0.8156424581005587


In [64]:
# Baseline: Random Forest by itself on the full test split

random_forest_model = models['ProportionCorrectRandomForest']
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=random_forest_predictions,
)

print('Random Forest Model Accuracy: ', random_forest_accuracy)


Random Forest Model Accuracy:  0.8324022346368715


In [65]:
# Baseline: XGBoost by itself on the full test split

xgboost_model = models['ProportionCorrectXGBoost']
xgboost_predictions = xgboost_model.predict(X_test)
xgboost_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=xgboost_predictions,
)

print('XGBoost Model Accuracy: ', xgboost_accuracy)

XGBoost Model Accuracy:  0.8156424581005587


In [66]:
# Baseline: LightBoost by itself on the full test split

lightboost_model = models['ProportionCorrectLightBoost']
lightboost_predictions = lightboost_model.predict(X_test)
lightboost_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=lightboost_predictions,
)

print('LightBoost Model Accuracy: ', lightboost_accuracy)

LightBoost Model Accuracy:  0.8324022346368715


In [67]:
# Baseline: CatBoost by itself on the full test split

catboost_model = models['ProportionCorrectCatBoost']
catboost_predictions = catboost_model.predict(X_test)
catboost_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=catboost_predictions,
)

print('CatBoost Model Accuracy: ', catboost_accuracy)

CatBoost Model Accuracy:  0.8156424581005587


In [83]:
# Final procedure to predict test data
# The string literal below is a bare expression used as a block comment; it outlines the
# steps this cell performs.

"""
Loop through each element in test data
run KNN to find the training samples most closely related to the testing sample
find the top model for the related training data
use the top model to predict the status of the testing sample
append the passengerID and the prediction to a list

save the list to a csv file
"""

# Accumulates [PassengerId, predicted label] pairs, one per test row
testing_predictions = []

models = load_models()

# NOTE: this rebinds the module-level X_test. Here only 'PassengerId' is removed from
# test_data (no 'Survived' column is dropped), and the IDs are kept alongside for output.
X_test, passenger_id = test_data.drop(['PassengerId'], axis=1), test_data['PassengerId']

for index, row in X_test.iterrows():
    # The strategy expects a bare feature vector rather than a Series
    test_sample = row.values

    predicted_label = select_top_model(test_sample, models, training_data, model_accuracy_table)
    # `index` is the row label from iterrows, so this is a label-based lookup of the ID
    testing_predictions.append([passenger_id[index], predicted_label])


# print the length of the predictions
print(len(testing_predictions))

# print the first 18 rows of testing_predictions
print(testing_predictions[:18])

# convert testing_predictions to a dataframe with the header row 'PassengerId, Survived'
# and write it out without the index column
testing_predictions = pd.DataFrame(testing_predictions, columns=['PassengerId', 'Survived'])
testing_predictions.to_csv('data/final_predictions.csv', index=False)



418
[[892, 0], [893, 0], [894, 0], [895, 0], [896, 1], [897, 0], [898, 1], [899, 0], [900, 1], [901, 0], [902, 0], [903, 0], [904, 1], [905, 0], [906, 1], [907, 1], [908, 0], [909, 0]]
