In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
os.chdir("..")

from helpers import *
from helpers_perso import *
from nan_imputation import *
from one_hot_encoding import *
from implementations import *
from standardization import *
from class_balancing import *
from remove_highly_correlated_features import *

%load_ext autoreload
%autoreload 2

Load data

In [None]:
# The dataset lives in <current working directory>/data/dataset.
data_path = os.path.join(os.getcwd(), *("data", "dataset"))

# load_csv_data is a project helper; it is unpacked into the train/test feature
# matrices, the training labels and the two id arrays.
loaded_arrays = load_csv_data(data_path)
x_train, x_test, y_train, train_ids, test_ids = loaded_arrays
print("Data loaded successfully!")


## Variance across columns

In [None]:
# Variance of every feature column of x_train, computed while ignoring NaN entries.
column_variances = np.nanvar(x_train, axis=0)

# One entry per panel: (number of bins, value range, title).
# A range of None lets the histogram span the full range of the data.
variance_panels = [
    (100, None, "Distribution of Variance Across Columns (Full Range)"),
    (20, (0, 1000), "Distribution of Variance Across Columns (0 to 1000)"),
    (100, (1, 200), "Distribution of Variance Across Columns (1 to 200)"),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Same data in every panel; only the binning and the zoom window change.
for ax, (n_bins, value_range, panel_title) in zip(axes, variance_panels):
    ax.hist(column_variances, bins=n_bins, range=value_range, edgecolor='black')
    ax.set_title(panel_title)
    ax.set_xlabel("Variance")
    ax.set_ylabel("Frequency")

# Display the plots
plt.tight_layout()
plt.show()

## Balancing the data

In [None]:
# Rows of x_train whose label is -1 (the majority class, per the plot title below).
class1_ids = np.where(y_train == -1)[0]
x_train_majority_class = x_train[class1_ids]

# Proportion of NaN features for each participant: axis=1 averages the NaN mask
# across the columns of every row, so there is one value per row (participant),
# not one per column.
nan_proportions = np.isnan(x_train_majority_class).mean(axis=1)

# Number of majority-class participants considered
total_participants = nan_proportions.size
print(f"Total number of participants in the majority class: {total_participants}")

# Number of those participants with at least one NaN feature
num_participants_with_nans = np.sum(nan_proportions > 0)
print(f"Number of participants containing NaN values: {num_participants_with_nans}")

# Histogram edges in 5% steps from 10% to 100%. Participants with less than 10%
# NaN values fall outside the first edge and are therefore not counted.
bins = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]

# Count the participants falling in each NaN-proportion bin
hist, bin_edges = np.histogram(nan_proportions, bins=bins)

# One "low-high%" tick label per bin
tick_labels = [f'{int(low*100)}-{int(high*100)}%' for low, high in zip(bins[:-1], bins[1:])]

plt.bar(range(len(hist)), hist, tick_label=tick_labels)
plt.xlabel('Proportion of NaN values')
plt.ylabel('Number of participants')
plt.title('Proportion of Nan in participants of majority class (-1)')

# Rotate the x-axis tick labels to vertical
plt.xticks(rotation=90)

plt.show()

In [None]:
# Ratio handed to balance_classes; it is also quoted as the "expected" ratio in
# the reprojection plots further down.
balancing_ratio = 1.25

# Shape before balancing
print(x_train.shape)

# balance_classes is a project helper; its result unpacks into the balanced
# features, the balanced labels and the ids it deleted.
balanced_result = balance_classes(x_train, y_train, balancing_ratio)
x_balanced, y_balanced, deleted_ids = balanced_result

# Shape after balancing
print(x_balanced.shape)

## Handling columns containing NaN

Proportion of NaN values in NaN-containing columns

In [None]:
# Fraction of missing entries per feature: axis=0 averages the NaN mask over
# the rows, giving one value per column of x_train.
nan_proportions = np.isnan(x_train).mean(axis=0)

# Summary counts: all columns, and those with at least one NaN
total_columns = nan_proportions.size
num_columns_with_nans = np.sum(nan_proportions > 0)
print(f"Total number of columns plotted: {total_columns}")
print(f"Number of columns containing NaN values: {num_columns_with_nans}")

# Histogram edges in 5% steps from 10% to 100%; columns below 10% NaN are not counted.
bins = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
hist, bin_edges = np.histogram(nan_proportions, bins=bins)

# Build one "low-high%" label per bin from consecutive edges
percent_labels = []
for lower_edge, upper_edge in zip(bins[:-1], bins[1:]):
    percent_labels.append(f'{int(lower_edge*100)}-{int(upper_edge*100)}%')

# Bar chart of the number of columns per NaN-proportion bin
plt.bar(range(len(hist)), hist, tick_label=percent_labels)
plt.xlabel('Proportion of NaN values')
plt.ylabel('Number of columns')
plt.title('Number of columns containing a proportion of NaN values')

# Vertical tick labels so the 18 bin labels do not overlap
plt.xticks(rotation=90)

plt.show()

Thus it is not reasonable to exclude every column containing NaN values.
Choice: remove columns whose NaN proportion exceeds 80%?

In [None]:
# Filter the balanced training features with remove_nan_features (project helper)
# using a 0.8 threshold. Presumably this drops columns whose NaN proportion
# exceeds 80% and returns the indices of the dropped columns - confirm in
# nan_imputation.
x_train_cleaned, deleted_indices = remove_nan_features(x_balanced, 0.8)

# Drop the same columns from the TEST features so both matrices share one column
# layout. This must start from x_test: deriving it from x_train would silently
# turn the "test" matrix into a copy of the training data.
adapted_x_test = np.delete(x_test, deleted_indices, axis=1)

# Train and test must end up with the same feature columns.
assert adapted_x_test.shape[1] == x_train_cleaned.shape[1], "train/test column mismatch after NaN filtering"


In [None]:
# identify_integer_columns is a project helper; it returns two collections of
# column indices (integer-only columns and the remaining ones).
integer_columns, non_integer_columns = identify_integer_columns(x_train_cleaned)

# Every column must land in exactly one of the two groups.
n_classified_columns = len(integer_columns) + len(non_integer_columns)
assert n_classified_columns == x_train_cleaned.shape[1]

print(f"Number of columns containing only integer values: {len(integer_columns)}")

# Count the integer columns in which the value 0 appears at least once
# (NaN entries never compare equal to 0, so they are not counted).
num_columns_with_zero = 0
for col in integer_columns:
    num_columns_with_zero += np.any(x_train_cleaned[:, col] == 0)

print(f"Percentage of integer columns that contain at least one zero: {num_columns_with_zero/len(integer_columns)*100:.2f}%")

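(Idea: if a column contains only integers and no zeros (i.e. it is an encoded categorical), NaN could be encoded as 0. The cell below currently replaces NaN with the mode instead.)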

In [None]:
# Training features: fill NaN in the integer columns first, then in the
# continuous ones. Both project helpers are called with replacement_value='mode'.
x_train_integers_filled = encode_nan_integer_columns(x_train_cleaned, replacement_value='mode')
x_train_cleaned_without_nans = encode_nan_continuous_columns(x_train_integers_filled, replacement_value='mode')

# Imputation must remove every NaN and keep the matrix shape.
assert not np.isnan(x_train_cleaned_without_nans).any()
assert x_train_cleaned_without_nans.shape == x_train_cleaned.shape

# Test features: same two-step imputation.
# NOTE(review): this is a separate call on the test matrix, so the replacement
# values are presumably derived from the test data itself rather than reused
# from training - confirm this is intended.
x_test_integers_filled = encode_nan_integer_columns(adapted_x_test, replacement_value='mode')
adapted_x_test_without_nans = encode_nan_continuous_columns(x_test_integers_filled, replacement_value='mode')

assert not np.isnan(adapted_x_test_without_nans).any()
assert adapted_x_test_without_nans.shape == adapted_x_test.shape

In [None]:
# Size of the complementary group: columns holding at least one non-integer value.
num_non_integer_columns = len(non_integer_columns)
print(f"Number of columns that do not contain only integer values: {num_non_integer_columns}")

## Identify categorical features

Number of unique values in the columns containing only integer values

In [None]:
def _range_labels(edges):
    """Return one 'low-high' label per bin, truncating each edge to an integer."""
    return [f'{int(low)}-{int(high)}' for low, high in zip(edges[:-1], edges[1:])]


def _bar_plot_unique_counts(labels, counts, xlabel, title):
    """Draw, on a new figure, the number of columns falling in each bin."""
    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts, width=0.6, edgecolor='black', alpha=0.7)
    plt.xlabel(xlabel)
    plt.ylabel('Number of Columns')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()


# Number of distinct values found in every integer-only column.
# NOTE(review): x_train_cleaned is the matrix before imputation, so NaN may be
# counted among the distinct values - confirm this is intended.
unique_value_counts = np.array([np.unique(x_train_cleaned[:, col]).size for col in integer_columns])

# Largest count, or 0 when there is no integer column at all
if unique_value_counts.size > 0:
    max_unique = unique_value_counts.max()
else:
    max_unique = 0

# View 1: 20 equal-width bins (21 edges) spanning 0..max_unique
bins = np.linspace(0, max_unique, 21)
bin_labels = _range_labels(bins)
binned_counts = np.histogram(unique_value_counts, bins=bins)[0]
_bar_plot_unique_counts(
    bin_labels,
    binned_counts,
    'Number of Unique Values in Columns',
    'Columns Grouped by Number of Unique Values (20 Bins)',
)

# View 2: 20 bins restricted to 0..2000 (counts outside this window are dropped)
bins_2000 = np.linspace(0, 2000, 21)
bin_labels_2000 = _range_labels(bins_2000)
binned_counts_2000 = np.histogram(unique_value_counts, bins=bins_2000)[0]
_bar_plot_unique_counts(
    bin_labels_2000,
    binned_counts_2000,
    'Number of Unique Values in Columns (0-2000)',
    'Columns Grouped by Number of Unique Values (20 Bins, 0-2000)',
)

# View 3: 20 bins restricted to 0..100
bins_100 = np.linspace(0, 100, 21)
bin_labels_100 = _range_labels(bins_100)
binned_counts_100 = np.histogram(unique_value_counts, bins=bins_100)[0]
_bar_plot_unique_counts(
    bin_labels_100,
    binned_counts_100,
    'Number of Unique Values in Columns (0-100)',
    'Columns Grouped by Number of Unique Values (20 Bins, 0-100)',
)

plt.show()

Given that the number of unique values lies mainly in the 0-5 range, let's treat a column as categorical if its count falls in this range.

In [129]:
# An integer column with at most this many distinct values is treated as categorical.
categorical_threshold = 5

# Distinct-value count of every integer-only column.
# NOTE(review): computed on x_train_cleaned (before imputation), so NaN may be
# counted among the distinct values - confirm this is intended.
unique_value_counts = np.array([np.unique(x_train_cleaned[:, col]).size for col in integer_columns])

# Partition the integer columns in a single pass over (column, count) pairs.
indexes_categorical_features = []
indexes_non_categorical_features = []
for column, n_unique in zip(integer_columns, unique_value_counts):
    if n_unique <= categorical_threshold:
        indexes_categorical_features.append(column)
    else:
        indexes_non_categorical_features.append(column)

# Sanity checks: every integer column was classified exactly once.
assert len(indexes_categorical_features) + len(indexes_non_categorical_features) == len(unique_value_counts)
assert unique_value_counts.size == len(integer_columns)

# Columns with non-integer values are never categorical; append them last.
indexes_non_categorical_features.extend(non_integer_columns)


## Standardization

In [130]:
# Only the non-categorical columns are standardized; the categorical ones are
# left untouched for the binary encoding step.
# (Alternative kept for reference: pass range(x_train_cleaned_without_nans.shape[1])
# to standardize every column.)
columns_to_scale = indexes_non_categorical_features

x_standardized = standardize_columns(x_train_cleaned_without_nans, columns_to_scale)

# NOTE(review): the test matrix goes through a separate standardize_columns call,
# so it is presumably scaled with its own statistics rather than the training
# ones - confirm this is intended.
x_test_standardized = standardize_columns(adapted_x_test_without_nans, columns_to_scale)

## Binary encoding

In [131]:
# Encode the categorical columns of train and test in one call so that both
# matrices are produced together.
# (Train-only alternative kept for reference:
#  binary_encode_columns(x_standardized, indexes_categorical_features).)
encoded_pair = consistent_binary_encode(
    x_standardized,
    x_test_standardized,
    indexes_categorical_features,
)
encoded_x_train, encoded_x_test = encoded_pair

In [None]:
print(encoded_x_train.shape)
print(encoded_x_test.shape)

## Removing Highly correlated features

In [None]:
# Manual feature selection based on correlation, followed by two quick fits on
# the reduced feature matrix.
X_ici = encoded_x_train
y_ici = y_balanced

# Threshold guide: 0.9 = high, 0.8 = moderate, 0.5-0.7 = low correlation.
X_reduced, removed_features = remove_highly_correlated_features(X_ici, threshold=0.9)
print(X_ici.shape)
print("Reduced feature matrix shape:", X_reduced.shape)

# Neither call below takes initial weights, an iteration count or a step size,
# so no such hyper-parameters are defined in this cell.
w,loss = least_squares(y_ici, X_reduced)
print(loss)

# Same fit with a regularisation strength of 0.1
w,loss = ridge_regression(y_ici, X_reduced, 0.1)
print(loss)

## Running model


In [None]:
X_ici = encoded_x_train
y_ici = y_balanced

# Fixed seed so the shuffle - and therefore the train/validation split and every
# downstream number - is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Shuffle the rows of X_ici and y_ici with the same permutation
shuffled_indices = split_rng.permutation(X_ici.shape[0])
X_ici = X_ici[shuffled_indices]
y_ici = y_ici[shuffled_indices]

# Manual 80/20 split of the shuffled rows
split_index = int(0.8 * X_ici.shape[0])
X_train_estim, X_test_estim = X_ici[:split_index], X_ici[split_index:]
y_train_estim, y_test_estim = y_ici[:split_index], y_ici[split_index:]

# Logistic regression hyper-parameters
initial_w = np.zeros(X_train_estim.shape[1])
max_iters = 150
gamma = 0.03

# NOTE(review): y_train encodes the majority class as -1 (see the balancing
# section); confirm that logistic_regression accepts this label encoding.
w, loss = logistic_regression(y_train_estim, X_train_estim, initial_w, max_iters, gamma)

# Candidate grids for a future hyper-parameter sweep:
# percentages_to_drop = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# nan_values_for_integer_columns = ['mode', 'upper', 'zero']
# nan_values_for_continuous_columns = ['mean', 'mode', 'zero']

## Plot the reprojection

In [None]:
# Logistic model: pass the linear scores of the training split through sigmoid.
y_pred = sigmoid(np.dot(X_train_estim, w))

# Split the outputs at 0.5: [0, 0.5] versus (0.5, 1].
in_lower_half = (y_pred >= 0) & (y_pred <= 0.5)
in_upper_half = (y_pred > 0.5) & (y_pred <= 1)
maj_class = np.sum(in_lower_half)
min_class = np.sum(in_upper_half)
ratio = maj_class/min_class
print(maj_class, min_class, ratio)

# Work on the current axes: annotation box in the top-right corner, then the histogram.
ax = plt.gca()
ax.text(
    0.95,
    0.95,
    f'Ratio: {ratio:.2f}, expected {balancing_ratio}',
    transform=ax.transAxes,
    fontsize=12,
    verticalalignment='top',
    horizontalalignment='right',
    bbox=dict(facecolor='white', alpha=0.5),
)
ax.hist(y_pred, bins=50, edgecolor='black')
ax.set_title(f'Reprojection of x_train before label prediction, balancing ratio {balancing_ratio}')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 1)
plt.show()

In [None]:
# Linear model: raw scores of the training split (no sigmoid applied).
y_pred = np.dot(X_train_estim, w)

# Split the scores at a single threshold of 0 so each sample is counted exactly
# once: negative scores on one side, non-negative scores on the other.
# NOTE(review): a threshold of 0 assumes -1/+1 targets - confirm against the
# labels the weights were fitted on.
neg_values = np.sum(y_pred < 0)
pos_values = np.sum(y_pred >= 0)

# Negative over positive count, compared with the balancing ratio in the plot
ratio = neg_values/pos_values
print(neg_values, pos_values, ratio)

# Annotation box in the top-right corner of the axes
plt.text(0.95, 0.95, f'Ratio: {ratio:.2f}, expected {balancing_ratio}', transform=plt.gca().transAxes, 
        fontsize=12, verticalalignment='top', horizontalalignment='right', 
        bbox=dict(facecolor='white', alpha=0.5))
plt.hist(y_pred, bins=50, edgecolor='black')
plt.title(f'Reprojection of x_train before label prediction, balancing ratio {balancing_ratio}')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.xlim(-2, 2)
plt.show()

## Compute metrics

In [None]:
from prediction_score import *

# Evaluate the fitted weights on the held-out 20% split; compute_scores is a
# project helper whose result unpacks into (accuracy, f1_score).
held_out_scores = compute_scores(y_test_estim, X_test_estim, w)
accuracy, f1_score = held_out_scores
print(accuracy, f1_score)