# **Preprocessing data**
In this file, the preprocessing of the data is performed.

### Feature correlation for determining number of redundant features

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Caculate the correlation matrix
correlation_matrix = features.corr()

#Identify highly correlated features
threshold = 0.88
upper = correlation_matrix.where( 
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)) #only upper triangle of the matrix (to avoid checking each pair twice)

to_drop = set()
for column in upper.columns:
    high_corr = upper[column][abs(upper[column]) > threshold].index.tolist() # checks for correlation above the threshold
    for correlated_feature in high_corr:
        if correlated_feature not in to_drop and column not in to_drop:
            to_drop.add(correlated_feature)

print(f"Number of features with correlation to drop (one per correlated pair for which correlation > {threshold}): {len(to_drop)}")

# Drop redundant features
subset_of_features_reduced = subset_of_features.drop(columns=to_drop)
print(f"Number of features that remain: {len(subset_of_features_reduced.columns)}")

# Splitting the data into training, validation and testing sets
x_train, x_test, y_train, y_test = train_test_split(subset_of_features_reduced, labels, test_size=0.2, random_state=42, stratify=labels)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Train Random Forest with class balancing to determine optimal threshold
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'  
)
rf_model.fit(x_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(x_val)
y_prob = rf_model.predict_proba(x_val)[:, 1]

# Metrics
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

print("\nROC AUC Score:", roc_auc_score(y_val, y_prob))

NameError: name 'features' is not defined

In [None]:
# Visualisation of threshold for dropping correlated features
correlation_data = {
    'threshold': [0.85, 0.87, 0.88, 0.9, 0.95, 0.99],
    'features_remaining': [1530, 1896, 2069, 2521, 4231, 7929],
    'f1_score_0': [0.91, 0.91, 0.91, 0.91, 0.91, 0.90],
    'f1_score_1': [0.15, 0.15, 0.22, 0.22, 0.22, 0.08],
}
correlation_df = pd.DataFrame(correlation_data)

# Create main plot with secondary y-axis
fig, ax1 = plt.subplots(figsize=(8, 4))

# Plot F1 scores
ax1.plot(correlation_df['threshold'], correlation_df['f1_score_0'], marker='o', label='F1 Score - Class 0', color='blue')
ax1.plot(correlation_df['threshold'], correlation_df['f1_score_1'], marker='o', label='F1 Score - Class 1', color='green')
ax1.set_xlabel("Correlation Threshold")
ax1.set_ylabel("F1 Score")
ax1.legend(loc='upper left', bbox_to_anchor=(0, 0.9))
ax1.grid(True)

# Plot features remaining on second y-axis
ax2 = ax1.twinx()
ax2.plot(correlation_df['threshold'], correlation_df['features_remaining'], marker='s', linestyle='--', color='gray', label='Features Remaining')
ax2.set_ylabel("Number of Features")
ax2.legend(loc='upper right', bbox_to_anchor=(1, 0.9))

# Title and layout
plt.title("F1 Scores and Number of Features vs Correlation Threshold")
plt.tight_layout()
plt.show()