**Table of contents**<a id='toc0_'></a>    
- [Prepare the notebook](#toc1_)    
  - [Import necessary libraries](#toc1_1_)    
  - [Import the datasets](#toc1_2_)    
- [PCA](#toc2_)    
- [Distributional approach](#toc3_)    
- [Connectivity approach](#toc4_)    
- [One-class SVM](#toc5_)    
- [Isolation forest](#toc6_)    
  - [Get the final set of outlier rows: those flagged by a majority of the tests](#toc6_1_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Prepare the notebook](#toc0_)

## <a id='toc1_1_'></a>[Import necessary libraries](#toc0_)

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install outlier_utils
!pip install plotly

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.mixture import GaussianMixture

## <a id='toc1_2_'></a>[Import the datasets](#toc0_)

We load the version of the dataset in which the missing values have already been imputed. The outlier detection tests below cannot run on missing values, so without imputation we would have to drop every row that contains one and would lose a large part of the data.

In [None]:
# Load the races dataset whose missing values were already imputed
# (pandas infers the zip compression from the file extension)
df_races = pd.read_csv('../dataset/df_races_no_missing.csv.zip')

In [None]:
# Show up to 100 columns and 100 rows when a DataFrame is displayed
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# <a id='toc2_'></a>[PCA](#toc0_)

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pd.set_option('display.expand_frame_repr', False)  # Print wide DataFrames on a single line instead of wrapping them across several lines

def pca_outlier_contribution_and_plot_3d(df, columns, n_components=3, threshold=2.5):
    """
    Flag outliers by PCA reconstruction error and plot the first three components.

    The selected columns are standardized, projected onto the principal
    components and reconstructed back. A row is flagged when the sum of its
    squared reconstruction errors (in standardized units) exceeds ``threshold``.
    The PCA loadings are printed and an interactive 3D scatter of PC1-PC3 is
    shown, coloured by the outlier flag.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        columns (list): Names of the columns to run the PCA on.
        n_components: Passed to ``sklearn.decomposition.PCA`` (default: 3).
            At least three components must be retained because the plot
            uses PC1, PC2 and PC3.
        threshold (float): Rows whose summed squared reconstruction error is
            strictly greater than this value are flagged (default: 2.5).

    Returns:
        tuple: ``(df_copy, contribution, loadings)`` where

        * ``df_copy`` is a copy of the rows of ``df`` that have no missing
          value in ``columns``, with the extra columns
          ``reconstruction_error`` and ``pca_outlier``;
        * ``contribution`` holds the squared reconstruction error of every
          selected column for each of those rows;
        * ``loadings`` holds the weight of every selected column on each
          principal component.

    Raises:
        ValueError: If fewer than three principal components are retained.
    """
    # Work only on the requested columns and on rows without missing values
    df_numerical_cleaned = df[columns].dropna()

    # Standardize so that every feature contributes on the same scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_numerical_cleaned)

    # Project onto the principal components
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    # Use the number of components actually retained: it is what the loadings
    # and the plot depend on, and it is well defined for any n_components value
    n_retained = X_pca.shape[1]
    if n_retained < 3:
        raise ValueError(
            f"The 3D plot needs at least 3 principal components, got {n_retained}."
        )

    # Map the projection back to the standardized feature space
    X_reconstructed = pca.inverse_transform(X_pca)

    # Squared reconstruction error per feature, then summed per row
    column_wise_error = (X_scaled - X_reconstructed) ** 2
    reconstruction_error = np.sum(column_wise_error, axis=1)

    # Flag rows with a high overall reconstruction error
    outliers = reconstruction_error > threshold

    # Copy the surviving rows so the caller's DataFrame is left untouched
    df_copy = df.loc[df_numerical_cleaned.index].copy()
    df_copy['reconstruction_error'] = reconstruction_error
    df_copy['pca_outlier'] = outliers

    # Loadings: weight of each feature on each principal component
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=[f'PC{i+1}' for i in range(n_retained)],
        index=columns,
    )
    print("PCA Loadings (weights of each feature on the principal components):")
    print(loadings)

    # Interactive 3D view of the first three components; string labels make
    # Plotly use two discrete colours so the flagged points stand out
    fig = px.scatter_3d(
        x=X_pca[:, 0],  # Principal Component 1
        y=X_pca[:, 1],  # Principal Component 2
        z=X_pca[:, 2],  # Principal Component 3
        color=np.where(outliers, 'outlier', 'inlier'),
        title='3D PCA Manifold with Outliers',
        labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3', 'color': 'PCA flag'},
        opacity=0.7,
    )
    fig.update_traces(marker=dict(size=5))
    fig.show()

    # Per-row, per-column error: shows which features drive a row's total error
    contribution = pd.DataFrame(
        column_wise_error,
        columns=df_numerical_cleaned.columns,
        index=df_numerical_cleaned.index,
    )

    return df_copy, contribution, loadings

In [None]:
# Features fed to the PCA detector. 'position' and other irrelevant columns are
# excluded; 'average_temperature' was part of an earlier selection and is left out.
selected_columns = [
    'points',
    'uci_points',
    'length',
    'climb_total',
    'profile',
    'startlist_quality',
    'cyclist_age',
]
df_with_outliers, contribution_df, loadings_df = pca_outlier_contribution_and_plot_3d(
    df_races,
    selected_columns,
    n_components=3,
    threshold=15,
)

# Rows flagged by the PCA test, together with their overall reconstruction error
print(df_with_outliers.loc[df_with_outliers['pca_outlier']])

# Per-column contribution to the error, restricted to the flagged rows
flagged_labels = df_with_outliers.index[df_with_outliers['pca_outlier'].to_numpy()]
outliers_contributions = contribution_df.loc[flagged_labels]
print(outliers_contributions)

# Weight of each feature on each principal component
print(loadings_df)

# The raw error is not needed by the following tests, only the boolean flag is
df_with_outliers = df_with_outliers.drop(columns='reconstruction_error')


# <a id='toc3_'></a>[Distributional approach](#toc0_)

In [None]:
def gmm_test(df, n_components=5, outlier_quantile=0.05, random_state=42):
    """
    Flag low-density rows with a Gaussian Mixture Model.

    A mixture is fitted on the numeric columns of the rows that have no
    missing numeric value. Every such row is scored with its log-likelihood
    under the fitted mixture, and the rows falling below the
    ``outlier_quantile`` quantile of those scores are flagged. The sorted
    scores are also drawn as a line plot.

    The log-likelihood is used rather than the largest component posterior
    from ``predict_proba``: the posterior only tells how confidently a row is
    assigned to one component, so a point far from every component can still
    get a posterior close to 1 and would wrongly look "normal".

    Args:
        df (pd.DataFrame): Input data. It is not modified.
        n_components (int): Number of mixture components (default: 5).
        outlier_quantile (float): Fraction of the scored rows to flag, taken
            from the low end of the score distribution (default: 0.05).
        random_state (int): Seed passed to ``GaussianMixture`` (default: 42).

    Returns:
        pd.DataFrame: A copy of ``df`` sorted by ascending ``density_degree``
        with two extra columns: ``density_degree`` (log-likelihood, NaN for
        rows that were not scored) and ``gmm_outlier`` (boolean flag, False
        for rows that were not scored).
    """
    # Only numeric columns, and only rows where all of them are present
    numeric_data = df.select_dtypes(include="number").dropna()

    algorithm = GaussianMixture(n_components=n_components, random_state=random_state)
    algorithm.fit(numeric_data.values)

    # Log-likelihood of each row under the whole mixture: lower = more anomalous
    density_degree = pd.Series(
        algorithm.score_samples(numeric_data.values),
        index=numeric_data.index,
        name="density_degree",
    )

    # Rows strictly below the chosen quantile are considered outliers
    threshold = density_degree.quantile(outlier_quantile)

    # Work on a copy so the caller's DataFrame is not changed as a side effect
    result = df.copy()
    result["density_degree"] = density_degree  # aligned on the index

    outlier_series = pd.Series(False, index=result.index)
    outlier_series[density_degree.index] = density_degree < threshold
    result["gmm_outlier"] = outlier_series

    # Plot the sorted scores to help judge where the outliers start
    sorted_outlier_degrees = sorted(density_degree)
    sns.lineplot(
        x=range(len(sorted_outlier_degrees)),
        y=sorted_outlier_degrees
    )

    # Most anomalous rows first, for easier review
    return result.sort_values(by="density_degree", ascending=True)

In [None]:
# Run the Gaussian-mixture test on the frame produced by the previous step
df_with_outliers = gmm_test(df_with_outliers)

# Show only the rows this test flagged
outlier_rows = df_with_outliers.loc[df_with_outliers["gmm_outlier"]]
print(outlier_rows)

# Keep the boolean flag only; the raw score is not needed by the next tests
df_with_outliers = df_with_outliers.drop(columns='density_degree')

# <a id='toc4_'></a>[Connectivity approach](#toc0_)

In [None]:
def connectivity_approach(df):
    """
    Flag outliers with the Local Outlier Factor and plot the sorted LOF scores.

    The model (25 neighbours) is run on the numeric columns of the rows that
    have no missing numeric value. A boolean column
    ``connectivity_approach_outlier`` is added to ``df`` in place; rows that
    were not scored stay False.

    Args:
        df (pd.DataFrame): Input data; modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, with the new column.
    """
    import seaborn as sns
    from sklearn.neighbors import LocalOutlierFactor

    # Numeric columns only, restricted to rows where all of them are present
    complete_numeric = df.select_dtypes(include="number").dropna()

    lof = LocalOutlierFactor(n_neighbors=25)

    # fit_predict labels outliers with -1
    lof_labels = lof.fit_predict(complete_numeric)

    # Default every row to False, then mark the scored rows labelled -1
    flags = pd.Series(False, index=df.index)
    flags[complete_numeric.index] = (lof_labels == -1)
    df["connectivity_approach_outlier"] = flags

    # Negated factor: the higher the value, the more of an outlier
    lof_degrees_sorted = sorted(-lof.negative_outlier_factor_)
    sns.lineplot(x=range(len(lof_degrees_sorted)), y=lof_degrees_sorted)

    return df

In [None]:
# Run the Local Outlier Factor test on the current frame
df_with_outliers = connectivity_approach(df_with_outliers)

# Show only the rows this test flagged
outlier_rows = df_with_outliers.loc[df_with_outliers["connectivity_approach_outlier"]]
print(outlier_rows)

# <a id='toc5_'></a>[One-class SVM](#toc0_)

In [None]:
def one_class_svm(df, kernel="linear", nu=0.05):
    """
    Flag outliers with a One-Class SVM and plot the sorted sample scores.

    The model is fitted on the numeric columns of the rows that have no
    missing numeric value. Two columns are added to ``df`` in place:
    ``oneclass_svm_degree`` (the model's ``score_samples`` value, NaN for rows
    that were not scored) and ``oneclass_svm_outlier`` (True where the model
    predicts -1, False for rows that were not scored).

    Args:
        df (pd.DataFrame): Input data; modified in place.
        kernel (str): Kernel passed to ``OneClassSVM`` (default: "linear").
        nu (float): Upper bound on the fraction of training rows the model may
            treat as outliers (default: 0.05, the same 5% share used by the
            GMM test). The value previously hard-coded here, 0.9, allowed up
            to about 90% of the rows to be rejected, which made this detector
            vote "outlier" for almost every row.

    Returns:
        pd.DataFrame: The same ``df`` object, with the two new columns.
    """
    from sklearn.svm import OneClassSVM

    # Numeric columns only, restricted to rows where all of them are present
    numeric_data = df.select_dtypes(include="number").dropna()

    algorithm = OneClassSVM(kernel=kernel, nu=nu)
    algorithm.fit(numeric_data)

    # predict: +1 for inliers, -1 for outliers
    outlier_classification_scores = algorithm.predict(numeric_data)
    outlier_distance_scores = algorithm.score_samples(numeric_data)

    # Store the raw scores on the rows that were actually scored
    df.loc[numeric_data.index, "oneclass_svm_degree"] = outlier_distance_scores

    # Default every row to False, then mark the scored rows predicted as -1
    outlier_series = pd.Series(False, index=df.index)
    outlier_series[numeric_data.index] = (outlier_classification_scores == -1)
    df["oneclass_svm_outlier"] = outlier_series

    # Plot the sorted scores
    sorted_outlier_degrees = sorted(outlier_distance_scores)
    sns.lineplot(
        x=range(len(sorted_outlier_degrees)),
        y=sorted_outlier_degrees
    )

    return df

In [None]:
# Run the One-Class SVM test on the current frame
df_with_outliers = one_class_svm(df_with_outliers)

# Show only the rows this test flagged
outlier_rows = df_with_outliers.loc[df_with_outliers["oneclass_svm_outlier"]]
print(outlier_rows)

# Keep the boolean flag only; the raw score is not needed by the next tests
df_with_outliers = df_with_outliers.drop(columns='oneclass_svm_degree')

# <a id='toc6_'></a>[Isolation forest](#toc0_)

In [None]:
def isolation_forest(df, random_state=42):
    """
    Flag outliers with an Isolation Forest and plot the sorted scores.

    The forest is fitted on the numeric columns of the rows that have no
    missing numeric value. Three columns are added to ``df`` in place:
    ``isolation_forest_degree`` (``decision_function`` shifted by 0.5),
    ``isolation_forest_scores`` (1 for outliers, 0 for inliers) and
    ``isolation_forest_outlier`` (boolean flag, False for rows that were not
    scored).

    Args:
        df (pd.DataFrame): Input data; modified in place.
        random_state (int): Seed passed to ``IsolationForest`` so that the
            flagged rows are the same on every run (default: 42).

    Returns:
        pd.DataFrame: The same ``df`` object, with the three new columns.
    """
    from sklearn.ensemble import IsolationForest

    # Numeric columns only, restricted to rows where all of them are present
    numeric_data = df.select_dtypes(include="number").dropna()

    # Let every tree draw from all the available features
    max_features = numeric_data.shape[1]
    algorithm = IsolationForest(max_features=max_features, random_state=random_state)
    algorithm.fit(numeric_data)

    # predict yields +1 / -1; the expression below maps +1 -> 0 and -1 -> 1
    outlier_degrees = algorithm.decision_function(numeric_data) + 0.5
    outlier_scores = 1 - (algorithm.predict(numeric_data) + 1) / 2

    # Store degrees and scores on the rows that were actually scored
    df.loc[numeric_data.index, "isolation_forest_degree"] = outlier_degrees
    df.loc[numeric_data.index, "isolation_forest_scores"] = outlier_scores

    # Default every row to False, then mark the scored rows with score 1
    outlier_series = pd.Series(False, index=df.index)
    outlier_series[numeric_data.index] = (outlier_scores == 1)
    df["isolation_forest_outlier"] = outlier_series

    # Plot the sorted degrees
    sorted_outlier_degrees = sorted(outlier_degrees)
    sns.lineplot(
        x=range(len(sorted_outlier_degrees)),
        y=sorted_outlier_degrees
    )

    return df

In [None]:
# Run the Isolation Forest test on the current frame
df_with_outliers = isolation_forest(df_with_outliers)

# Show only the rows this test flagged. print() is required here: a bare
# expression is displayed by the notebook only when it is the last statement
# of the cell, which is not the case in this cell.
outlier_rows = df_with_outliers[df_with_outliers["isolation_forest_outlier"]]
print(outlier_rows)

# Drop the isolation_forest_degree and isolation_forest_scores columns since
# only the boolean flag is needed from now on
df_with_outliers.drop(columns=['isolation_forest_degree', 'isolation_forest_scores'], inplace=True)

## <a id='toc6_1_'></a>[Get the final set of outlier rows: those flagged by a majority of the tests](#toc0_)

In [None]:
# Boolean flag columns, one per outlier detection test
outlier_columns = [
    'isolation_forest_outlier',
    'oneclass_svm_outlier',
    'connectivity_approach_outlier',
    'gmm_outlier',
    'pca_outlier',
]

# Strict majority of the tests (3 out of 5 here); derived from the list so it
# stays correct if a test is added or removed
majority = len(outlier_columns) // 2 + 1

# Number of tests that flagged each row, computed once and reused below
outlier_votes = df_with_outliers[outlier_columns].sum(axis=1)
is_majority_outlier = outlier_votes >= majority

# Rows flagged by a majority of the tests, without the helper flag columns
df_outliers = df_with_outliers[is_majority_outlier].drop(columns=outlier_columns)

# All the remaining rows, without the helper flag columns
df_non_outliers = df_with_outliers[~is_majority_outlier].drop(columns=outlier_columns)

# Write the DataFrames to CSV files
df_outliers.to_csv('../dataset/df_races_only_outliers.csv', index=False)
df_non_outliers.to_csv('../dataset/df_races_no_outliers.csv', index=False)