**Table of contents**<a id='toc0_'></a>    
- [Prepare the notebook](#toc1_)    
  - [Import necessary libraries](#toc1_1_)    
  - [Import the datasets](#toc1_2_)    
- [PCA](#toc2_)    
- [Distributional approach](#toc3_)    
- [Connectivity approach](#toc4_)    
- [One-class SVM](#toc5_)    
- [Isolation forest](#toc6_)    
- [Get the final list of 'outlier' columns](#toc7_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Prepare the notebook](#toc0_)

## <a id='toc1_1_'></a>[Import necessary libraries](#toc0_)

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install outlier_utils
!pip install plotly

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.mixture import GaussianMixture

## <a id='toc1_2_'></a>[Import the datasets](#toc0_)

We load the datasets in which the missing values have already been imputed. The outlier detection tests cannot run on data containing missing values, so working on the raw data would force us to drop every row that has one; using the imputed datasets avoids discarding a large number of rows.

In [None]:
# Read the already-imputed races and cyclists tables (in that order)
df_races, df_cyclists = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/df_races_no_missing.csv',
        '../dataset/df_cyclists_no_missing.csv',
    )
)

In [None]:
# Show up to 100 columns and 100 rows when a DataFrame is displayed
pd.set_option(
    'display.max_columns', 100,
    'display.max_rows', 100,
)

# <a id='toc2_'></a>[PCA](#toc0_)

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Keep printed DataFrames on a single (wide) block instead of wrapping them, which is hard to read
pd.options.display.expand_frame_repr = False

def pca_outlier_contribution_and_plot_3d(df, columns, n_components=3, threshold=2.5):
    """
    Flag outlier rows by their PCA reconstruction error and plot the projected data.

    The selected columns are standardized, projected onto the principal components and
    mapped back to the (standardized) original space. The squared difference between
    the standardized values and their reconstruction, summed over the columns, is the
    row's reconstruction error; rows whose error exceeds ``threshold`` are flagged.

    The PCA loadings are printed and the projected points are shown in a 3D scatter
    plot (2D when only two components are kept; no plot for a single component),
    colored by their inlier/outlier status.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. It is not modified.
        columns (list): A list of column names to apply PCA outlier detection to.
        n_components (int): Number of principal components to use (default: 3).
            It should be smaller than ``len(columns)``: when every component is kept
            the reconstruction is exact, so no row can exceed the threshold.
        threshold (float): The reconstruction error threshold for identifying outliers (default: 2.5).

    Returns:
        tuple: ``(df_copy, contribution, loadings)`` where
            - ``df_copy`` (pd.DataFrame) is a copy of the rows of ``df`` without missing values in
              ``columns``, with the added columns ``reconstruction_error`` and ``pca_outlier``;
            - ``contribution`` (pd.DataFrame) holds the squared reconstruction error of every
              selected column for every row (same index as ``df_copy``);
            - ``loadings`` (pd.DataFrame) holds the weight of each column on each principal component.
    """
    import warnings

    columns = list(columns)

    # Ensure only the specified columns are selected
    df_numerical = df[columns]

    # Drop rows with missing values (NaN) in the selected columns. There shouldn't be any since this set
    # of tests runs after the imputations, but we'll keep this in case we want to run it before imputing missing values.
    df_numerical_cleaned = df_numerical.dropna()

    # Standardize so that every column weighs the same regardless of its unit
    X_scaled = StandardScaler().fit_transform(df_numerical_cleaned)

    # Apply the PCA to reduce the dimensionality of the numerical data to the desired number of components
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    n_kept = X_pca.shape[1]  # number of components actually kept by the fitted PCA

    if n_kept >= X_scaled.shape[1]:
        # Keeping every component makes inverse_transform an exact inverse: all errors are ~0
        warnings.warn(
            f"n_components ({n_kept}) is not smaller than the number of columns ({X_scaled.shape[1]}): "
            "the reconstruction error is (numerically) zero for every row, so no outlier can be flagged.",
            stacklevel=2,
        )

    X_reconstructed = pca.inverse_transform(X_pca)

    # Calculate the reconstruction error, and flag each row with an error higher than the threshold
    column_wise_error = (X_scaled - X_reconstructed) ** 2
    reconstruction_error = np.sum(column_wise_error, axis=1)
    outliers = reconstruction_error > threshold

    df_copy = df.loc[df_numerical_cleaned.index].copy()
    df_copy['reconstruction_error'] = reconstruction_error
    df_copy['pca_outlier'] = outliers

    # Compute and show the PCA loadings
    # This tells us which columns "contribute" the most to each dimension in our PCA space
    loadings = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(n_kept)], index=columns)
    print("PCA Loadings (weights of each feature on the principal components):")
    print(loadings)

    # Next visualize our components, coloring the points by whether they were flagged
    status = np.where(outliers, 'outlier', 'inlier')
    if n_kept >= 3:
        fig = px.scatter_3d(
            x=X_pca[:, 0],
            y=X_pca[:, 1],
            z=X_pca[:, 2],
            color=status,
            title='3D PCA Manifold with Outliers',
            labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3', 'color': 'status'},
            opacity=0.7,
        )
    elif n_kept == 2:
        fig = px.scatter(
            x=X_pca[:, 0],
            y=X_pca[:, 1],
            color=status,
            title='2D PCA Manifold with Outliers',
            labels={'x': 'PC1', 'y': 'PC2', 'color': 'status'},
            opacity=0.7,
        )
    else:
        # A single component cannot be shown as a scatter plot of components
        fig = None

    if fig is not None:
        fig.update_traces(marker=dict(size=5))
        fig.show()

    # If a row is an outlier, find which columns contributed most to the error
    # (useful to understand why a record is identified as an outlier)
    contribution = pd.DataFrame(column_wise_error, columns=df_numerical_cleaned.columns, index=df_numerical_cleaned.index)

    return df_copy, contribution, loadings

# Races DF

In [None]:
# Numeric race features fed to the PCA-based test
selected_columns = ['points', 'uci_points', 'length', 'climb_total', 'profile', 'startlist_quality', 'cyclist_age']
df_races_with_outliers, contribution_df, loadings_df = pca_outlier_contribution_and_plot_3d(
    df_races,
    selected_columns,
    n_components=3,
    threshold=15,
)

# Rows whose reconstruction error exceeded the threshold
pca_flagged = df_races_with_outliers[df_races_with_outliers['pca_outlier']]
print(pca_flagged)

# Per-column error of the flagged rows: shows which columns make them stand out
outliers_contributions = contribution_df.loc[pca_flagged.index]
print(outliers_contributions)

print(loadings_df)

# reconstruction_error is numeric, so it has to go before the next tests
# (they run on every numeric column of the frame)
df_races_with_outliers.drop(columns='reconstruction_error', inplace=True)


# Cyclist DF

In [None]:
# Numeric cyclist features fed to the PCA-based test
# NOTE(review): three columns with n_components=3 keeps every principal component, so the
# reconstruction is exact and the error is ~0 for all rows: 'pca_outlier' will be False
# everywhere with threshold=15. Consider fewer components if this test should be able to vote.
selected_columns = ['birth_year', 'weight', 'height'] 
df_cyclists_with_outliers, contribution_df, loadings_df = pca_outlier_contribution_and_plot_3d(
    df_cyclists,
    selected_columns,
    n_components=3,
    threshold=15,
)

# Rows whose reconstruction error exceeded the threshold
pca_flagged = df_cyclists_with_outliers[df_cyclists_with_outliers['pca_outlier']]
print(pca_flagged)

# Per-column error of the flagged rows: shows which columns make them stand out
outliers_contributions = contribution_df.loc[pca_flagged.index]
print(outliers_contributions)

print(loadings_df)

# reconstruction_error is numeric, so it has to go before the next tests
# (they run on every numeric column of the frame)
df_cyclists_with_outliers.drop(columns='reconstruction_error', inplace=True)


# <a id='toc3_'></a>[Distributional approach](#toc0_)

In [None]:
def gmm_test(df, n_components=5, quantile=0.05):
    """
    Apply a Gaussian Mixture Model (GMM) to detect outliers in a DataFrame.

    A GMM is fitted on the numeric columns and every row is scored with its log-likelihood
    under the fitted mixture (its "density degree"). Rows whose score falls below the given
    quantile of all scores are marked as outliers. The sorted scores are also plotted.

    The score is the mixture log-likelihood (``score_samples``) rather than the largest
    component membership probability (``predict_proba``): membership probabilities always
    sum to 1, so a point far away from every component can still get a probability close
    to 1 for its nearest component and would look perfectly "normal".

    Args:
        df (pd.DataFrame): A DataFrame containing the data to be analyzed. Only numeric columns are
            considered for outlier detection. The two result columns are also added to ``df`` itself.
        n_components (int): Number of Gaussian components of the mixture (default: 5).
        quantile (float): Fraction of the lowest-density rows to flag as outliers (default: 0.05).

    Returns:
        pd.DataFrame: The original DataFrame with additional columns, density_degree and gmm_outlier,
        sorted by density_degree in ascending order (most outlying rows first).

    Raises:
        ValueError: If there are no numeric columns in the DataFrame.
    """
    # Select only numeric columns from df
    numeric_columns = df.select_dtypes(include="number")
    if numeric_columns.shape[1] == 0:
        raise ValueError("gmm_test requires at least one numeric column in the DataFrame")

    # Drop rows with missing values (NaN) in the selected columns. There shouldn't be any since this set
    # of tests runs after the imputations, but we'll keep this in case we want to run it before imputing missing values.
    numeric_data = numeric_columns.dropna()

    algorithm = GaussianMixture(n_components=n_components, random_state=42)
    algorithm.fit(numeric_data.values)

    # Log-likelihood of each row under the mixture: higher values indicate more normal, lower values more likely outlier
    density_degrees = algorithm.score_samples(numeric_data.values)
    density = pd.Series(density_degrees, index=numeric_data.index, name="density_degree")

    # Rows below the requested quantile of the scores are considered outliers
    threshold = density.quantile(quantile)
    outlier_series = pd.Series(False, index=df.index)
    outlier_series.loc[numeric_data.index] = (density < threshold).to_numpy()

    # Merge the outlier degree and boolean outlier column back into the original DataFrame
    # (rows dropped above because of missing values get NaN / False)
    df["density_degree"] = density
    df["gmm_outlier"] = outlier_series

    # Sort the DataFrame by "density_degree" in ascending order. This is useful for manual reviewing
    df_sorted = df.sort_values(by="density_degree", ascending=True)

    # Plot the result
    sorted_density_degrees = sorted(density_degrees)
    sns.lineplot(
        x=range(len(sorted_density_degrees)),
        y=sorted_density_degrees
    )

    return df_sorted

# Races

In [None]:
df_races_with_outliers = gmm_test(df_races_with_outliers)

# Show the rows flagged by the GMM test
outlier_rows = df_races_with_outliers.loc[df_races_with_outliers["gmm_outlier"]]
print(outlier_rows)

# density_degree is numeric: remove it so the next tests (which use every numeric column) don't pick it up
df_races_with_outliers.drop(columns='density_degree', inplace=True)

# Cyclists

In [None]:
df_cyclists_with_outliers = gmm_test(df_cyclists_with_outliers)

# Show the rows flagged by the GMM test
outlier_rows = df_cyclists_with_outliers.loc[df_cyclists_with_outliers["gmm_outlier"]]
print(outlier_rows)

# density_degree is numeric: remove it so the next tests (which use every numeric column) don't pick it up
df_cyclists_with_outliers.drop(columns='density_degree', inplace=True)

# <a id='toc4_'></a>[Connectivity approach](#toc0_)

In [None]:
def connectivity_approach(df, n_neighbors=25):
    """
    Detect outliers in a DataFrame using the Local Outlier Factor (LOF) connectivity-based approach.

    This function applies the Local Outlier Factor (LOF) algorithm to identify outliers in the numeric columns
    of a DataFrame, and plots the sorted outlier factors.

    Args:
        df (pd.DataFrame): A DataFrame containing the data to be analyzed. Only numeric columns are
            considered for outlier detection. The result column is added to ``df`` itself.
        n_neighbors (int): Number of neighbors used by LOF to estimate the local density (default: 25).

    Returns:
        pd.DataFrame: The original DataFrame with an additional column, "connectivity_approach_outlier".

    Raises:
        ValueError: If there are no numeric columns in the DataFrame.
    """
    # Select only numeric columns from df
    numeric_columns = df.select_dtypes(include="number")
    if numeric_columns.shape[1] == 0:
        raise ValueError("connectivity_approach requires at least one numeric column in the DataFrame")

    # Only this function needs the estimator, so it is imported here
    from sklearn.neighbors import LocalOutlierFactor

    # Drop rows with missing values (NaN) in the selected columns. There shouldn't be any since this set
    # of tests runs after the imputations, but we'll keep this in case we want to run it before imputing missing values.
    numeric_data = numeric_columns.dropna()

    algorithm = LocalOutlierFactor(n_neighbors=n_neighbors)

    # fit_predict labels inliers with +1 and outliers with -1
    outlier_predictions = algorithm.fit_predict(numeric_data)

    # Rows dropped above because of missing values stay False
    outlier_series = pd.Series(False, index=df.index)
    outlier_series.loc[numeric_data.index] = (outlier_predictions == -1)

    df["connectivity_approach_outlier"] = outlier_series

    # negative_outlier_factor_ is the opposite of the LOF, so flip the sign: higher values are more outlying
    outlier_degrees = -algorithm.negative_outlier_factor_
    sorted_outlier_degrees = sorted(outlier_degrees)

    # Plot the result
    sns.lineplot(
        x=range(len(sorted_outlier_degrees)),
        y=sorted_outlier_degrees
    )

    return df

# Races

In [None]:
df_races_with_outliers = connectivity_approach(df_races_with_outliers)

# Show the rows flagged by the LOF test
outlier_rows = df_races_with_outliers.loc[df_races_with_outliers["connectivity_approach_outlier"]]
print(outlier_rows)

# Cyclists

In [None]:
df_cyclists_with_outliers = connectivity_approach(df_cyclists_with_outliers)

# Show the rows flagged by the LOF test
outlier_rows = df_cyclists_with_outliers.loc[df_cyclists_with_outliers["connectivity_approach_outlier"]]
print(outlier_rows)

# <a id='toc5_'></a>[One-class SVM](#toc0_)

In [None]:
def one_class_svm(df, nu=0.05, kernel="linear"):
    """
    Detect outliers in a DataFrame using the One-Class SVM approach.

    This function applies the One-Class Support Vector Machine (SVM) algorithm to identify outliers in the
    numeric columns of a DataFrame, and plots the sorted scores.

    Args:
        df (pd.DataFrame): A DataFrame containing the data to be analyzed. Only numeric columns are
            considered for outlier detection. The result columns are added to ``df`` itself.
        nu (float): Upper bound on the fraction of rows that may be left outside the learned
            boundary, i.e. roughly the share of rows that can be flagged as outliers (default: 0.05).
        kernel (str): Kernel used by the SVM (default: "linear").

    Returns:
        pd.DataFrame: The original DataFrame with additional columns "oneclass_svm_degree" & "oneclass_svm_outlier".

    Raises:
        ValueError: If there are no numeric columns in the DataFrame.
    """
    # Select only numeric columns from df
    numeric_columns = df.select_dtypes(include="number")
    if numeric_columns.shape[1] == 0:
        raise ValueError("one_class_svm requires at least one numeric column in the DataFrame")

    # Only this function needs the estimator, so it is imported here
    from sklearn.svm import OneClassSVM

    # Drop rows with missing values (NaN) in the selected columns. There shouldn't be any since this set
    # of tests runs after the imputations, but we'll keep this in case we want to run it before imputing missing values.
    numeric_data = numeric_columns.dropna()

    # Keep nu small: a value such as 0.9 would let up to ~90% of the rows be classified as outliers,
    # which makes this test vote "outlier" for almost every row.
    algorithm = OneClassSVM(kernel=kernel, nu=nu)
    algorithm.fit(numeric_data)

    # Run the prediction: inliers will be +1, outliers will be -1
    outlier_classification_scores = algorithm.predict(numeric_data)
    outlier_distance_scores = algorithm.score_samples(numeric_data)

    df.loc[numeric_data.index, "oneclass_svm_degree"] = outlier_distance_scores

    # Convert the outlier classification scores to a boolean column: True for outliers (-1), False for inliers (1)
    # Rows dropped above because of missing values stay False
    outlier_series = pd.Series(False, index=df.index)
    outlier_series.loc[numeric_data.index] = (outlier_classification_scores == -1)
    df["oneclass_svm_outlier"] = outlier_series

    # Plot result
    sorted_outlier_degrees = sorted(outlier_distance_scores)
    sns.lineplot(
        x=range(len(sorted_outlier_degrees)),
        y=sorted_outlier_degrees
    )

    return df

# Races

In [None]:
df_races_with_outliers = one_class_svm(df_races_with_outliers)

# Show the rows flagged by the One-Class SVM test
outlier_rows = df_races_with_outliers.loc[df_races_with_outliers["oneclass_svm_outlier"]]
print(outlier_rows)

# oneclass_svm_degree is numeric: remove it so the next test (which uses every numeric column) doesn't pick it up
df_races_with_outliers.drop(columns='oneclass_svm_degree', inplace=True)

# Cyclists

In [None]:
df_cyclists_with_outliers = one_class_svm(df_cyclists_with_outliers)

# Show the rows flagged by the One-Class SVM test
outlier_rows = df_cyclists_with_outliers.loc[df_cyclists_with_outliers["oneclass_svm_outlier"]]
print(outlier_rows)

# oneclass_svm_degree is numeric: remove it so the next test (which uses every numeric column) doesn't pick it up
df_cyclists_with_outliers.drop(columns='oneclass_svm_degree', inplace=True)

# <a id='toc6_'></a>[Isolation forest](#toc0_)

In [None]:
def isolation_forest(df, random_state=42):
    """
    Detect outliers in a DataFrame using the Isolation Forest method.

    This function applies the Isolation Forest algorithm to identify outliers in the numeric columns
    of a DataFrame, and plots the sorted scores.

    Args:
        df (pd.DataFrame): A DataFrame containing the data to be analyzed. Only numeric columns are
            considered for outlier detection. The result columns are added to ``df`` itself.
        random_state (int | None): Seed of the forest, so that repeated runs flag the same rows
            (default: 42). Pass None for a different random forest on every run.

    Returns:
        pd.DataFrame: The original DataFrame with additional columns: "isolation_forest_degree", "isolation_forest_scores" & "isolation_forest_outlier".

    Raises:
        ValueError: If there are no numeric columns in the DataFrame.
    """
    # Select only numeric columns from df
    numeric_columns = df.select_dtypes(include="number")
    if numeric_columns.shape[1] == 0:
        raise ValueError("isolation_forest requires at least one numeric column in the DataFrame")

    # Only this function needs the estimator, so it is imported here
    from sklearn.ensemble import IsolationForest

    # Drop rows with missing values (NaN) in the selected columns. There shouldn't be any since this set
    # of tests runs after the imputations, but we'll keep this in case we want to run it before imputing missing values.
    numeric_data = numeric_columns.dropna()

    # Use the number of numeric columns as our max_features
    max_features = numeric_data.shape[1]
    algorithm = IsolationForest(max_features=max_features, random_state=random_state)
    algorithm.fit(numeric_data)

    # Lower values are more abnormal; the constant 0.5 shift does not change the ordering of the rows
    outlier_degrees = algorithm.decision_function(numeric_data) + 0.5
    # predict returns -1 for outliers and +1 for inliers: map to 1.0 (outlier) / 0.0 (inlier)
    outlier_scores = (algorithm.predict(numeric_data) == -1).astype(float)

    df.loc[numeric_data.index, "isolation_forest_degree"] = outlier_degrees
    df.loc[numeric_data.index, "isolation_forest_scores"] = outlier_scores

    # Rows dropped above because of missing values stay False
    outlier_series = pd.Series(False, index=df.index)
    outlier_series.loc[numeric_data.index] = (outlier_scores == 1)
    df["isolation_forest_outlier"] = outlier_series

    # Plot result
    sorted_outlier_degrees = sorted(outlier_degrees)
    sns.lineplot(
        x=range(len(sorted_outlier_degrees)),
        y=sorted_outlier_degrees
    )

    return df

# Races

In [None]:
df_races_with_outliers = isolation_forest(df_races_with_outliers)

# Show the rows flagged by the Isolation Forest test.
# A bare expression is only rendered when it is the last statement of a cell, so print it explicitly.
outlier_rows = df_races_with_outliers[df_races_with_outliers["isolation_forest_outlier"]]
print(outlier_rows)

# Drop the isolation_forest_degree and isolation_forest_scores columns since we don't want them from now on
df_races_with_outliers.drop(columns=['isolation_forest_degree', 'isolation_forest_scores'], inplace=True)

# Cyclists

In [None]:
df_cyclists_with_outliers = isolation_forest(df_cyclists_with_outliers)

# Show the rows flagged by the Isolation Forest test.
# A bare expression is only rendered when it is the last statement of a cell, so print it explicitly.
outlier_rows = df_cyclists_with_outliers[df_cyclists_with_outliers["isolation_forest_outlier"]]
print(outlier_rows)

# Drop the isolation_forest_degree and isolation_forest_scores columns since we don't want them from now on
df_cyclists_with_outliers.drop(columns=['isolation_forest_degree', 'isolation_forest_scores'], inplace=True)

# <a id='toc7_'></a>[Get the final list of 'outlier' columns](#toc0_)

Using the boolean columns added by the outlier tests, we'll drop all the rows we consider "outliers" (they are saved to a separate file). A row is considered an outlier when a majority of the tests (at least 3 out of 5) flag it as one.

In [None]:
# Boolean "vote" columns, one per outlier test run above
outlier_columns = [
    "isolation_forest_outlier",
    "oneclass_svm_outlier",
    "connectivity_approach_outlier",
    "gmm_outlier",
    "pca_outlier",
]

# Races

In [None]:
# Number of tests (out of the 5 vote columns) that flagged each row
outlier_votes = df_races_with_outliers[outlier_columns].sum(axis=1)
is_majority_outlier = outlier_votes >= 3

# Rows flagged by at least 3 tests are the outliers; every other row is kept as a non-outlier.
# The vote columns are removed from both results.
df_outliers = df_races_with_outliers[is_majority_outlier].drop(columns=outlier_columns)
df_non_outliers = df_races_with_outliers[~is_majority_outlier].drop(columns=outlier_columns)

# Persist both subsets as CSV files (without the index)
df_outliers.to_csv('../dataset/df_races_only_outliers.csv', index=False)
df_non_outliers.to_csv('../dataset/df_races_no_outliers.csv', index=False)

# Cyclists

In [None]:
# Number of tests (out of the 5 vote columns) that flagged each row
outlier_votes = df_cyclists_with_outliers[outlier_columns].sum(axis=1)
is_majority_outlier = outlier_votes >= 3

# Rows flagged by at least 3 tests are the outliers; every other row is kept as a non-outlier.
# The vote columns are removed from both results.
df_outliers = df_cyclists_with_outliers[is_majority_outlier].drop(columns=outlier_columns)
df_non_outliers = df_cyclists_with_outliers[~is_majority_outlier].drop(columns=outlier_columns)

# Persist both subsets as CSV files (without the index)
df_outliers.to_csv('../dataset/df_cyclists_only_outliers.csv', index=False)
df_non_outliers.to_csv('../dataset/df_cyclists_no_outliers.csv', index=False)