In [None]:
def one_class_svm(df):
    """
    Detect outliers in a DataFrame with a linear One-Class SVM fitted by SGD.

    The numeric columns of ``df`` are standardized and passed to
    ``sklearn.linear_model.SGDOneClassSVM``. Rows with a missing value in any
    numeric column are left unscored.

    Args:
        df (pd.DataFrame): Data to analyze. Only numeric columns are used as
            features. The two output columns named below are never used as
            features, so calling this function again on an already annotated
            frame does not feed earlier results back into the model.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]:
            * ``df`` itself (modified in place, not a copy) with two columns
              added or overwritten: ``"oneclass_svm_degree"`` holds
              ``score_samples - offset`` for each scored row and NaN for
              unscored rows; ``"oneclass_svm_outlier"`` is True where the
              model predicts ``-1``. The flag column is plain ``bool`` when
              every row was scored, otherwise pandas' nullable ``boolean``
              with ``<NA>`` on unscored rows.
            * A contributions frame with one row per scored row and one
              column per feature (scaled value times model weight), plus
              ``"offset"`` (the negated model offset) and
              ``"total_contribution"`` (the sum across that row).

    Raises:
        ValueError: If ``df`` has no usable numeric data, or if every row has
            a missing value in at least one numeric column.
    """
    import numpy as np
    import pandas as pd

    output_columns = ["oneclass_svm_degree", "oneclass_svm_outlier"]

    # Leave out this function's own results: "oneclass_svm_degree" is numeric
    # and would otherwise be picked up as a feature on a second call.
    numeric_columns = df.select_dtypes(include="number").drop(
        columns=output_columns, errors="ignore"
    )
    if numeric_columns.empty:
        raise ValueError("No numeric columns found in the DataFrame.")

    # Positional mask of rows without missing values in the numeric columns.
    # A positional mask (rather than index labels) is used for writing the
    # results back so that duplicate index labels cannot misalign them.
    complete_rows = numeric_columns.notna().all(axis=1).to_numpy()
    numeric_data = numeric_columns.loc[complete_rows]
    if numeric_data.empty:
        raise ValueError(
            "No complete rows left after dropping missing values in numeric columns."
        )
    index = numeric_data.index
    feature_names = numeric_data.columns

    # Imported only once the input is known to be usable.
    from sklearn.linear_model import SGDOneClassSVM
    from sklearn.preprocessing import StandardScaler

    # Scale the data
    numeric_data_scaled = StandardScaler().fit_transform(numeric_data)

    # Fit the SGDOneClassSVM algorithm
    algorithm = SGDOneClassSVM(nu=0.05, random_state=42, max_iter=1000, tol=1e-3)
    algorithm.fit(numeric_data_scaled)

    # Reaching max_iter suggests the stopping criterion was never met
    if algorithm.n_iter_ == algorithm.max_iter:
        print("Warning: The model may not have converged. Consider increasing max_iter or adjusting tol.")

    # Run the prediction
    outlier_classification_scores = algorithm.predict(numeric_data_scaled)
    score_samples = algorithm.score_samples(numeric_data_scaled)
    offset = np.ravel(algorithm.offset_)[0]  # Extract scalar value
    outlier_distance_scores = score_samples - offset

    # Store the decision function values; unscored rows stay NaN and any
    # values left over from an earlier call are overwritten.
    degree = np.full(len(df), np.nan)
    degree[complete_rows] = outlier_distance_scores
    df["oneclass_svm_degree"] = degree

    # Identify outliers
    is_outlier = outlier_classification_scores == -1
    if complete_rows.all():
        df["oneclass_svm_outlier"] = is_outlier
    else:
        # A NaN-padded object column cannot be used as a boolean mask; the
        # nullable boolean dtype marks unscored rows as <NA> instead.
        flags = np.zeros(len(df), dtype=bool)
        flags[complete_rows] = is_outlier
        df["oneclass_svm_outlier"] = pd.arrays.BooleanArray(flags, ~complete_rows)

    # Compute feature contributions.
    # np.ravel yields the full weight vector whether coef_ has shape
    # (n_features,) or (1, n_features); indexing [0] on a 1-D array would keep
    # only the first weight.
    weights = np.ravel(algorithm.coef_)

    # Per-sample, per-feature contribution: scaled value times weight
    contributions = numeric_data_scaled * weights
    contributions_df = pd.DataFrame(contributions, columns=feature_names, index=index)

    # Compute the raw scores (before applying offset)
    raw_scores = contributions_df.sum(axis=1)

    # Verify that raw_scores minus offset equals the decision function
    differences = outlier_distance_scores - (raw_scores - offset)
    max_difference = np.abs(differences).max()
    if max_difference > 1e-6:
        print(f"Warning: Max difference between decision function and contributions: {max_difference}")

    # Include the offset so each row of contributions sums to its decision value
    contributions_df["offset"] = -offset
    contributions_df["total_contribution"] = contributions_df.sum(axis=1)

    return df, contributions_df


In [None]:
# Score the scaled data. one_class_svm returns the frame it was given (with
# the two result columns added), so df_races_with_svm_outlier_column and
# df_scaled refer to the same object from here on.
df_races_with_svm_outlier_column, contributions = one_class_svm(df_scaled)

# .eq(True) keeps only rows explicitly flagged True; rows whose flag is
# missing (unscored rows) are treated as non-outliers instead of raising.
svm_outlier_mask = df_races_with_svm_outlier_column["oneclass_svm_outlier"].eq(True)
outlier_rows = df_races_with_svm_outlier_column[svm_outlier_mask]
print(outlier_rows)
print(contributions)

# Drop the oneclass_svm_degree column since we don't want it from now on
df_races_with_svm_outlier_column.drop(columns="oneclass_svm_degree", inplace=True)

# Bring 'oneclass_svm_outlier' column back into the original df.
# The assignment aligns on index labels, so any row of df without a matching
# label in the scored frame ends up with a missing flag.
# NOTE(review): assumes df and df_scaled share the same index - confirm.
df["oneclass_svm_outlier"] = df_races_with_svm_outlier_column["oneclass_svm_outlier"]

# Same missing-safe mask as above: using the column directly as a mask raises
# when it contains missing values.
print(df[df["oneclass_svm_outlier"].eq(True)])