In [1]:
!pip install shap



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.datasets import make_blobs

import shap
import csv
import matplotlib.pyplot as plt
import random

In [2]:
def str_to_list(string):
    """Split the text form of a flat list into a list of string items.

    Square brackets are stripped from both ends of ``string``, every single
    quote is removed, and the remainder is split on the exact separator
    ``', '`` (comma followed by one space). Items are returned as strings;
    no numeric conversion happens here.
    """
    separator = ', '
    inner = string.strip("[]")
    unquoted = inner.replace("'", "")
    return unquoted.split(separator)


def extract_data(filename):
    """Load a packed CSV file and expand it into a numeric DataFrame.

    The file is read with ``pd.read_csv`` and must provide the columns
    ``id``, ``keys`` and ``values``. ``keys`` and ``values`` hold list-like
    strings that are parsed with ``str_to_list``; the column names of the
    result are taken from the ``keys`` entry of the first row only.

    Parameters
    ----------
    filename : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        One row per input row, indexed by the ``id`` column, with float
        columns. The token ``'null'`` becomes NaN.

    Raises
    ------
    IndexError
        If the file has no data rows (there is no first ``keys`` entry).
    ValueError
        If a token is neither ``'null'`` nor parseable by ``float``.
    """
    data = pd.read_csv(filename)

    # Every row is assumed to share the key order of the first row.
    col_names = str_to_list(data["keys"].values[0])

    def to_float(token):
        # 'null' marks a missing measurement; anything else must be numeric.
        return np.nan if token == 'null' else float(token)

    # Convert tokens up front instead of using the deprecated
    # DataFrame.applymap; this also guarantees float dtype even for a
    # column that consists only of missing values.
    rows = [
        [to_float(token) for token in str_to_list(raw)]
        for raw in data['values']
    ]

    return pd.DataFrame(data=rows, columns=col_names, index=data['id'], dtype=float)

In [3]:
def create_answer(test_data, predictions, shap_values, threshold):
  """Build a 0/1 matrix flagging the anomalous cells of ``test_data``.

  A cell is flagged (1) when it is missing in ``test_data``, or when its row
  was predicted as an anomaly (prediction ``-1``) and the absolute SHAP value
  of that cell is at least ``threshold``. All other cells are 0.

  Parameters
  ----------
  test_data : pd.DataFrame
      Data the predictions were made for.
  predictions : array-like
      One label per row of ``test_data``; ``-1`` marks an anomalous row.
  shap_values : object with a 2-D ``.values`` attribute, or a 2-D array-like
      One row per anomalous row of ``test_data`` (in the same order) and one
      column per column of ``test_data``.
  threshold : float
      Minimum absolute SHAP value for a cell to be flagged.

  Returns
  -------
  pd.DataFrame
      Integer matrix with the index and columns of ``test_data``.

  Raises
  ------
  ValueError
      If the shape of ``shap_values`` does not equal
      (number of anomalous rows, number of columns).
  """
  # np.asarray makes the comparison element-wise even for a plain list.
  is_anomaly = np.asarray(predictions) == -1
  n_anomalies = int(is_anomaly.sum())

  # Accept both an explanation object exposing `.values` and a raw array.
  contributions = np.asarray(getattr(shap_values, 'values', shap_values), dtype=float)

  # Missing cells are always flagged; copy so the buffer is writable.
  flags = test_data.isnull().to_numpy(copy=True)

  if n_anomalies or contributions.size:
    expected_shape = (n_anomalies, test_data.shape[1])
    if contributions.shape != expected_shape:
      raise ValueError(
        'shap_values has shape %s, expected %s '
        '(anomalous rows x columns)' % (contributions.shape, expected_shape)
      )
    # Row k of `contributions` belongs to the k-th anomalous row; assigning
    # by position also stays correct when the index has duplicate labels.
    flags[is_anomaly] |= np.abs(contributions) >= threshold

  return pd.DataFrame(flags.astype(np.int64), index=test_data.index, columns=test_data.columns)


def create_submission(anomaly_matrix, name='submission.csv'):
  """Write the anomaly matrix to a two-column CSV submission file.

  The file starts with the header ``id,target``. Each following line holds
  one index label of ``anomaly_matrix`` and the corresponding row of the
  matrix rendered as a Python list.

  Parameters
  ----------
  anomaly_matrix : pd.DataFrame
      Matrix whose index supplies the ``id`` values.
  name : str
      Path of the file to write; an existing file is overwritten.
  """
  flag_rows = anomaly_matrix.values.tolist()

  # newline='' is required by the csv module so that it controls the line
  # endings itself; without it Windows output gets an extra blank line
  # after every row.
  with open(name, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(('id', 'target'))
    writer.writerows(zip(anomaly_matrix.index, flag_rows))

In [4]:
# Load the training set and preview its first ten rows.
data = extract_data('data/train999.csv')
data.head(n=10)

Unnamed: 0_level_0,meteo_layer_type,meteo_cloudiness,meteo_wind_velocity,meteo_humidity,meteo_t_underroad,meteo_freezing_point,meteo_wind_direction,meteo_dew_point,meteo_t_road,meteo_wind_gusts,meteo_t_air,meteo_air_pressure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1684,1.0,3.0,2.3,46.7,41.4,0.0,132.0,16.9,35.1,4.2,29.5,733.0
862,1.0,2.0,2.5,29.4,32.1,0.0,351.0,3.3,30.3,4.4,22.0,737.0
1992,1.0,2.0,0.5,70.8,21.8,0.0,121.0,-4.7,18.6,1.1,0.0,754.0
889,1.0,3.0,2.5,61.6,25.7,0.0,63.0,11.2,20.7,4.2,18.7,736.0
1362,1.0,4.0,2.8,61.5,27.5,0.0,185.0,14.9,22.2,6.0,22.7,733.0
192,1.0,8.0,1.0,97.7,13.2,0.0,218.0,11.3,12.2,2.6,11.7,739.0
801,1.0,2.0,4.5,81.3,21.5,0.0,325.0,11.6,18.1,7.2,14.8,735.0
174,1.0,7.0,3.5,83.2,19.3,0.0,172.0,14.1,14.9,6.7,16.9,739.0
2023,1.0,2.0,5.2,28.9,24.4,0.0,131.0,-16.0,27.9,9.5,0.0,758.0
2646,1.0,2.0,5.3,83.5,21.0,0.0,30.0,-2.5,20.8,7.2,0.0,743.0


In [5]:
# Columns handled as categorical; every remaining column is handled as numeric.
cat_cols = ['meteo_layer_type']
num_cols = list(filter(lambda name: name not in cat_cols, data.columns))

In [6]:
# Numeric columns: fill missing values with the column median.
numeric_transformer = make_pipeline(SimpleImputer(strategy='median'))

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode; categories unseen during fit are ignored.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each group of columns to its transformer.
preprocessor = make_column_transformer(
    (numeric_transformer, num_cols),
    (categorical_transformer, cat_cols),
)

# Full model: preprocessing followed by a seeded isolation forest.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('mod', IsolationForest(n_estimators=200, random_state=42)),
])

In [7]:
# The test set is read from its own file; the training frame loaded
# earlier is used as-is (X_train refers to the same object as `data`).
X_test = extract_data('data/test999.csv')
X_train = data

In [8]:
# Fit the pipeline on the training data, then label the test rows
# (the downstream cells treat -1 as "anomalous").
predictions = pipe.fit(X_train).predict(X_test)

In [9]:
# Explain the pipeline's predict function, with the training data as background.
explainer = shap.Explainer(pipe.predict, X_train)

In [10]:
# Only the rows predicted as anomalies are explained; per the recorded
# output below this took about 11 minutes for 129 iterations.
shap_values = explainer(X_test.loc[predictions == -1])

PermutationExplainer explainer: 129it [11:05,  5.24s/it]


In [40]:
# Cut-off = 95th percentile of the absolute SHAP values over all explained cells.
threshold = np.percentile(np.abs(shap_values.values).ravel(), 95)
threshold

1.0017499999999997

In [38]:
# Flag missing cells and cells whose |SHAP value| reaches the threshold.
anomaly_matrix = create_answer(
    test_data=X_test,
    predictions=predictions,
    shap_values=shap_values,
    threshold=threshold,
)

In [39]:
# Write the flags to the default submission file.
create_submission(anomaly_matrix, name='submission.csv')