# L3: Cohort analysis and other error analysis methods

## Setup

In [1]:
import numpy as np

np.random.seed(41)


import shap

X, y = shap.datasets.adult()

In [2]:
X.sample(5)

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
7476,41.0,4,8.0,2,6,4,4,1,0.0,0.0,40.0,39
29637,27.0,4,13.0,0,4,0,4,1,0.0,0.0,52.0,39
1646,33.0,4,10.0,2,7,4,2,1,0.0,0.0,40.0,39
32449,44.0,5,14.0,2,12,4,4,1,0.0,0.0,50.0,0
3820,24.0,6,13.0,4,5,1,4,1,0.0,0.0,50.0,39


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=7)

In [4]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [5]:
knn.score(X_valid, y_valid)

0.8464609243052357

## Groupings into cohorts

In [None]:
# Based on sex

In [6]:
X_valid_reindex = X_valid.reset_index().drop("index", axis=1)

In [7]:
X_valid_males = X_valid_reindex[X_valid_reindex["Sex"] == 1]
X_valid_females = X_valid_reindex[X_valid_reindex["Sex"] == 0]

In [9]:
knn.score(X_valid_males, y_valid[X_valid_males.index])

0.8111904213677181

In [10]:
knn.score(X_valid_females, y_valid[X_valid_females.index])

0.9170506912442397

In [None]:
# Based on number of years of education

In [11]:
X_valid_reindex = X_valid.reset_index().drop("index", axis=1)

In [15]:
X_valid_uni = X_valid_reindex[X_valid_reindex["Education-Num"] > 12]
X_valid_hs = X_valid_reindex[X_valid_reindex["Education-Num"] <= 12]

In [16]:
knn.score(X_valid_uni, y_valid[X_valid_uni.index])

0.77897403419886

In [17]:
knn.score(X_valid_hs, y_valid[X_valid_hs.index])

0.8680583704904743

In [None]:
# Based on age brackets

In [11]:
X_valid_reindex = X_valid.reset_index().drop("index", axis=1)

In [28]:
X_valid_to_35 = X_valid_reindex[(X_valid_reindex["Age"] <= 35) & (X_valid_reindex["Age"] > 18)]
X_valid_to_55 = X_valid_reindex[X_valid_reindex["Age"] <= 55]
X_valid_over_55 = X_valid_reindex[X_valid_reindex["Age"] > 55]

In [30]:
knn.score(X_valid_to_35, y_valid[X_valid_to_35.index])

0.9147121535181236

In [31]:
knn.score(X_valid_to_55, y_valid[X_valid_to_55.index])

0.8526371123182057

In [29]:
knn.score(X_valid_over_55, y_valid[X_valid_over_55.index])

0.8027295285359801

## Manifold-like groupings

In [None]:
# %load manifold_perf_comp.py
# Manifold:
# 1. Predict classes proba
# 2. Loss
# 3. Cluster on loss
# 4. Group train features by clusters
# 5. Find why model didn't work well on these

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.cluster import KMeans
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-n", "--n-clusters", type=int)
parser.add_argument("--cluster-on", type=str, choices=["all", "first", "with_features", "just_features"])
# support custom segments
args = parser.parse_args()

X, y = load_breast_cancer(return_X_y=True)

X = np.concatenate([X, X + 0.1 * X.std(axis=0), X + 0.15 * X.std(axis=0)])
y = np.concatenate([y, y, y])

X_train, X_test, y_train, y_test = train_test_split(X, y)


model1 = LogisticRegression()
model2 = GaussianNB()
model3 = LogisticRegression(penalty="l1", solver="saga")

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

def make_analysis(models, X_test, y_test, loss=log_loss, labels=[0, 1], args=args):
    losses = []
    for model in models:
        y_hat = model.predict_proba(X_test)
        losses.append(np.array([loss([t], [p], labels=labels)
                                   for t, p in zip(y_test, y_hat)]).reshape(-1, 1))

    losses = np.hstack(losses)

    cluster = KMeans(n_clusters=args.n_clusters)
    if args.cluster_on == "first":
        segments = cluster.fit_predict(losses[:, 0].reshape(-1, 1))
    elif args.cluster_on == "all":
        segments = cluster.fit_predict(losses)
    elif args.cluster_on == "with_features":
        segments = cluster.fit_predict(np.hstack([X_test, losses]))
    elif args.cluster_on == "just_features":
        segments = cluster.fit_predict(np.hstack([X_test, losses]))


    data = pd.DataFrame({
        "data_index": np.hstack([i for i, _ in enumerate(y_test)] * len(models)),
        "losses": np.hstack([losses[:, i] for i in range(len(models))]),
        "segments": np.hstack([segments.astype(str)] * len(models)),
        "models": [n for ns in [[f"{model}"] * len(losses) for model in models]
                        for n in ns]
       })

    return data


data = make_analysis([model1, model3], X_test, y_test, loss=log_loss, labels=[0,1], args=args)

sns.violinplot(x="segments", y="losses", data=data, split=True, hue="models", scale="width")
plt.show()


def make_feature_segments(X, data):
    data.index = [i for i in range(len(X))] * (len(data.values) // len(X))
    return {seg: X[np.unique(data[data.segments == seg].index)] for seg in np.unique(data.segments)}


print(make_feature_segments(X_test, data))


def compare_segments(group_1, group_2, segments_dict):
    group_1_features, group_2_features = [], []
    for segment, features in segments_dict.items():
        if segment in group_1:
            group_1_features.append(features)
        elif segment in group_2:
            group_2_features.append(features)

    if len(group_1_features) > 1:
        group_1_features = np.vstack(group_1_features)
    else:
        group_1_features = group_1_features[0]

    if len(group_2_features) > 1:
        group_2_features = np.vstack(group_2_features)
    else:
        group_2_features = group_2_features[0]

    dfl = list()
    
    if group_1_features.shape[0] > group_2_features.shape[0]:
        temp = np.zeros_like(group_1_features)
        temp[0:group_2_features.shape[0], :] = group_2_features
        group_2_features = temp
    else:
        temp = np.zeros_like(group_2_features)
        temp[0:group_1_features.shape[0], :] = group_1_features
        group_1_features = temp


    for feature_idx in range(group_2_features.shape[-1]):
        df = pd.DataFrame({
            "row": feature_idx,
            "group_1_value": group_1_features[:, feature_idx],
            "group_2_value": group_2_features[:, feature_idx]})
        dfl.append(df)

    df = pd.concat(dfl)

    g = sns.FacetGrid(df, row="row", sharex=False, sharey=False)
    g.map(sns.kdeplot, "group_1_value", color="r")
    g.map(sns.kdeplot, "group_2_value", color="k")

    return g

g = compare_segments(["0"], ["1", "2"], make_feature_segments(X_test, data))
plt.show()




## kNN search in feature and target space

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd


In [39]:
q_knn = NearestNeighbors()
q_knn.fit(np.hstack([X_valid.values, y_valid.reshape(-1, 1)]))

NearestNeighbors()

In [53]:
# Most similar to X

In [41]:
X_valid.iloc[0]

Age                39.0
Workclass           4.0
Education-Num      10.0
Marital Status      0.0
Occupation          4.0
Relationship        1.0
Race                4.0
Sex                 0.0
Capital Gain        0.0
Capital Loss      625.0
Hours per week     40.0
Country            39.0
Name: 18329, dtype: float64

In [45]:
query = [np.hstack([X_valid.iloc[0].values, y_valid[0]])]

candidate_dist, candidate_idx = q_knn.kneighbors(query, n_neighbors=5, return_distance=True)

In [51]:
X_valid.iloc[candidate_idx.flatten()[1:]]

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
14794,39.0,1,13.0,0,13,1,2,0,0.0,625.0,40.0,39
27351,46.0,4,11.0,0,13,1,2,0,0.0,625.0,40.0,39
17583,49.0,4,9.0,5,10,1,4,0,0.0,625.0,40.0,33
19653,68.0,4,9.0,2,14,4,4,1,0.0,419.0,12.0,39


In [52]:
candidate_dist

array([[  0.        ,  10.14889157,  11.61895004,  14.07124728,
        210.18087449]])

In [53]:
# Most similar to X but a different prediction

In [41]:
X_valid.iloc[0]

Age                39.0
Workclass           4.0
Education-Num      10.0
Marital Status      0.0
Occupation          4.0
Relationship        1.0
Race                4.0
Sex                 0.0
Capital Gain        0.0
Capital Loss      625.0
Hours per week     40.0
Country            39.0
Name: 18329, dtype: float64

In [54]:
query = [np.hstack([X_valid.iloc[0].values, not y_valid[0]])]

candidate_dist, candidate_idx = q_knn.kneighbors(query, n_neighbors=5, return_distance=True)

In [57]:
X_valid.iloc[candidate_idx.flatten()]

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
18329,39.0,4,10.0,0,4,1,4,0,0.0,625.0,40.0,39
14794,39.0,1,13.0,0,13,1,2,0,0.0,625.0,40.0,39
27351,46.0,4,11.0,0,13,1,2,0,0.0,625.0,40.0,39
17583,49.0,4,9.0,5,10,1,4,0,0.0,625.0,40.0,33
19653,68.0,4,9.0,2,14,4,4,1,0.0,419.0,12.0,39


In [56]:
candidate_dist

array([[  1.        ,  10.19803903,  11.66190379,  14.10673598,
        210.18325338]])

In [53]:
# Most similar to X but a different attribute(s) value(s)

In [41]:
X_valid.iloc[0]

Age                39.0
Workclass           4.0
Education-Num      10.0
Marital Status      0.0
Occupation          4.0
Relationship        1.0
Race                4.0
Sex                 0.0
Capital Gain        0.0
Capital Loss      625.0
Hours per week     40.0
Country            39.0
Name: 18329, dtype: float64

In [58]:
example = X_valid.iloc[0].copy()
example.Age = 50
example["Education-Num"] = 13

In [59]:
query = [np.hstack([example.values, y_valid[0]])]

candidate_dist, candidate_idx = q_knn.kneighbors(query, n_neighbors=5, return_distance=True)

In [60]:
X_valid.iloc[candidate_idx.flatten()]

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
27351,46.0,4,11.0,0,13,1,2,0,0.0,625.0,40.0,39
17583,49.0,4,9.0,5,10,1,4,0,0.0,625.0,40.0,33
18329,39.0,4,10.0,0,4,1,4,0,0.0,625.0,40.0,39
14794,39.0,1,13.0,0,13,1,2,0,0.0,625.0,40.0,39
19653,68.0,4,9.0,2,14,4,4,1,0.0,419.0,12.0,39


In [61]:
candidate_dist

array([[ 10.24695077,  10.67707825,  11.40175425,  14.6628783 ,
        208.98325292]])