# PARiS Classifier

## Development Dataset

Load the data and the weights:

In [None]:
from shutil import copyfile

# Copy the helper module from the input directory into the working directory.
# Later cells do `from fairness import ...`; presumably this copy is what makes
# the module importable -- confirm against the runtime's working directory.
copyfile("../input/fairness.py", "../working/fairness.py")
import pandas as pd

In [None]:
import pickle
from fairness import read_skills

# Seed used for the train/test split further down.
SEED = 0

# Load the PARiS weights. The context manager closes the file handle as soon
# as the pickle has been read instead of leaving it open until garbage
# collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load PARiS.pickle from a trusted source.
with open("../input/PARiS.pickle", "rb") as weights_file:
    WEIGHTS = pickle.load(weights_file)

all_skills = read_skills("../input/skills.txt")

# Column roles: the outcome, the model inputs, and the demographic attributes
# that are inspected for correlation with the outcome.
target = "Interview"
predictors = all_skills  # every entry returned by read_skills is a predictor
demographics = ["Veteran", "Female", "URM", "Disability"]

# The first CSV column is used as the row index.
data = pd.read_csv("../input/resumes_development.csv", index_col=0)
data.head()

View the correlations between demographic features and the target variable:

In [None]:
# Correlation matrix over the target column and the demographic columns.
data[[target, *demographics]].corr()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from fairness import PARiSClassifier, evaluate_model, rank_models

In [None]:
# Split the development data in half, stratified on the target so both halves
# keep the same interview rate; SEED makes the shuffle repeatable.
d_train, d_test = train_test_split(
    data,
    test_size=0.5,
    stratify=data[target],
    shuffle=True,
    random_state=SEED,
)

# Feature matrix / label vector for each half.
X_train, y_train = d_train[predictors], d_train[target]
X_test, y_test = d_test[predictors], d_test[target]

# Report the size and the mean of the target column for each half.
print("Train: N = {0}, P(Interview) = {1:.5f}".format(len(X_train), y_train.mean()))
print("Test:  N = {0}, P(Interview) = {1:.5f}".format(len(X_test), y_test.mean()))

How well would a logistic regression separate the interviewed and rejected applicants?

In [None]:
# Baseline: L2-penalised logistic regression on the training half.
# scikit-learn's fit() returns the estimator itself, so construction and
# training are chained here.
logres = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    fit_intercept=True,
).fit(X_train, y_train)

# Score the baseline's predictions on the held-out half.
evaluate_model(y_test, logres.predict(X_test))

That's pretty good. But the PARiS system can do an even better job:

In [None]:
# Build the PARiS classifier from the WEIGHTS object loaded from PARiS.pickle.
paris = PARiSClassifier(WEIGHTS)
# NOTE(review): the weights are supplied up front, so it is unclear from this
# cell what fit() adjusts -- confirm in fairness.py.
paris.fit(X_train, y_train)
# Score PARiS on the same held-out half used for the logistic regression.
evaluate_model(y_test, paris.predict(X_test))

Compare other models to PARiS:

In [None]:
# Candidate classifiers to compare. PARiS is deliberately first: a later cell
# takes clfs[0] as the PARiS classifier (presumably rank_models preserves this
# order -- confirm).
# NOTE(review): DecisionTreeClassifier is created without random_state, so its
# results may vary between runs; consider random_state=SEED.
models = [
    PARiSClassifier(WEIGHTS),
    LogisticRegression(solver="liblinear", penalty="l2", fit_intercept=True),
    DecisionTreeClassifier(),
    KNeighborsClassifier(n_neighbors=3),
]
print("{} models".format(len(models)))

In [None]:
# Rank the candidate models using the training half and the held-out half of
# the development data. The three return values are unpacked as rdf, cols and
# clfs; the next cell displays rdf[cols] sorted by its "F1" column.
# NOTE(review): presumably rank_models fits each model and evaluates it per
# demographic attribute -- confirm in fairness.py.
rdf, cols, clfs = rank_models(models, d_train, y_train, d_test, y_test, predictors, demographics)

In [None]:
# Show the selected result columns, best F1 first, rounded to three decimals.
(
    rdf[cols]
    .sort_values("F1", ascending=False)
    .round(decimals=3)
)

## Pilot Dataset

Load the data:

In [None]:
# Read the pilot resumes; the first CSV column is used as the row index.
pilot = pd.read_csv("../input/resumes_pilot.csv", index_col=0)
# Preview the first five rows (the default for head()).
pilot.head(n=5)

Check the correlations:

In [None]:
# Correlation matrix over the target column and the demographic columns of
# the pilot data.
pilot[[target, *demographics]].corr()

In [None]:
Compare the models on the pilot data:

In [None]:
# Evaluation frame for the pilot data: skill columns plus demographic columns.
d_pilot = pilot[[*predictors, *demographics]]
y_pilot = pilot[target]

# Same call as for the development data, with the training arguments
# (d_train, y_train) unchanged and only the evaluation frame and labels
# swapped for the pilot data. This rebinds rdf, cols and clfs.
rdf, cols, clfs = rank_models(
    models, d_train, y_train, d_pilot, y_pilot, predictors, demographics
)

In [None]:
# Show the selected pilot result columns, best F1 first, rounded to three
# decimals.
(
    rdf[cols]
    .sort_values("F1", ascending=False)
    .round(decimals=3)
)

Inspect the false negatives produced by PARiS:

In [None]:
from fairness import unvectorize

# First classifier returned by rank_models; the model list puts PARiS first
# (presumably the order is preserved -- confirm).
pclf = clfs[0]

# Second column of predict_proba, printed below as P(I|X), for every pilot row.
interview_probs = pclf.predict_proba(pilot[predictors])[:, 1]

for i, (yt, pa, x) in enumerate(zip(pilot[target], interview_probs, pilot.values)):
    # Keep only false negatives: the applicant was interviewed (target == 1)
    # but the predicted probability falls below the classifier's threshold.
    if not (yt == 1 and pa < pclf.threshold):
        continue
    # i is the positional row number, not the DataFrame index label.
    print("Applicant {0}, P(I|X) = {1:.3f}".format(i, pa))
    # x is the full row (all pilot columns), paired with the column names.
    skills = unvectorize(pilot.columns, x)
    print("{}".format(", ".join(skills)))
    print()

**Hypothesis:** Many employees have a history of playing sports. None of the applicants flagged as false negatives had athletics on their resume. Perhaps the model is biased against applicants without a sports background.

There are six features related to sports, and all six are positively correlated with getting an interview in our training dataset:

In [None]:
# The six sports-related feature columns.
sports = ["Basketball", "Football", "Baseball", "Swimming", "Soccer", "Diving"]

# Correlation matrix over the target and the sports columns in the
# development data.
data[[target, *sports]].corr()

PARiS also has positive weights for all six sports features:

In [None]:
# Position of each sports feature within the predictor list, in the same
# order as `sports`.
sports_idx = list(map(predictors.index, sports))
sports_idx

In [None]:
# One row per sports feature, holding the entries of WEIGHTS[0] at the sports
# positions.
# NOTE(review): assumes WEIGHTS[0] supports list-based indexing (e.g. a NumPy
# array aligned with the predictor order) -- confirm.
pd.DataFrame(WEIGHTS[0][sports_idx], index=sports)