In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [9]:
# Many extra parameters available for handling particular aspects of the .csv input file
df = pd.read_csv("./titanic_data/train.csv")

In [10]:
# Provides dtype and missing value info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [11]:
# Provides stats summary for numeric columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
df.index

RangeIndex(start=0, stop=891, step=1)

In [13]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [14]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
def to_categorical(df, cat_fts=df.columns):
    """
    Takes a dataframe and a list of columns and returns
    a copy of such dataframe with those columns transformed
    into categorical type
    """
    df_out = df.copy()
    for ft in cat_fts:
        df_out[ft] = df_out[ft].astype("category")
        
    return df_out

In [7]:
df_cat = to_categorical(df)

In [24]:
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 10 columns):
age             272 non-null category
mefalsepause    272 non-null category
tumor-size      272 non-null category
inv-falsedes    272 non-null category
falsede-caps    264 non-null category
deg-malig       272 non-null category
breast          272 non-null category
breast-quad     271 non-null category
irradiat        272 non-null category
class           272 non-null category
dtypes: category(10)
memory usage: 4.5 KB


In [16]:
# Fill missing values with the mode since there are very few of them
df_cat["falsede-caps"].fillna(value=df_cat["falsede-caps"].mode()[0], inplace=True)
df_cat["breast-quad"].fillna(value=df_cat["breast-quad"].mode()[0], inplace=True)

In [17]:
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 10 columns):
age             272 non-null category
mefalsepause    272 non-null category
tumor-size      272 non-null category
inv-falsedes    272 non-null category
falsede-caps    272 non-null category
deg-malig       272 non-null category
breast          272 non-null category
breast-quad     272 non-null category
irradiat        272 non-null category
class           272 non-null category
dtypes: category(10)
memory usage: 4.5 KB


In [25]:
def dummify(df, cat_fts=df.columns):
    df_out = df.copy()
    df_out = pd.get_dummies(df_out, columns=cat_fts, drop_first=True)
    return df_out

In [26]:
df_dum = dummify(df_cat)

In [31]:
df_dum.describe()

Unnamed: 0,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,mefalsepause_lt40,mefalsepause_premefalse,tumor-size_10-14,tumor-size_15-19,tumor-size_20-24,...,falsede-caps_True,deg-malig_2,deg-malig_3,breast_right,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_True,class_recurrence-events
count,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,...,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0
mean,0.125,0.319853,0.334559,0.194853,0.022059,0.025735,0.525735,0.095588,0.095588,0.172794,...,0.202206,0.455882,0.301471,0.474265,0.378676,0.338235,0.088235,0.117647,0.246324,0.297794
std,0.331329,0.467279,0.472706,0.396818,0.147146,0.158636,0.500258,0.294568,0.294568,0.378766,...,0.402385,0.498968,0.459742,0.500258,0.485951,0.473981,0.28416,0.322784,0.431663,0.458131
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
print(df_dum.shape)

(272, 33)


In [32]:
X = df_dum.drop("class_recurrence-events", axis=1)
y = df_dum.loc[:, "class_recurrence-events"]

In [33]:
print(X.shape)
print(y.shape)

(272, 32)
(272,)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [86]:
sc = PowerTransformer()

In [87]:
sc.fit(X_train)

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [88]:
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [89]:
clf = LogisticRegression()

In [90]:
clf.fit(X_train_scaled,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [91]:
clf.score(X_test_scaled,y_test)

0.6727272727272727

In [92]:
from sklearn.metrics import confusion_matrix, f1_score, auc, roc_curve

In [93]:
y_pred = clf.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[31,  8],
       [10,  6]])

In [94]:
f1_score(y_test, y_pred)

0.39999999999999997

In [95]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc(fpr, tpr)

0.5849358974358975

In [96]:
X.columns

Index(['age_30-39', 'age_40-49', 'age_50-59', 'age_60-69', 'age_70-79',
       'mefalsepause_lt40', 'mefalsepause_premefalse', 'tumor-size_10-14',
       'tumor-size_15-19', 'tumor-size_20-24', 'tumor-size_25-29',
       'tumor-size_30-34', 'tumor-size_35-39', 'tumor-size_40-44',
       'tumor-size_45-49', 'tumor-size_5-9', 'tumor-size_50-54',
       'inv-falsedes_12-14', 'inv-falsedes_15-17', 'inv-falsedes_24-26',
       'inv-falsedes_3-5', 'inv-falsedes_6-8', 'inv-falsedes_9-11',
       'falsede-caps_True', 'deg-malig_2', 'deg-malig_3', 'breast_right',
       'breast-quad_left_low', 'breast-quad_left_up', 'breast-quad_right_low',
       'breast-quad_right_up', 'irradiat_True'],
      dtype='object')

In [82]:
clf.coef_

array([[ 1.67357061e-01, -8.10166277e-03, -1.52418458e-01,
         2.90113935e-01, -2.29230737e-02,  1.75250857e-01,
         2.04276099e-01, -8.72461859e-01, -2.28095502e-02,
         1.32767705e-01,  1.64827546e-02,  1.19981442e-01,
        -9.19121434e-02, -3.66689719e-02, -7.35957124e-02,
        -2.97320210e-01,  1.70468983e-01,  3.10698417e-02,
         9.78168250e-02,  2.05110126e-01,  3.05142446e-01,
         1.20860437e-01,  1.68205595e-01,  2.16504599e-01,
        -3.75059211e-01,  3.49253217e-01, -2.34436356e-01,
         2.22113482e-01, -2.22403363e-04, -4.93623886e-02,
         1.89386686e-01,  2.23708100e-01]])

In [5]:
import json

In [6]:
with open("./config.json") as f:
    data = json.load(f)

In [7]:
data

{'name': 'ivaa_validate',
 'type': 'mlafterdrools',
 'group': 'After Drools Validation',
 'category': 'document',
 'subcategory': '',
 'UseCase': 'Validate documents after drools',
 'Schedule': 'n/a',
 'Comments': 'builds a binary classifier for IVAA',
 'BusinessApprovalChecklist': {'What Document Types is does this model affect?': 'IVAA'}}

In [8]:
df_json = pd.DataFrame(data=data)

In [9]:
df_json.head()

Unnamed: 0,name,type,group,category,subcategory,UseCase,Schedule,Comments,BusinessApprovalChecklist
What Document Types is does this model affect?,ivaa_validate,mlafterdrools,After Drools Validation,document,,Validate documents after drools,,builds a binary classifier for IVAA,IVAA
