In [279]:
import numpy as np
import pandas as pd
from dbfread import DBF

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import  mutual_info_score, confusion_matrix

## Data Import

In [280]:
def read_dbf(filename: str) -> pd.DataFrame:
  return pd.DataFrame(DBF(filename))

In [281]:
# MEMBERS
df_members = read_dbf('./data/members.DBF')

# EXPEDITIONS
df_exped = read_dbf('./data/exped.DBF')

# PEAKS
df_peaks = read_dbf('./data/peaks.DBF')

## Data Cleaning

In [282]:
def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
  df.columns = df.columns.str.lower()

### Members

In [283]:
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [284]:
standardize_colnames(df_members)

In [285]:
col_list = [
  # KEY COLUMNS
  'expid',
  'membid',
  'peakid',
  # EXPEDITION TIME
  'myear',
  'mseason',
  # MEMBER PERSONAL DETAILS
  'fname',
  'lname',
  'sex',
  'yob',
  'calcage',
  'status',
  # MEMBER ROLE
  'leader',
  'deputy',
  'bconly',
  'nottobc',
  'support',
  'disabled',
  'hired',
  'sherpa',
  'tibetan',
  # EXPEDITION OUTCOME
  'msuccess',
  # EXPEDITION TYPE
  'msolo',
  'mtraverse',
  'mski',
  'mparapente',
  'mspeed',
  # SUMMIT BID DETAILS
  # 'mperhighpt',
  'msmtdate1',
  'msmttime1',
  # SUMMIT BID
  'msmtbid',
  'msmtterm'
]

In [286]:
df_members = df_members[col_list]
df_members.rename(columns={'calcage': 'age'}, inplace=True)

In [287]:
df_members.shape

(87156, 30)

In [288]:
# 11 – O2 system failure
# 14 – Assisting, guiding, supporting or accompanying others
# 15 – Route/camp preparation or rope fixing
# 17 – Did not climb or intent to summit
df_members = df_members.loc[~df_members.msmtterm.isin([11, 14, 15, 17]), :]
df_members.drop('msmtterm', axis=1, inplace=True)

In [289]:
# exclude members with support roles
df_members = df_members.loc[
  (df_members.bconly == False) &
  (df_members.nottobc == False) &
  (df_members.support == False) &
  (df_members.hired == False) &
  (df_members.sherpa == False) &
  (df_members.tibetan == False)
]

df_members.drop(['bconly', 'nottobc', 'support', 'disabled', 'hired', 'sherpa', 'tibetan'], axis=1, inplace=True)

In [290]:
# exclude expeditions involving non-climbing activities
df_members = df_members.loc[
  (df_members.mtraverse == False) &
  (df_members.mski == False) &
  (df_members.mparapente == False) &
  (df_members.mspeed == False),
  :
]

df_members.drop(['mtraverse', 'mski', 'mparapente', 'mspeed'], axis=1, inplace=True)

In [291]:
# keep only members with leader/climber status
df_members.status = df_members.status.str.lower()

df_members = df_members.loc[
  (df_members.status.str.contains('climb')) |
  (df_members.status.str.contains('lead')),
  :
]

df_members = df_members.loc[
  (~df_members.status.str.contains('non-climber')) &
  (~df_members.status.str.contains('ski')) &
  (~df_members.status.str.contains('paraglider')) &
  (~df_members.status.str.contains('camera')) &
  (~df_members.status.str.contains('photo')) &
  (~df_members.status.str.contains('film')) &
  (~df_members.status.str.contains('reporter')) &
  (~df_members.status.str.contains('journalist')) &
  (~df_members.status.str.contains('coach')) &
  (~df_members.status.str.contains('advisor')) &
  (~df_members.status.str.contains('instructor')) &
  (~df_members.status.str.contains('support')) &
  (~df_members.status.str.contains('guide')) &
  (~df_members.status.str.contains('torch')) &
  (~df_members.status.str.contains('only'))
]

df_members.drop(['status'], axis=1, inplace=True)

In [292]:
df_members = df_members.loc[df_members.sex.isin(['M', 'F']), :]
# df_members['gender_male'] = df_members.sex == 'M'
# df_members.drop('sex', axis=1, inplace=True)

In [293]:
df_members.head()

Unnamed: 0,expid,membid,peakid,myear,mseason,fname,lname,sex,yob,age,leader,deputy,msuccess,msolo,msmtdate1,msmttime1,msmtbid
192,AMAD85101,2,AMAD,1985,1,Carlo,Alde,M,1964,20,False,False,True,False,1985-04-23,,5
209,AMAD85301,7,AMAD,1985,3,Hermann,Comploj,M,1957,28,False,False,True,False,1985-11-03,,5
213,AMAD85303,2,AMAD,1985,3,Maximilian-Horst,Fankhauser,M,1944,41,False,False,True,False,1985-10-31,,5
458,AMAD90301,6,AMAD,1990,3,Martha,Deflorin,F,1951,39,False,False,False,False,,,1
489,AMAD90307,2,AMAD,1990,3,David,Auble,M,1959,31,False,False,False,False,1990-11-12,,4


In [294]:
df_members.shape

(24255, 17)

In [295]:
df_members.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'fname', 'lname',
       'sex', 'yob', 'age', 'leader', 'deputy', 'msuccess', 'msolo',
       'msmtdate1', 'msmttime1', 'msmtbid'],
      dtype='object')

#### Data Checks

In [296]:
# a single entry per expedition/member combination
df_members.groupby(['expid', 'membid']).size().max() == 1

True

In [297]:
# member success does not match summit bid entry
df_members.loc[df_members.msuccess != (df_members.msmtbid == 5)]
df_members.drop('msmtbid', axis=1, inplace=True)

#### Feature Engineering

In [298]:
# compute a proxy for climber experience: the cumulative count of expeditions the climber has been on
df_members = df_members.sort_values(['myear', 'mseason', 'msmtdate1', 'msmttime1']).reset_index(drop=True)
df_members['member_experience'] = df_members.groupby(['fname', 'lname', 'sex', 'yob'])['expid'].transform('cumcount')
df_members.drop(['fname', 'lname', 'yob', 'msmtdate1', 'msmttime1'], inplace=True, axis=1)

In [299]:
# keep only expeditions since 2000
df_members.myear = df_members.myear.astype(int)
df_members = df_members.loc[df_members.myear >= 2000]

### Peaks

In [300]:
standardize_colnames(df_peaks)
df_peaks.head()

Unnamed: 0,peakid,pkname,pkname2,location,heightm,heightf,himal,region,open,unlisted,...,peakmemo,pyear,pseason,pexpid,psmtdate,pcountry,psummiters,psmtnote,refermemo,photomemo
0,AMAD,Ama Dablam,Amai Dablang,Khumbu Himal,6814,22356,12,2,True,False,...,"Other map altitudes:\r\n 6814m - HMG-MT, HMG...",1961,1,AMAD61101,Mar 13,"New Zealand, USA, UK","Mike Gill, Wally Romanes, Barry Bishop, Michae...",,,W Face (High 126:5 May 1993)\r\nSE Face (High ...
1,AMPG,Amphu Gyabjen,Amphu Gyabien,Khumbu Himal (N of Ama Dablam),5630,18471,12,2,True,False,...,"Other map altitudes:\r\n 5630m - HMG-Finn, N...",1953,1,AMPG53101,Apr 11,UK,"John Hunt, Tom Bourdillon",,,
2,ANN1,Annapurna I,,Annapurna Himal,8091,26545,1,5,True,False,...,"Other map altitudes:\r\n 8091m - HMG-MT, HMG...",1950,1,ANN150101,Jun 03,France,"Maurice Herzog, Louis Lachenal",,Dyhrenfurth history 1950-1977 (MM 58:44-47 Nov...,S Face (High 122:3 Jan 1993) (Beghin accident)...
3,ANN2,Annapurna II,,Annapurna Himal,7937,26040,1,5,True,False,...,"Other map altitudes:\r\n 7937m - HMG-MT, HMG...",1960,1,ANN260101,May 17,"UK, Nepal","Richard Grant, Chris Bonington, Ang Nyima Sherpa",,Dyhrenfurth history 1960-1976 (MM 51:36-37 Sep...,N Face (MM 51:36 Sep 1976)
4,ANN3,Annapurna III,,Annapurna Himal,7555,24787,1,5,True,False,...,"Other map altitudes:\r\n 7555m - HMG-MT, HMG...",1961,1,ANN361101,May 06,India,"Mohan S. Kohli, Sonam Gyatso, Sonam Girmi",,,S Side (MM 125:11 Jan 1989)\r\nSW Face (MM 71:...


In [301]:
columns = df_peaks.columns
print(columns)

Index(['peakid', 'pkname', 'pkname2', 'location', 'heightm', 'heightf',
       'himal', 'region', 'open', 'unlisted', 'trekking', 'trekyear',
       'restrict', 'phost', 'pstatus', 'peakmemo', 'pyear', 'pseason',
       'pexpid', 'psmtdate', 'pcountry', 'psummiters', 'psmtnote', 'refermemo',
       'photomemo'],
      dtype='object')


In [302]:
col_list = [
  'peakid',
  'heightm',
  'himal',
  'region',
  'open', # Peak open
  'unlisted', # Peak unlisted
  'trekking', # Trekking peak
  'pstatus', # Peak climbing status
  # 'pyear' # First ascent year 
  # 'pseason', # First ascent season
]

In [303]:
df_peaks = df_peaks[col_list]

In [304]:
df_peaks = df_peaks.loc[
  (df_peaks.trekking == False) &
  (df_peaks.open == True) &
  (df_peaks.unlisted == False)]
df_peaks.drop(['trekking', 'open', 'unlisted'], axis=1, inplace=True)

In [305]:
df_peaks.himal = df_peaks.himal.astype(str)
df_peaks.region = df_peaks.region.astype(str)

df_peaks.climbed = df_peaks.pstatus == 2
df_peaks.drop('pstatus', axis=1, inplace=True)

  df_peaks.climbed = df_peaks.pstatus == 2


In [306]:
df_peaks.shape

(384, 4)

In [307]:
df_peaks.head()

Unnamed: 0,peakid,heightm,himal,region
0,AMAD,6814,12,2
1,AMPG,5630,12,2
2,ANN1,8091,1,5
3,ANN2,7937,1,5
4,ANN3,7555,1,5


### Expeditions

In [308]:
standardize_colnames(df_exped)
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


In [309]:
col_list = [
  # ID COLUMNS
  'expid',
  'peakid',
  # EXPEDICTION TIME
  'year',
  'season',
  'bcdate',
  'smtdate',
  'smttime',
  'termdate',
  'host',
  # ASCENT DETAILS
  'termreason',
  # EXPEDITION TYPE
  'traverse',
  'ski',
  'parapente',
  # EQUIPMENT
  'camps',
  'rope',
  # TEAM SIZE
  'totmembers',
  'smtmembers',
  # 'mdeaths',
  'tothired',
  'smthired',
  # 'hdeaths',
  'nohired',
  # MISC
  'agency'
]

In [310]:
df_exped = df_exped[col_list]

In [311]:
# 12 – Did not attempt climb
# 13 – Attempt rumored 
df_exped = df_exped.loc[~df_exped.termreason.isin([12, 13]), :]
df_exped.drop('termreason', axis=1, inplace=True)

In [312]:
df_exped = df_exped.loc[
  (df_exped.traverse == False) &
  (df_exped.parapente == False) &
  (df_exped.ski == False)
]
df_exped.drop(['traverse', 'parapente', 'ski'], axis=1, inplace=True)

In [313]:
df_exped.host = df_exped.host.astype(str)

#### Feature Engineering

In [314]:
# compute agency experience
df_exped.sort_values(['year', 'season', 'bcdate', 'smtdate', 'smttime', 'termdate'], inplace=True)
df_exped['agency_experience'] = df_exped.groupby('agency')['expid'].transform('cumcount')
df_exped.drop(['bcdate', 'smtdate', 'smttime', 'termdate', 'agency'], axis=1, inplace=True)

In [315]:
df_exped.year = df_exped.year.astype(int)
df_exped = df_exped.loc[df_exped.year >= 2000]

In [316]:
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,camps,rope,totmembers,smtmembers,tothired,smthired,nohired,agency_experience
3589,JANU00101,JANU,2000,1,1,2,0,3,2,0,0,True,0
3506,BARU00102,BARU,2000,1,1,2,0,5,5,4,4,False,15
3537,EVER00105,EVER,2000,1,1,4,0,9,3,6,0,False,5
3607,MANA00101,MANA,2000,1,1,3,400,4,3,0,0,True,10
3495,AMAD00111,AMAD,2000,1,1,2,0,6,0,0,0,True,289


### Data Merge

In [317]:
df = df_members.merge(df_exped,
                left_on=['expid', 'peakid', 'myear', 'mseason'],
                right_on=['expid', 'peakid', 'year', 'season'],
                how='inner')
df.drop(['myear', 'mseason', 'year'], axis=1, inplace=True)

In [318]:
df = df.merge(df_peaks, on=['peakid'], how='inner')

In [319]:
df.head()

Unnamed: 0,expid,membid,peakid,sex,age,leader,deputy,msuccess,msolo,member_experience,...,rope,totmembers,smtmembers,tothired,smthired,nohired,agency_experience,heightm,himal,region
0,HCHI00101,1,HCHI,M,63,True,False,False,False,0,...,0,4,0,3,0,False,107,7029,12,2
1,HCHI00101,2,HCHI,M,50,False,False,False,False,0,...,0,4,0,3,0,False,107,7029,12,2
2,HCHI00101,3,HCHI,M,56,False,False,False,False,0,...,0,4,0,3,0,False,107,7029,12,2
3,HCHI00101,4,HCHI,M,60,False,False,False,False,0,...,0,4,0,3,0,False,107,7029,12,2
4,DANG00101,1,DANG,M,65,True,False,False,False,0,...,0,4,0,1,0,False,1331,6355,6,1


In [320]:
df.drop(['expid', 'membid', 'peakid'], axis=1, inplace=True)

In [321]:
df.dtypes

sex                  object
age                   int64
leader                 bool
deputy                 bool
msuccess               bool
msolo                  bool
member_experience     int64
season                int64
host                 object
camps                 int64
rope                  int64
totmembers            int64
smtmembers            int64
tothired              int64
smthired              int64
nohired                bool
agency_experience     int64
heightm               int64
himal                object
region               object
dtype: object

## Exploratory Data Analysis
### Dataset balance

In [322]:
(df.msuccess.value_counts() / df.shape[0]).round(2)

msuccess
False    0.53
True     0.47
Name: count, dtype: float64

The binary target variable is well balanced in the dataset, with a .53/.47 split.

### Feature Importance

In [323]:
global_success_rate = round(df.msuccess.mean(), 2)
df.groupby(['sex']).msuccess.agg(['mean', 'count']) / global_success_rate

Unnamed: 0_level_0,mean,count
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.065935,8600.0
M,0.975663,40408.510638


The database is highly imbalanced with respect to gender, with nearly 5 times as many male climbers as female climbers. Female drivers have a relatively higher success rate.

In [324]:
df.groupby(['leader']).msuccess.agg(['mean', 'count']) / global_success_rate

Unnamed: 0_level_0,mean,count
leader,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.97416,40034.042553
True,1.068874,8974.468085


Expedition members with the status of 'leader' have a relatively higher success rate

In [325]:
from IPython.display import display

In [326]:
categorical_vars = df.select_dtypes(exclude='number')
for c in categorical_vars:
  df_group = df.groupby(c).msuccess.agg(['mean', 'count'])
  df_group['diff'] = df_group['mean'] - global_success_rate
  df_group['likelihood'] = df_group['mean'] / global_success_rate
  display(df_group)
  print()

Unnamed: 0_level_0,mean,count,diff,likelihood
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,0.50099,4042,0.03099,1.065935
M,0.458561,18992,-0.011439,0.975663





Unnamed: 0_level_0,mean,count,diff,likelihood
leader,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.457855,18816,-0.012145,0.97416
True,0.502371,4218,0.032371,1.068874





Unnamed: 0_level_0,mean,count,diff,likelihood
deputy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.465131,22828,-0.004869,0.989639
True,0.563107,206,0.093107,1.1981





Unnamed: 0_level_0,mean,count,diff,likelihood
msuccess,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.0,12300,-0.47,0.0
True,1.0,10734,0.53,2.12766





Unnamed: 0_level_0,mean,count,diff,likelihood
msolo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.465765,23003,-0.004235,0.99099
True,0.645161,31,0.175161,1.372684





Unnamed: 0_level_0,mean,count,diff,likelihood
host,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.125,8,-0.345,0.265957
1,0.456809,19842,-0.013191,0.971934
2,0.524693,3179,0.054693,1.116369
3,0.2,5,-0.27,0.425532





Unnamed: 0_level_0,mean,count,diff,likelihood
nohired,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.484478,19714,0.014478,1.030804
True,0.356325,3320,-0.113675,0.758139





Unnamed: 0_level_0,mean,count,diff,likelihood
himal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.296896,741,-0.173104,0.631694
10,0.507937,63,0.037937,1.080716
11,0.5,44,0.03,1.06383
12,0.518105,14416,0.048105,1.102351
13,0.380165,121,-0.089835,0.808862
14,0.361732,716,-0.108268,0.769642
15,0.429402,2925,-0.040598,0.913621
16,0.323353,167,-0.146647,0.687986
17,0.355769,104,-0.114231,0.756956
18,0.345376,1384,-0.124624,0.734842





Unnamed: 0_level_0,mean,count,diff,likelihood
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.459357,529,-0.010643,0.977356
2,0.50901,15428,0.03901,1.082999
3,0.354651,172,-0.115349,0.754577
4,0.42336,2988,-0.04664,0.900766
5,0.348257,2323,-0.121743,0.740971
6,0.306231,1316,-0.163769,0.651555
7,0.359712,278,-0.110288,0.765345





### Mutual Information

In [327]:
def compute_success_mutual_info(series: pd.Series) -> float:
  return mutual_info_score(df.msuccess, series)

In [328]:
mutual_scores = df.drop('msuccess', axis=1).select_dtypes(exclude='number').apply(compute_success_mutual_info)
mutual_scores.sort_values(ascending=False)

himal      0.013210
region     0.009326
nohired    0.004135
host       0.001225
leader     0.000595
sex        0.000522
deputy     0.000171
msolo      0.000087
dtype: float64

### Correlation

In [329]:
def compute_success_correlation(series):
  return series.corrwith(df.msuccess)

In [330]:
corrs = df.select_dtypes('number').corrwith(df.msuccess)
corrs.sort_values(ascending=False)

smtmembers           0.411841
smthired             0.313223
camps                0.202339
tothired             0.143533
heightm              0.119157
member_experience    0.058556
totmembers           0.056964
agency_experience   -0.020643
rope                -0.058642
season              -0.075616
age                 -0.122294
dtype: float64

In [360]:
(df.camps / df.heightm).corr(df.msuccess)

0.1869802373704067

## Feature Preparation
### Validation Framework

In [331]:
def split_dataset(df: pd.DataFrame, target_var: str):
  df = df.copy()

  df_train_full, df_test = train_test_split(df.drop(target_var, axis=1), test_size=.2, random_state=42)
  df_train, df_val = train_test_split(df_train_full, test_size=.25, random_state=42)

  y_train = df.loc[df_train.index, target_var].reset_index(drop=True)
  y_val = df.loc[df_val.index, target_var].reset_index(drop=True)
  y_test = df.loc[df_test.index, target_var].reset_index(drop=True)

  df_train.reset_index(drop=True, inplace=True)
  df_val.reset_index(drop=True, inplace=True)
  df_test.reset_index(drop=True, inplace=True)

  return (df_train, y_train), (df_val, y_val), (df_test, y_test)

In [332]:
(df_train, y_train), (df_val, y_val), (df_test, y_test) = split_dataset(df, 'msuccess')

In [333]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(13820, 19)
(4607, 19)
(4607, 19)


### Variable Encoding

In [334]:
def encode_categorical_vars(df: pd.DataFrame, encoder: OneHotEncoder = None) -> pd.DataFrame:
  df = df.copy()

  # split datatypes
  df_categorical = df.select_dtypes(exclude='number')
  df_numerical = df.select_dtypes(include='number')

  # create and fit a new encoder if not supplied
  if not encoder:
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64, handle_unknown='infrequent_if_exist')
    encoder.fit(df_categorical)

  # perform one-hot-encoding on the categorical variables
  df_categorical_encoded = pd.DataFrame(
    data=encoder.transform(df_categorical),
    columns=encoder.get_feature_names_out()
  )

  # merge back the encoded variables with the numerical variables
  df_encoded = df_numerical.merge(df_categorical_encoded, left_index=True, right_index=True)

  return df_encoded, encoder

In [335]:
X_train, encoder = encode_categorical_vars(df_train)
X_val, _ = encode_categorical_vars(df_val, encoder)
X_test, _ = encode_categorical_vars(df_test, encoder)

In [336]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(13820, 52)
(4607, 52)
(4607, 52)


## Model Training
### Decision Tree
#### Base Model

In [337]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [338]:
y_pred = model.predict(X_val)

In [339]:
confusion_matrix(y_val, y_pred)

array([[1985,  520],
       [ 524, 1578]])