In [270]:
import numpy as np
import pandas as pd
from dbfread import DBF

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Data Import

In [271]:
def read_dbf(filename: str) -> pd.DataFrame:
  return pd.DataFrame(DBF(filename))

In [272]:
# MEMBERS
df_members = read_dbf('./data/members.DBF')

# EXPEDITIONS
df_exped = read_dbf('./data/exped.DBF')

# PEAKS
df_peaks = read_dbf('./data/peaks.DBF')

## Data Cleaning

In [273]:
def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
  df.columns = df.columns.str.lower()

### Members

In [274]:
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [275]:
standardize_colnames(df_members)

In [276]:
col_list = [
  # KEY COLUMNS
  'expid',
  'membid',
  'peakid',
  # EXPEDITION TIME
  'myear',
  'mseason',
  # MEMBER PERSONAL DETAILS
  'fname',
  'lname',
  'sex',
  'yob',
  'calcage',
  'status',
  # MEMBER ROLE
  'leader',
  'deputy',
  'bconly',
  'nottobc',
  'support',
  'disabled',
  'hired',
  'sherpa',
  'tibetan',
  # EXPEDITION OUTCOME
  'msuccess',
  # EXPEDITION TYPE
  'msolo',
  'mtraverse',
  'mski',
  'mparapente',
  'mspeed',
  # SUMMIT BID DETAILS
  'mperhighpt',
  'msmtdate1',
  'msmttime1',
  # EXPEDITION ROUTE/ASCENT
  # 'mroute1',
  # 'mroute2',
  # 'mroute3',
  # 'mascent1',
  # 'mascent2',
  # 'mascent3',
  # OXYGEN USE
  # 'mo2used',
  # 'mo2none',
  # 'mo2climb',
  # 'mo2descent',
  # 'mo2sleep',
  # 'mo2medical',
  # 'mo2note',
  # CLIMBER DEATH
  # 'death',
  # 'deathtype',
  # 'deathclass',
  # 'ams',
  # 'weather',
  # CLIMBER INJURY
  # 'injury',
  # 'injurytype',
  # SUMMIT BID
  'msmtbid',
  'msmtterm'
]

In [277]:
df_members = df_members[col_list]
df_members.rename(columns={'calcage': 'age'}, inplace=True)

In [278]:
df_members.shape

(87156, 31)

In [279]:
# 11 – O2 system failure
# 14 – Assisting, guiding, supporting or accompanying others
# 15 – Route/camp preparation or rope fixing
# 17 – Did not climb or intent to summit
df_members = df_members.loc[~df_members.msmtterm.isin([11, 14, 15, 17]), :]
df_members.drop('msmtterm', axis=1, inplace=True)

In [280]:
# exclude members with support roles
df_members = df_members.loc[
  (df_members.bconly == False) &
  (df_members.nottobc == False) &
  (df_members.support == False) &
  (df_members.hired == False) &
  (df_members.sherpa == False) &
  (df_members.tibetan == False)
]

df_members.drop(['bconly', 'nottobc', 'support', 'disabled', 'hired', 'sherpa', 'tibetan'], axis=1, inplace=True)

In [281]:
# exclude expeditions involving non-climbing activities
df_members = df_members.loc[
  (df_members.mtraverse == False) &
  (df_members.mski == False) &
  (df_members.mparapente == False) &
  (df_members.mspeed == False),
  :
]

df_members.drop(['mtraverse', 'mski', 'mparapente', 'mspeed'], axis=1, inplace=True)

In [282]:
# keep only members with leader/climber status
df_members.status = df_members.status.str.lower()

df_members = df_members.loc[
  (df_members.status.str.contains('climb')) |
  (df_members.status.str.contains('lead')),
  :
]

df_members = df_members.loc[
  (~df_members.status.str.contains('non-climber')) &
  (~df_members.status.str.contains('ski')) &
  (~df_members.status.str.contains('paraglider')) &
  (~df_members.status.str.contains('camera')) &
  (~df_members.status.str.contains('photo')) &
  (~df_members.status.str.contains('film')) &
  (~df_members.status.str.contains('reporter')) &
  (~df_members.status.str.contains('journalist')) &
  (~df_members.status.str.contains('coach')) &
  (~df_members.status.str.contains('advisor')) &
  (~df_members.status.str.contains('instructor')) &
  (~df_members.status.str.contains('support')) &
  (~df_members.status.str.contains('guide')) &
  (~df_members.status.str.contains('torch')) &
  (~df_members.status.str.contains('only'))
]

df_members.drop(['status'], axis=1, inplace=True)

In [283]:
df_members = df_members.loc[df_members.sex.isin(['M', 'F']), :]
df_members['gender_male'] = df_members.sex == 'M'
df_members.drop('sex', axis=1, inplace=True)

In [284]:
df_members.head()

Unnamed: 0,expid,membid,peakid,myear,mseason,fname,lname,yob,age,leader,deputy,msuccess,msolo,mperhighpt,msmtdate1,msmttime1,msmtbid,gender_male
192,AMAD85101,2,AMAD,1985,1,Carlo,Alde,1964,20,False,False,True,False,6814,1985-04-23,,5,True
209,AMAD85301,7,AMAD,1985,3,Hermann,Comploj,1957,28,False,False,True,False,6814,1985-11-03,,5,True
213,AMAD85303,2,AMAD,1985,3,Maximilian-Horst,Fankhauser,1944,41,False,False,True,False,6814,1985-10-31,,5,True
458,AMAD90301,6,AMAD,1990,3,Martha,Deflorin,1951,39,False,False,False,False,0,,,1,False
489,AMAD90307,2,AMAD,1990,3,David,Auble,1959,31,False,False,False,False,6600,1990-11-12,,4,True


In [285]:
df_members.shape

(24255, 18)

In [286]:
df_members.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'fname', 'lname',
       'yob', 'age', 'leader', 'deputy', 'msuccess', 'msolo', 'mperhighpt',
       'msmtdate1', 'msmttime1', 'msmtbid', 'gender_male'],
      dtype='object')

#### Data Checks

In [287]:
# a single entry per expedition/member combination
df_members.groupby(['expid', 'membid']).size().max() == 1

True

In [288]:
# member success does not match summit bid entry
df_members.loc[df_members.msuccess != (df_members.msmtbid == 5)]
df_members.drop('msmtbid', axis=1, inplace=True)

#### Feature Engineering

In [289]:
# compute a proxy for climber experience: the cumulative count of expeditions the climber has been on
df_members = df_members.sort_values(['myear', 'mseason', 'msmtdate1', 'msmttime1']).reset_index(drop=True)
df_members['member_experience'] = df_members.groupby(['fname', 'lname', 'gender_male', 'yob'])['expid'].transform('cumcount')
df_members.drop(['fname', 'lname', 'yob', 'msmtdate1', 'msmttime1'], inplace=True, axis=1)

In [290]:
# keep only expeditions since 2000
df_members.myear = df_members.myear.astype(int)
df_members = df_members.loc[df_members.myear >= 2000]

### Peaks

In [291]:
standardize_colnames(df_peaks)
df_peaks.head()

Unnamed: 0,peakid,pkname,pkname2,location,heightm,heightf,himal,region,open,unlisted,...,peakmemo,pyear,pseason,pexpid,psmtdate,pcountry,psummiters,psmtnote,refermemo,photomemo
0,AMAD,Ama Dablam,Amai Dablang,Khumbu Himal,6814,22356,12,2,True,False,...,"Other map altitudes:\r\n 6814m - HMG-MT, HMG...",1961,1,AMAD61101,Mar 13,"New Zealand, USA, UK","Mike Gill, Wally Romanes, Barry Bishop, Michae...",,,W Face (High 126:5 May 1993)\r\nSE Face (High ...
1,AMPG,Amphu Gyabjen,Amphu Gyabien,Khumbu Himal (N of Ama Dablam),5630,18471,12,2,True,False,...,"Other map altitudes:\r\n 5630m - HMG-Finn, N...",1953,1,AMPG53101,Apr 11,UK,"John Hunt, Tom Bourdillon",,,
2,ANN1,Annapurna I,,Annapurna Himal,8091,26545,1,5,True,False,...,"Other map altitudes:\r\n 8091m - HMG-MT, HMG...",1950,1,ANN150101,Jun 03,France,"Maurice Herzog, Louis Lachenal",,Dyhrenfurth history 1950-1977 (MM 58:44-47 Nov...,S Face (High 122:3 Jan 1993) (Beghin accident)...
3,ANN2,Annapurna II,,Annapurna Himal,7937,26040,1,5,True,False,...,"Other map altitudes:\r\n 7937m - HMG-MT, HMG...",1960,1,ANN260101,May 17,"UK, Nepal","Richard Grant, Chris Bonington, Ang Nyima Sherpa",,Dyhrenfurth history 1960-1976 (MM 51:36-37 Sep...,N Face (MM 51:36 Sep 1976)
4,ANN3,Annapurna III,,Annapurna Himal,7555,24787,1,5,True,False,...,"Other map altitudes:\r\n 7555m - HMG-MT, HMG...",1961,1,ANN361101,May 06,India,"Mohan S. Kohli, Sonam Gyatso, Sonam Girmi",,,S Side (MM 125:11 Jan 1989)\r\nSW Face (MM 71:...


In [292]:
columns = df_peaks.columns
print(columns)

Index(['peakid', 'pkname', 'pkname2', 'location', 'heightm', 'heightf',
       'himal', 'region', 'open', 'unlisted', 'trekking', 'trekyear',
       'restrict', 'phost', 'pstatus', 'peakmemo', 'pyear', 'pseason',
       'pexpid', 'psmtdate', 'pcountry', 'psummiters', 'psmtnote', 'refermemo',
       'photomemo'],
      dtype='object')


In [293]:
col_list = [
  'peakid',
  'heightm',
  'himal',
  'region',
  'open', # Peak open
  'unlisted', # Peak unlisted
  'trekking', # Trekking peak
  'pstatus', # Peak climbing status
  'pyear', # First ascent year 
  'pseason', # First ascent season
]

In [294]:
df_peaks = df_peaks[col_list]

In [295]:
df_peaks = df_peaks.loc[
  (df_peaks.trekking == False) &
  (df_peaks.open == True) &
  (df_peaks.unlisted == False)]
df_peaks.drop(['trekking', 'open', 'unlisted'], axis=1, inplace=True)

In [296]:
df_peaks.himal = df_peaks.himal.astype(str)
df_peaks.region = df_peaks.region.astype(str)
df_peaks.pseason = df_peaks.pseason.astype(str)

df_peaks.climbed = df_peaks.pstatus == 2
df_peaks.drop('pstatus', axis=1, inplace=True)

  df_peaks.climbed = df_peaks.pstatus == 2


In [297]:
df_peaks.shape

(384, 6)

In [298]:
df_peaks.head()

Unnamed: 0,peakid,heightm,himal,region,pyear,pseason
0,AMAD,6814,12,2,1961,1
1,AMPG,5630,12,2,1953,1
2,ANN1,8091,1,5,1950,1
3,ANN2,7937,1,5,1960,1
4,ANN3,7555,1,5,1961,1


### Expeditions

In [299]:
standardize_colnames(df_exped)
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


In [300]:
col_list = [
  # ID COLUMNS
  'expid',
  'peakid',
  # EXPEDICTION TIME
  'year',
  'season',
  'bcdate',
  'smtdate',
  'smttime',
  'termdate',
  'host',
  # ASCENT DETAILS
  'termreason',
  # EXPEDITION TYPE
  'traverse',
  'ski',
  'parapente',
  # EQUIPMENT
  'camps',
  'rope',
  # TEAM SIZE
  'totmembers',
  'smtmembers',
  # 'mdeaths',
  'tothired',
  'smthired',
  # 'hdeaths',
  'nohired',
  # MISC
  'agency'
]

In [301]:
df_exped = df_exped[col_list]

In [302]:
# 12 – Did not attempt climb
# 13 – Attempt rumored 
df_exped = df_exped.loc[~df_exped.termreason.isin([12, 13]), :]
df_exped.drop('termreason', axis=1, inplace=True)

In [303]:
df_exped = df_exped.loc[
  (df_exped.traverse == False) &
  (df_exped.parapente == False) &
  (df_exped.ski == False)
]
df_exped.drop(['traverse', 'parapente', 'ski'], axis=1, inplace=True)

In [304]:
df_exped.host = df_exped.host.astype(str)

#### Feature Engineering

In [305]:
# compute agency experience
df_exped.sort_values(['year', 'season', 'bcdate', 'smtdate', 'smttime', 'termdate'], inplace=True)
df_exped['agency_experience'] = df_exped.groupby('agency')['expid'].transform('cumcount')
df_exped.drop(['bcdate', 'smtdate', 'smttime', 'termdate', 'agency'], axis=1, inplace=True)

In [306]:
df_exped.year = df_exped.year.astype(int)
df_exped = df_exped.loc[df_exped.year >= 2000]

In [307]:
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,camps,rope,totmembers,smtmembers,tothired,smthired,nohired,agency_experience
3589,JANU00101,JANU,2000,1,1,2,0,3,2,0,0,True,0
3506,BARU00102,BARU,2000,1,1,2,0,5,5,4,4,False,15
3537,EVER00105,EVER,2000,1,1,4,0,9,3,6,0,False,5
3607,MANA00101,MANA,2000,1,1,3,400,4,3,0,0,True,10
3495,AMAD00111,AMAD,2000,1,1,2,0,6,0,0,0,True,289


### Data Merge

In [308]:
df = df_members.merge(df_exped,
                left_on=['expid', 'peakid', 'myear', 'mseason'],
                right_on=['expid', 'peakid', 'year', 'season'],
                how='inner')
df.drop(['myear', 'mseason'], axis=1, inplace=True)

In [309]:
df = df.merge(df_peaks, on=['peakid'], how='inner')

In [310]:
df.head()

Unnamed: 0,expid,membid,peakid,age,leader,deputy,msuccess,msolo,mperhighpt,gender_male,...,smtmembers,tothired,smthired,nohired,agency_experience,heightm,himal,region,pyear,pseason
0,HCHI00101,1,HCHI,63,True,False,False,False,6283,True,...,0,3,0,False,107,7029,12,2,2003,1
1,HCHI00101,2,HCHI,50,False,False,False,False,6283,True,...,0,3,0,False,107,7029,12,2,2003,1
2,HCHI00101,3,HCHI,56,False,False,False,False,6283,True,...,0,3,0,False,107,7029,12,2,2003,1
3,HCHI00101,4,HCHI,60,False,False,False,False,6283,True,...,0,3,0,False,107,7029,12,2,2003,1
4,DANG00101,1,DANG,65,True,False,False,False,6194,True,...,0,1,0,False,1331,6355,6,1,2002,3


In [311]:
df.drop(['expid', 'membid', 'peakid'], axis=1, inplace=True)

In [312]:
df.msuccess.value_counts()

msuccess
False    12300
True     10734
Name: count, dtype: int64

## Feature Preparation
### Validation Framework

In [313]:
def split_dataset(df: pd.DataFrame, target_var: str):
  df = df.copy()

  df_train_full, df_test = train_test_split(df.drop(target_var, axis=1), test_size=.2, random_state=42)
  df_train, df_val = train_test_split(df_train_full, test_size=.25, random_state=42)

  y_train = df.loc[df_train.index, target_var].reset_index(drop=True)
  y_val = df.loc[df_val.index, target_var].reset_index(drop=True)
  y_test = df.loc[df_test.index, target_var].reset_index(drop=True)

  df_train.reset_index(drop=True, inplace=True)
  df_val.reset_index(drop=True, inplace=True)
  df_test.reset_index(drop=True, inplace=True)

  return (df_train, y_train), (df_val, y_val), (df_test, y_test)

In [314]:
(df_train, y_train), (df_val, y_val), (df_test, y_test) = split_dataset(df, 'msuccess')

In [315]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(13820, 23)
(4607, 23)
(4607, 23)


### Variable Encoding

In [316]:
def encode_categorical_vars(df: pd.DataFrame, encoder: OneHotEncoder = None) -> pd.DataFrame:
  df = df.copy()

  df_categorical = df.select_dtypes(exclude='number')
  df_numerical = df.select_dtypes(include='number')

  if not encoder:
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64, handle_unknown='infrequent_if_exist')
    encoder.fit(df_categorical)

  df_categorical_encoded = pd.DataFrame(
    data=encoder.transform(df_categorical),
    columns=encoder.get_feature_names_out()
  )

  df_encoded = df_numerical.merge(df_categorical_encoded, left_index=True, right_index=True)

  return df_encoded, encoder

In [317]:
X_train, encoder = encode_categorical_vars(df_train)
X_val, _ = encode_categorical_vars(df_val, encoder)
X_test, _ = encode_categorical_vars(df_test, encoder)

In [318]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(13820, 119)
(4607, 119)
(4607, 119)
