In [121]:
import numpy as np
import pandas as pd
from dbfread import DBF

## Data Import

In [122]:
def read_dbf(filename: str) -> pd.DataFrame:
  """Open the DBF table at `filename` and return its contents as a DataFrame."""
  table = DBF(filename)
  return pd.DataFrame(table)

In [123]:
# Load the three raw tables: members, expeditions and peaks (read in that order).
df_members, df_exped, df_peaks = (
  read_dbf('./data/members.DBF'),
  read_dbf('./data/exped.DBF'),
  read_dbf('./data/peaks.DBF'),
)

## Data Cleaning

In [124]:
def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
  """Lower-case the column names of `df` in place and return the same frame.

  The frame is still modified in place, so callers that ignore the return
  value keep working. Returning the frame makes the function match its
  declared return type (it previously returned None) and allows chaining.

  Args:
    df: frame whose column labels are strings.

  Returns:
    The same `df` object, with lower-cased column names.
  """
  df.columns = df.columns.str.lower()
  return df

### Members

In [125]:
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [126]:
# lower-case the member column names in place; the trailing ';' keeps this cell's output empty
standardize_colnames(df_members);

In [127]:
# plain Python list of the (lower-cased) member column names
columns = df_members.columns.tolist()

In [128]:
# Columns kept from the members table, grouped by theme (order is preserved).
col_list = [
  # key columns
  'expid', 'membid', 'peakid',
  # expedition time
  'myear', 'mseason',
  # member personal details
  'fname', 'lname', 'sex', 'yob', 'calcage', 'status',
  # member role
  'leader', 'deputy', 'bconly', 'nottobc', 'support', 'disabled', 'hired', 'sherpa', 'tibetan',
  # expedition outcome
  'msuccess',
  # expedition type
  'msolo', 'mtraverse', 'mski', 'mparapente', 'mspeed',
  # summit bid details
  'mhighpt', 'mperhighpt', 'msmtdate1', 'msmttime1',
  # summit bid
  'msmtbid', 'msmtterm',
]
# Columns considered but currently not selected:
#   route/ascent:   mroute1, mroute2, mroute3, mascent1, mascent2, mascent3
#   oxygen use:     mo2used, mo2none, mo2climb, mo2descent, mo2sleep, mo2medical, mo2note
#   climber death:  death, deathtype, deathclass, ams, weather
#   climber injury: injury, injurytype

In [129]:
# keep only the selected columns and rename 'calcage' to 'age' in one chained step
df_members = df_members[col_list].rename(columns={'calcage': 'age'})

In [130]:
df_members.shape

(87156, 32)

In [131]:
# exclude members with support roles: keep a row only if every flag below is False
role_flags = ['bconly', 'nottobc', 'support', 'hired', 'sherpa', 'tibetan']
has_no_support_role = df_members[role_flags].eq(False).all(axis=1)

# 'disabled' is not part of the filter; it is only dropped together with the role flags
df_members = df_members.loc[has_no_support_role].drop(columns=role_flags + ['disabled'])

In [132]:
# exclude expeditions involving non-climbing activities:
# keep a row only if all four activity flags are False, then drop the flags
activity_flags = ['mtraverse', 'mski', 'mparapente', 'mspeed']
is_plain_climb = df_members[activity_flags].eq(False).all(axis=1)

df_members = df_members.loc[is_plain_climb].drop(columns=activity_flags)

In [133]:
# keep only members with leader/climber status (matching is done on the lower-cased status text)
status_lc = df_members['status'].str.lower()
is_climber = status_lc.str.contains('climb') | status_lc.str.contains('lead')
df_members = df_members.loc[is_climber]
status_lc = status_lc.loc[is_climber]

# among those, discard any status that mentions one of these terms
excluded_terms = (
  'non-climber', 'ski', 'paraglider', 'camera', 'photo', 'film', 'reporter',
  'journalist', 'coach', 'advisor', 'instructor', 'support', 'guide', 'torch', 'only',
)
has_excluded_term = pd.Series(False, index=status_lc.index)
for term in excluded_terms:
  has_excluded_term = has_excluded_term | status_lc.str.contains(term)

# the status column has served its purpose once the rows are filtered
df_members = df_members.loc[~has_excluded_term].drop(columns=['status'])

In [134]:
# keep only expeditions since 2000 (myear is cast to int so the comparison is numeric)
df_members = df_members.astype({'myear': int})
df_members = df_members.loc[df_members['myear'].ge(2000)]

In [135]:
df_members.head()

Unnamed: 0,expid,membid,peakid,myear,mseason,fname,lname,sex,yob,age,leader,deputy,msuccess,msolo,mhighpt,mperhighpt,msmtdate1,msmttime1,msmtbid,msmtterm
27102,AMAD00105,4,AMAD,2000,1,Armando,Rubiella,M,1961,38,False,False,False,False,False,0,,,1,4
27281,CHOY00107,3,CHOY,2000,1,Hossein,Amani Shahry,M,1974,26,False,False,False,False,False,0,,,0,0
27463,EVER00105,4,EVER,2000,1,Fernando Jose,Fernandez-Vivancos Fernandez,M,1971,28,False,False,False,False,False,8200,2000-05-20,,4,3
27666,EVER00135,3,EVER,2000,1,Juichi,Kobayashi,M,1954,45,False,False,True,False,True,8849,2000-05-19,925.0,5,1
27758,EVER00148,5,EVER,2000,1,Rosa Maria,Real Soriano,F,1963,36,False,False,False,False,True,8700,2000-05-20,,4,3


In [136]:
df_members.shape

(24711, 20)

In [137]:
df_members.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'fname', 'lname',
       'sex', 'yob', 'age', 'leader', 'deputy', 'msuccess', 'msolo', 'mhighpt',
       'mperhighpt', 'msmtdate1', 'msmttime1', 'msmtbid', 'msmtterm'],
      dtype='object')

#### Data Checks

In [138]:
# sanity check: each expedition/member combination appears at most once,
# i.e. the largest (expid, membid) group has exactly one row
df_members.groupby(['expid', 'membid']).size().max() == 1

True

In [139]:
# rows where the member success flag disagrees with the summit bid entry
# (msmtbid == 5 is compared against msuccess here; presumably 5 is the "successful bid" code — TODO confirm)
bid_is_5 = df_members['msmtbid'].eq(5)
df_members.loc[df_members['msuccess'].ne(bid_is_5)]

Unnamed: 0,expid,membid,peakid,myear,mseason,fname,lname,sex,yob,age,leader,deputy,msuccess,msolo,mhighpt,mperhighpt,msmtdate1,msmttime1,msmtbid,msmtterm
45648,MANA08105,01,MANA,2008,1,Michael (Mick),Parker,M,1973,35,True,False,False,False,True,8125,2008-05-17,1000,5,2
45660,MANA08108,02,MANA,2008,1,Hiroko,Hiruma,F,1944,63,False,False,False,False,True,8125,2008-05-21,0930,5,2
45661,MANA08108,01,MANA,2008,1,Hiroyuki,Kuraoka,M,1961,46,True,False,False,False,True,8125,2008-05-21,0930,5,2
45662,MANA08108,03,MANA,2008,1,Kiyomi,Takiguchi,F,1948,59,False,False,False,False,True,8125,2008-05-21,0930,5,2
45663,MANA08108,04,MANA,2008,1,Yukiko,Tanaka,F,1960,47,False,False,False,False,True,8125,2008-05-21,0930,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85141,LOBE23101,02,LOBE,2023,1,Eva,Milovska,F,1982,40,False,False,False,False,True,6090,2023-03-23,1400,5,2
87119,ROLK23301,01,ROLK,2023,3,Yuri,Koshelenko,M,1963,60,False,False,False,False,True,6645,2023-10-23,,5,2
87120,ROLK23301,02,ROLK,2023,3,Aleksei,Lonchinskiy,M,1982,41,False,False,False,False,True,6645,2023-10-23,,5,2
87140,TENR23301,02,TENR,2023,3,Marek,Disman,M,1983,40,False,False,False,False,True,6900,2023-10-31,,5,2


#### Feature Engineering

In [140]:
# compute a proxy for climber experience: for each row, the number of earlier rows
# (in chronological order) belonging to the same climber
# NOTE(review): this runs on the already-filtered frame (myear >= 2000, climbers/leaders only),
# so expeditions removed by the earlier filters are not counted — confirm that is intended
chronological_order = ['myear', 'mseason', 'msmtdate1', 'msmttime1']
climber_identity = ['fname', 'lname', 'sex', 'yob']

df_members = df_members.sort_values(chronological_order).reset_index(drop=True)
df_members['mexperience'] = df_members.groupby(climber_identity)['expid'].cumcount()

# identity and timing columns are only needed for the computation above ('sex' is kept)
df_members = df_members.drop(columns=['fname', 'lname', 'yob', 'msmtdate1', 'msmttime1'])

### Peaks

In [141]:
standardize_colnames(df_peaks)
df_peaks.head()

Unnamed: 0,peakid,pkname,pkname2,location,heightm,heightf,himal,region,open,unlisted,...,peakmemo,pyear,pseason,pexpid,psmtdate,pcountry,psummiters,psmtnote,refermemo,photomemo
0,AMAD,Ama Dablam,Amai Dablang,Khumbu Himal,6814,22356,12,2,True,False,...,"Other map altitudes:\r\n 6814m - HMG-MT, HMG...",1961,1,AMAD61101,Mar 13,"New Zealand, USA, UK","Mike Gill, Wally Romanes, Barry Bishop, Michae...",,,W Face (High 126:5 May 1993)\r\nSE Face (High ...
1,AMPG,Amphu Gyabjen,Amphu Gyabien,Khumbu Himal (N of Ama Dablam),5630,18471,12,2,True,False,...,"Other map altitudes:\r\n 5630m - HMG-Finn, N...",1953,1,AMPG53101,Apr 11,UK,"John Hunt, Tom Bourdillon",,,
2,ANN1,Annapurna I,,Annapurna Himal,8091,26545,1,5,True,False,...,"Other map altitudes:\r\n 8091m - HMG-MT, HMG...",1950,1,ANN150101,Jun 03,France,"Maurice Herzog, Louis Lachenal",,Dyhrenfurth history 1950-1977 (MM 58:44-47 Nov...,S Face (High 122:3 Jan 1993) (Beghin accident)...
3,ANN2,Annapurna II,,Annapurna Himal,7937,26040,1,5,True,False,...,"Other map altitudes:\r\n 7937m - HMG-MT, HMG...",1960,1,ANN260101,May 17,"UK, Nepal","Richard Grant, Chris Bonington, Ang Nyima Sherpa",,Dyhrenfurth history 1960-1976 (MM 51:36-37 Sep...,N Face (MM 51:36 Sep 1976)
4,ANN3,Annapurna III,,Annapurna Himal,7555,24787,1,5,True,False,...,"Other map altitudes:\r\n 7555m - HMG-MT, HMG...",1961,1,ANN361101,May 06,India,"Mohan S. Kohli, Sonam Gyatso, Sonam Girmi",,,S Side (MM 125:11 Jan 1989)\r\nSW Face (MM 71:...


In [None]:
# Columns kept from the peaks table.
col_list = [
  # ID COLUMN
  'peakid',
  # PEAK INFO
  'location',
  'heightm',
  'himal',
  'region',
  # PEAK STATUS
  'open',
  'unlisted',
  'trekking',
  # 'phost' must stay selected: the merged frame is later inspected with
  # df.phost.value_counts(), which fails on a fresh run if the column is dropped here
  'phost',
  'pstatus'
]

In [143]:
# restrict the peaks table to the selected columns
df_peaks = df_peaks.loc[:, col_list]

In [144]:
# keep peaks flagged open, not unlisted and not trekking; the three flags are then dropped
peak_is_eligible = (
  df_peaks['trekking'].eq(False)
  & df_peaks['open'].eq(True)
  & df_peaks['unlisted'].eq(False)
)
df_peaks = df_peaks.loc[peak_is_eligible].drop(columns=['trekking', 'open', 'unlisted'])

In [145]:
df_peaks.shape

(384, 7)

In [146]:
df_peaks.head()

Unnamed: 0,peakid,location,heightm,himal,region,phost,pstatus
0,AMAD,Khumbu Himal,6814,12,2,1,2
1,AMPG,Khumbu Himal (N of Ama Dablam),5630,12,2,1,2
2,ANN1,Annapurna Himal,8091,1,5,1,2
3,ANN2,Annapurna Himal,7937,1,5,1,2
4,ANN3,Annapurna Himal,7555,1,5,1,2


### Expeditions

In [147]:
standardize_colnames(df_exped)
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


In [148]:
# Columns kept from the expeditions table, grouped by theme (order is preserved).
col_list = [
  # id columns
  'expid', 'peakid',
  # expedition time and host
  'year', 'season', 'host',
  # expedition route
  'route1',
  # countries
  'countries',
  # ascent details
  'approach', 'termreason',
  # expedition type
  'traverse', 'ski', 'parapente',
  # equipment
  'camps', 'rope',
  # team size
  'totmembers', 'smtmembers', 'tothired', 'smthired', 'nohired',
  # misc
  'agency',
]
# Columns considered but currently not selected:
#   expedition info:    sponsor
#   expedition outcome: success1, success2, success3, success4, ascent1
#   deaths:             mdeaths, hdeaths
#   oxygen use:         o2used, o2none, o2climb, o2descent, o2sleep, o2medical, o2taken, o2unkwn
#   misc:               comrte, stdrte, primrte, primmem, primref

In [149]:
# restrict the expeditions table to the selected columns
df_exped = df_exped.loc[:, col_list]

In [150]:
# keep expeditions where none of the traverse / parapente / ski flags is set, then drop the flags
exped_type_flags = ['traverse', 'parapente', 'ski']
is_plain_expedition = df_exped[exped_type_flags].eq(False).all(axis=1)
df_exped = df_exped.loc[is_plain_expedition].drop(columns=exped_type_flags)

In [151]:
# cast the expedition year to int
df_exped = df_exped.astype({'year': int})

#### Feature Engineering

In [152]:
# TODO: compute agency experience

### Data Merge

In [153]:
# attach expedition-level columns to each member row; only rows with a matching expedition are kept
member_keys = ['expid', 'peakid', 'myear', 'mseason']
exped_keys = ['expid', 'peakid', 'year', 'season']
df = pd.merge(df_members, df_exped, how='inner', left_on=member_keys, right_on=exped_keys)

In [154]:
# attach peak-level columns; rows whose peak was filtered out above are dropped by the inner join
df = pd.merge(df, df_peaks, how='inner', on='peakid')

In [155]:
df.shape

(24048, 37)

In [156]:
df.isna().sum().sum()

0

In [157]:
df.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'sex', 'age', 'leader',
       'deputy', 'msuccess', 'msolo', 'mhighpt', 'mperhighpt', 'msmtbid',
       'msmtterm', 'mexperience', 'year', 'season', 'host', 'route1',
       'countries', 'approach', 'termreason', 'camps', 'rope', 'totmembers',
       'smtmembers', 'tothired', 'smthired', 'nohired', 'agency', 'location',
       'heightm', 'himal', 'region', 'phost', 'pstatus'],
      dtype='object')

In [158]:
# the identifier columns were only needed for the joins
df = df.drop(columns=['expid', 'membid', 'peakid'])

In [159]:
df.host.value_counts()

host
1    20729
2     3306
0        8
3        5
Name: count, dtype: int64

In [161]:
df.phost.value_counts()

phost
1    13206
4    10465
5      377
Name: count, dtype: int64