In [258]:
import numpy as np
import pandas as pd
from dbfread import DBF

## Data Import

In [259]:
def read_dbf(filename: str) -> pd.DataFrame:
  """Load a dBase (.DBF) table into a DataFrame.

  Args:
    filename: Path of the DBF file to open.

  Returns:
    A DataFrame constructed from the ``dbfread.DBF`` table object.
  """
  table = DBF(filename)
  return pd.DataFrame(table)

In [260]:
# Read the three source tables, in the same order as before:
# members, then expeditions, then peaks.
df_members, df_exped, df_peaks = (
  read_dbf('./data/members.DBF'),  # MEMBERS
  read_dbf('./data/exped.DBF'),    # EXPEDITIONS
  read_dbf('./data/peaks.DBF'),    # PEAKS
)

## Data Cleaning

In [261]:
def standardize_colnames(df: pd.DataFrame) -> None:
  """Lower-case the string column labels of ``df`` in place.

  Labels that are not strings (for example integer labels) are left
  untouched; with ``.str.lower()`` they would become NaN or raise on frames
  whose labels are not strings at all.

  Args:
    df: Frame whose ``columns`` index is rewritten in place.

  Returns:
    None. The frame is mutated; the notebook never assigns the result of
    this call, so the annotation now states what the function returns.
  """
  df.columns = df.columns.map(
    lambda label: label.lower() if isinstance(label, str) else label
  )

### Members

In [262]:
# Preview the first rows of the raw members table (column labels are still upper-case here).
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [263]:
# Lower-case the column labels of df_members in place; the return value is not used.
standardize_colnames(df_members)

In [264]:
# Current member column labels as a plain Python list.
columns = df_members.columns.tolist()

In [265]:
# Member columns to keep, grouped by theme. Names that appear only inside
# comments are columns that are not selected.
member_col_groups = {
  'key columns': ['expid', 'membid', 'peakid'],
  'expedition time': ['myear', 'mseason'],
  'member personal details': [
    # 'fname', 'lname',
    'sex',
    # 'age', 'birthdate', 'yob',
    'calcage',
    # 'citizen',
    'status',
    # 'residence', 'occupation',
  ],
  'member role': [
    'leader', 'deputy', 'bconly', 'nottobc', 'support',
    'disabled', 'hired', 'sherpa', 'tibetan',
  ],
  'expedition outcome': [
    'msuccess',
    # 'mclaimed', 'mdisputed',
  ],
  'expedition type': ['msolo', 'mtraverse', 'mski', 'mparapente', 'mspeed'],
  'summit bid details': [
    'mhighpt', 'mperhighpt',
    # 'msmtdate1', 'msmtdate2', 'msmtdate3',
    # 'msmttime1', 'msmttime2', 'msmttime3',
  ],
  'expedition route/ascent': [
    'mroute1', 'mroute2', 'mroute3',
    'mascent1', 'mascent2', 'mascent3',
  ],
  'oxygen use': [
    'mo2used', 'mo2none', 'mo2climb', 'mo2descent',
    'mo2sleep', 'mo2medical', 'mo2note',
  ],
  'climber death': [
    'death',
    # 'deathdate', 'deathtime',
    'deathtype',
    # 'deathhgtm',
    'deathclass', 'ams', 'weather',
  ],
  'climber injury': [
    'injury',
    # 'injurydate', 'injurytime',
    'injurytype',
    # 'injuryhgtm',
  ],
  'summit bid': ['msmtbid', 'msmtterm'],
}

# Flatten in declaration order (dicts preserve insertion order).
col_list = [name for group in member_col_groups.values() for name in group]

In [266]:
# Keep only the selected columns and rename 'calcage' to 'mage' in one chained
# expression. This builds a new frame instead of renaming in place a frame
# that was just sliced out of the raw table.
df_members = df_members[col_list].rename(columns={'calcage': 'mage'})

In [267]:
# (rows, columns) after column selection, before any row filtering.
df_members.shape

(87156, 47)

In [268]:
# Keep only rows where every one of these role flags is False.
# eq(False) behaves like `== False`: a missing flag compares as not-False,
# so such rows are excluded exactly as before.
role_flags = ['bconly', 'nottobc', 'support', 'hired', 'sherpa', 'tibetan']
all_role_flags_false = df_members[role_flags].eq(False).all(axis=1)

# 'disabled' is removed together with the role flags but, as in the original
# cell, is not used as a filter.
df_members = (
  df_members
  .loc[all_role_flags_false]
  .drop(columns=role_flags + ['disabled'])
)

In [269]:
# Keep only rows where every one of these flags is False, then drop the flag
# columns, which are constant after the filter. The drop is chained so no
# filtered slice is modified in place.
style_flags = ['mtraverse', 'mski', 'mparapente', 'mspeed']
all_style_flags_false = df_members[style_flags].eq(False).all(axis=1)

df_members = (
  df_members
  .loc[all_style_flags_false]
  .drop(columns=style_flags)
)

In [270]:
# Lower-case the free-text status once, as a local Series. The column is
# dropped at the end, so the frame itself does not need to be modified.
status_lower = df_members['status'].str.lower()

# A row is kept when its status contains at least one of these terms ...
keep_terms = ['climb', 'lead']
# ... and none of these.
exclude_terms = [
  'non-climber', 'ski', 'paraglider', 'camera', 'photo', 'film',
  'reporter', 'journalist', 'coach', 'advisor', 'instructor',
  'support', 'guide', 'torch', 'only',
]

# None of the terms contains a regex metacharacter, so joining them with '|'
# yields a pattern that matches exactly when any single term matches.
# na=False treats a missing status as "no match" rather than putting NaN into
# the boolean mask (which .loc and ~ cannot handle).
is_kept = status_lower.str.contains('|'.join(keep_terms), na=False)
is_excluded = status_lower.str.contains('|'.join(exclude_terms), na=False)

df_members = df_members.loc[is_kept & ~is_excluded].drop(columns=['status'])

In [271]:
# Preview the members table after the column selection and row filters above.
df_members.head()

Unnamed: 0,expid,membid,peakid,myear,mseason,sex,mage,leader,deputy,msuccess,...,mo2note,death,deathtype,deathclass,ams,weather,injury,injurytype,msmtbid,msmtterm
56,AMAD79303,3,AMAD,1979,3,M,35,False,False,False,...,,False,0,0,False,False,False,0,1,14
192,AMAD85101,2,AMAD,1985,1,M,20,False,False,True,...,,False,0,0,False,False,False,0,5,1
209,AMAD85301,7,AMAD,1985,3,M,28,False,False,True,...,,False,0,0,False,False,False,0,5,1
213,AMAD85303,2,AMAD,1985,3,M,41,False,False,True,...,,False,0,0,False,False,False,0,5,1
458,AMAD90301,6,AMAD,1990,3,F,39,False,False,False,...,,False,0,0,False,False,False,0,1,4


In [272]:
# (rows, columns) remaining after all member filters.
df_members.shape

(25279, 35)

In [273]:
# Column labels that remain after filtering and dropping the flag/status columns.
df_members.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'sex', 'mage',
       'leader', 'deputy', 'msuccess', 'msolo', 'mhighpt', 'mperhighpt',
       'mroute1', 'mroute2', 'mroute3', 'mascent1', 'mascent2', 'mascent3',
       'mo2used', 'mo2none', 'mo2climb', 'mo2descent', 'mo2sleep',
       'mo2medical', 'mo2note', 'death', 'deathtype', 'deathclass', 'ams',
       'weather', 'injury', 'injurytype', 'msmtbid', 'msmtterm'],
      dtype='object')

#### Data Checks

In [276]:
# Key check: True when no (expid, membid) pair occurs more than once.
rows_per_member_key = df_members.groupby(['expid', 'membid']).size()
rows_per_member_key.max() == 1

True

In [280]:
# Count the rows where msuccess agrees with the condition "msmtbid equals 5".
bid_is_five = df_members.msmtbid == 5
(df_members.msuccess == bid_is_five).sum()

24984

In [282]:
# Rows where msuccess agrees with the condition "msmtbid equals 5".
# NOTE(review): this selects the agreeing rows; if the aim is to inspect the
# rows that disagree, the comparison would need to be != — confirm intent.
success_matches_bid = df_members.msuccess == (df_members.msmtbid == 5)
df_members.loc[success_matches_bid]

Unnamed: 0,expid,membid,peakid,myear,mseason,sex,mage,leader,deputy,msuccess,...,mo2note,death,deathtype,deathclass,ams,weather,injury,injurytype,msmtbid,msmtterm
56,AMAD79303,03,AMAD,1979,3,M,35,False,False,False,...,,False,0,0,False,False,False,0,1,14
192,AMAD85101,02,AMAD,1985,1,M,20,False,False,True,...,,False,0,0,False,False,False,0,5,1
209,AMAD85301,07,AMAD,1985,3,M,28,False,False,True,...,,False,0,0,False,False,False,0,5,1
213,AMAD85303,02,AMAD,1985,3,M,41,False,False,True,...,,False,0,0,False,False,False,0,5,1
458,AMAD90301,06,AMAD,1990,3,F,39,False,False,False,...,,False,0,0,False,False,False,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87150,TUKU23301,05,TUKU,2023,3,F,47,False,False,False,...,,False,0,0,False,False,False,0,1,3
87151,TUKU23301,06,TUKU,2023,3,M,65,False,False,False,...,,False,0,0,False,False,False,0,1,3
87152,TUKU23301,07,TUKU,2023,3,M,31,False,False,False,...,,False,0,0,False,False,False,0,1,3
87153,TUKU23301,08,TUKU,2023,3,M,64,False,False,False,...,,False,0,0,False,False,False,0,1,3


### Peaks

In [154]:
# (rows, columns) of the raw peaks table.
df_peaks.shape

(479, 25)

In [155]:
# Lower-case the column labels of df_peaks in place; the return value is not used.
standardize_colnames(df_peaks)

In [None]:
# Peak column labels as a plain Python list, shown as the cell output.
columns = df_peaks.columns.tolist()
columns

In [156]:
# Peak columns to keep, concatenated group by group in the original order.
col_list = (
  ['peakid']                                                # ID COLUMN
  + ['location', 'heightm', 'himal', 'region']              # PEAK INFO
  + ['open', 'unlisted', 'trekking', 'phost', 'pstatus']    # PEAK STATUS
)

In [157]:
# Restrict the peaks table to the selected columns (all rows, listed labels only).
df_peaks = df_peaks.loc[:, col_list]

In [158]:
# Keep rows whose 'trekking' flag is False, then drop the column, which is
# constant after the filter. Chaining the drop avoids modifying the filtered
# slice in place.
df_peaks = (
  df_peaks
  .loc[df_peaks['trekking'].eq(False)]
  .drop(columns='trekking')
)

In [159]:
# (rows, columns) after column selection and the trekking filter.
df_peaks.shape

(448, 9)

In [160]:
# Preview the filtered peaks table.
df_peaks.head()

Unnamed: 0,peakid,location,heightm,himal,region,open,unlisted,phost,pstatus
0,AMAD,Khumbu Himal,6814,12,2,True,False,1,2
1,AMPG,Khumbu Himal (N of Ama Dablam),5630,12,2,True,False,1,2
2,ANN1,Annapurna Himal,8091,1,5,True,False,1,2
3,ANN2,Annapurna Himal,7937,1,5,True,False,1,2
4,ANN3,Annapurna Himal,7555,1,5,True,False,1,2


### Expeditions

In [161]:
# Lower-case the column labels of df_exped in place; the return value is not used.
standardize_colnames(df_exped)

In [162]:
# Preview the first rows of the expeditions table after lower-casing its column labels.
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


In [163]:
# Expedition column labels as a plain Python list.
columns = df_exped.columns.tolist()

In [164]:
# Display the full list of expedition column labels.
columns

['expid',
 'peakid',
 'year',
 'season',
 'host',
 'route1',
 'route2',
 'route3',
 'route4',
 'nation',
 'leaders',
 'sponsor',
 'success1',
 'success2',
 'success3',
 'success4',
 'ascent1',
 'ascent2',
 'ascent3',
 'ascent4',
 'claimed',
 'disputed',
 'countries',
 'approach',
 'bcdate',
 'smtdate',
 'smttime',
 'smtdays',
 'totdays',
 'termdate',
 'termreason',
 'termnote',
 'highpoint',
 'traverse',
 'ski',
 'parapente',
 'camps',
 'rope',
 'totmembers',
 'smtmembers',
 'mdeaths',
 'tothired',
 'smthired',
 'hdeaths',
 'nohired',
 'o2used',
 'o2none',
 'o2climb',
 'o2descent',
 'o2sleep',
 'o2medical',
 'o2taken',
 'o2unkwn',
 'othersmts',
 'campsites',
 'routememo',
 'accidents',
 'achievment',
 'agency',
 'comrte',
 'stdrte',
 'primrte',
 'primmem',
 'primref',
 'primid',
 'chksum']

In [165]:
# Expedition columns, grouped by theme and listed in the table's own column
# order. Every column of df_exped appears here; nothing is excluded.
exped_col_groups = {
  'id columns': ['expid', 'peakid'],
  'expedition time & place': ['year', 'season', 'host'],
  'expedition route': ['route1', 'route2', 'route3', 'route4'],
  'expedition team': ['nation', 'leaders', 'sponsor'],
  'expedition outcome': [
    'success1', 'success2', 'success3', 'success4',
    'ascent1', 'ascent2', 'ascent3', 'ascent4',
    'claimed', 'disputed',
  ],
  'countries & approach': ['countries', 'approach'],
  'key dates': [
    'bcdate', 'smtdate', 'smttime', 'smtdays', 'totdays', 'termdate',
  ],
  'termination': ['termreason', 'termnote', 'highpoint'],
  'expedition type': ['traverse', 'ski', 'parapente'],
  'equipment': ['camps', 'rope'],
  'team size': [
    'totmembers', 'smtmembers', 'mdeaths',
    'tothired', 'smthired', 'hdeaths', 'nohired',
  ],
  'oxygen use': [
    'o2used', 'o2none', 'o2climb', 'o2descent',
    'o2sleep', 'o2medical', 'o2taken', 'o2unkwn',
  ],
  'not yet grouped': [
    'othersmts', 'campsites', 'routememo', 'accidents', 'achievment',
    'agency', 'comrte', 'stdrte', 'primrte', 'primmem', 'primref',
    'primid', 'chksum',
  ],
}

# Flatten in declaration order (dicts preserve insertion order).
col_list = [name for group in exped_col_groups.values() for name in group]

In [166]:
# Number of column names listed for the expeditions table.
len(col_list)

66

In [167]:
# Number of columns actually present in df_exped, for comparison with len(col_list).
df_exped.shape[1]

66