In [125]:
import numpy as np
import pandas as pd
from dbfread import DBF

## Data Import

In [126]:
def read_dbf(filename: str) -> pd.DataFrame:
  """Load a dBase (.DBF) table into a pandas DataFrame.

  Args:
    filename: Path of the .DBF file to read.

  Returns:
    A DataFrame with one row per record yielded by ``DBF`` and one column
    per field declared in the file, in declaration order.
  """
  table = DBF(filename)
  # Pass the declared field names explicitly: a table with zero records would
  # otherwise produce a frame without any columns, and later column selections
  # would fail with a KeyError.
  return pd.DataFrame(iter(table), columns=table.field_names)

In [127]:
# MEMBERS
df_members = read_dbf('./data/members.DBF')

# EXPEDITIONS
df_exped = read_dbf('./data/exped.DBF')

# PEAKS
df_peaks = read_dbf('./data/peaks.DBF')

## Data Cleaning
### Members

In [128]:
df_members.shape

(87156, 78)

In [129]:
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [130]:
# Lower-case every column name so later cells can use lowercase identifiers.
df_members = df_members.rename(columns=str.lower)

In [131]:
# Print the full column listing, one name per line (the default frame repr
# elides the middle columns).
for column_name in df_members.columns.tolist():
  print(column_name)

expid
membid
peakid
myear
mseason
fname
lname
sex
age
birthdate
yob
calcage
citizen
status
residence
occupation
leader
deputy
bconly
nottobc
support
disabled
hired
sherpa
tibetan
msuccess
mclaimed
mdisputed
msolo
mtraverse
mski
mparapente
mspeed
mhighpt
mperhighpt
msmtdate1
msmtdate2
msmtdate3
msmttime1
msmttime2
msmttime3
mroute1
mroute2
mroute3
mascent1
mascent2
mascent3
mo2used
mo2none
mo2climb
mo2descent
mo2sleep
mo2medical
mo2note
death
deathdate
deathtime
deathtype
deathhgtm
deathclass
ams
weather
injury
injurydate
injurytime
injurytype
injuryhgtm
deathnote
membermemo
necrology
msmtbid
msmtterm
hcn
mchksum
msmtnote1
msmtnote2
msmtnote3
deathrte


In [132]:
# Columns of the members table to keep, grouped by theme.
# NOTE: `col_list` is reassigned further down for the peaks and expeditions
# tables, so the selection cell for members must run right after this one.
col_list = [
  # key columns
  'expid', 'membid', 'peakid',
  # expedition time
  'myear', 'mseason',
  # member info
  'sex', 'calcage', 'status', 'occupation', 'leader', 'deputy',
  'bconly', 'nottobc', 'support', 'disabled', 'hired', 'sherpa', 'tibetan',
  # expedition outcome
  'msuccess', 'mclaimed', 'mdisputed',
  # expedition info
  'msolo', 'mtraverse', 'mski', 'mparapente', 'mspeed',
  # expedition route
  'mroute1', 'mroute2', 'mroute3',
  'mascent1', 'mascent2', 'mascent3',
  # oxygen use
  'mo2used', 'mo2none', 'mo2climb', 'mo2descent',
  'mo2sleep', 'mo2medical', 'mo2note',
  # notes
  'msmtnote1', 'msmtnote2', 'msmtnote3',
]

In [133]:
# Keep all rows, but only the columns named in col_list (in that order).
df_members = df_members.loc[:, col_list]

In [134]:
df_members.shape

(87156, 42)

In [135]:
# Keep only the rows in which every one of these flag columns equals False.
# The comparison is done with .eq(False) rather than negation so that a
# missing flag value counts as "not False" and the row is excluded.
df_members = df_members.loc[
  df_members[[
    'bconly', 'nottobc', 'support', 'disabled', 'hired',
    'mtraverse', 'mski', 'mparapente', 'tibetan', 'sherpa',
  ]].eq(False).all(axis=1),
  :,
]

In [136]:
# These flag columns were used as the row filter just above, so every
# remaining row holds False in each of them; they carry no information now.
# Rebind the result instead of using inplace=True: the frame comes from a
# .loc row selection, and mutating such a frame in place is discouraged
# (it brings no speed-up and can raise SettingWithCopyWarning).
df_members = df_members.drop(
  columns=[
    'bconly', 'nottobc', 'support', 'disabled', 'hired',
    'mtraverse', 'mski', 'mparapente', 'tibetan', 'sherpa',
  ],
)

In [137]:
# Lower-case the free-text status values so differently-cased spellings of
# the same status compare equal.
df_members['status'] = df_members['status'].str.lower()

In [138]:
df_members.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'sex', 'calcage',
       'status', 'occupation', 'leader', 'deputy', 'msuccess', 'mclaimed',
       'mdisputed', 'msolo', 'mspeed', 'mroute1', 'mroute2', 'mroute3',
       'mascent1', 'mascent2', 'mascent3', 'mo2used', 'mo2none', 'mo2climb',
       'mo2descent', 'mo2sleep', 'mo2medical', 'mo2note', 'msmtnote1',
       'msmtnote2', 'msmtnote3'],
      dtype='object')

In [139]:
df_members.msuccess.value_counts()

msuccess
False    38557
True     21184
Name: count, dtype: int64

### Peaks

In [140]:
df_peaks.shape

(479, 25)

In [141]:
df_peaks.head()

Unnamed: 0,PEAKID,PKNAME,PKNAME2,LOCATION,HEIGHTM,HEIGHTF,HIMAL,REGION,OPEN,UNLISTED,...,PEAKMEMO,PYEAR,PSEASON,PEXPID,PSMTDATE,PCOUNTRY,PSUMMITERS,PSMTNOTE,REFERMEMO,PHOTOMEMO
0,AMAD,Ama Dablam,Amai Dablang,Khumbu Himal,6814,22356,12,2,True,False,...,"Other map altitudes:\r\n 6814m - HMG-MT, HMG...",1961,1,AMAD61101,Mar 13,"New Zealand, USA, UK","Mike Gill, Wally Romanes, Barry Bishop, Michae...",,,W Face (High 126:5 May 1993)\r\nSE Face (High ...
1,AMPG,Amphu Gyabjen,Amphu Gyabien,Khumbu Himal (N of Ama Dablam),5630,18471,12,2,True,False,...,"Other map altitudes:\r\n 5630m - HMG-Finn, N...",1953,1,AMPG53101,Apr 11,UK,"John Hunt, Tom Bourdillon",,,
2,ANN1,Annapurna I,,Annapurna Himal,8091,26545,1,5,True,False,...,"Other map altitudes:\r\n 8091m - HMG-MT, HMG...",1950,1,ANN150101,Jun 03,France,"Maurice Herzog, Louis Lachenal",,Dyhrenfurth history 1950-1977 (MM 58:44-47 Nov...,S Face (High 122:3 Jan 1993) (Beghin accident)...
3,ANN2,Annapurna II,,Annapurna Himal,7937,26040,1,5,True,False,...,"Other map altitudes:\r\n 7937m - HMG-MT, HMG...",1960,1,ANN260101,May 17,"UK, Nepal","Richard Grant, Chris Bonington, Ang Nyima Sherpa",,Dyhrenfurth history 1960-1976 (MM 51:36-37 Sep...,N Face (MM 51:36 Sep 1976)
4,ANN3,Annapurna III,,Annapurna Himal,7555,24787,1,5,True,False,...,"Other map altitudes:\r\n 7555m - HMG-MT, HMG...",1961,1,ANN361101,May 06,India,"Mohan S. Kohli, Sonam Gyatso, Sonam Girmi",,,S Side (MM 125:11 Jan 1989)\r\nSW Face (MM 71:...


In [142]:
# Lower-case every column name of the peaks table.
df_peaks = df_peaks.rename(columns=str.lower)

In [143]:
df_peaks.columns

Index(['peakid', 'pkname', 'pkname2', 'location', 'heightm', 'heightf',
       'himal', 'region', 'open', 'unlisted', 'trekking', 'trekyear',
       'restrict', 'phost', 'pstatus', 'peakmemo', 'pyear', 'pseason',
       'pexpid', 'psmtdate', 'pcountry', 'psummiters', 'psmtnote', 'refermemo',
       'photomemo'],
      dtype='object')

In [144]:
# Columns of the peaks table to keep.
# NOTE: this rebinds `col_list`, which held the members selection before.
col_list = [
  'peakid', 'location', 'heightm', 'himal', 'region',
  'unlisted', 'trekking', 'phost', 'pstatus',
]

In [145]:
# Keep all rows, but only the columns named in col_list (in that order).
df_peaks = df_peaks.loc[:, col_list]

In [146]:
df_peaks.shape

(479, 9)

In [147]:
df_peaks.head()

Unnamed: 0,peakid,location,heightm,himal,region,unlisted,trekking,phost,pstatus
0,AMAD,Khumbu Himal,6814,12,2,False,False,1,2
1,AMPG,Khumbu Himal (N of Ama Dablam),5630,12,2,False,False,1,2
2,ANN1,Annapurna Himal,8091,1,5,False,False,1,2
3,ANN2,Annapurna Himal,7937,1,5,False,False,1,2
4,ANN3,Annapurna Himal,7555,1,5,False,False,1,2


In [148]:
df_peaks.region.value_counts()

region
2    132
7     87
5     74
1     69
4     41
6     40
3     36
Name: count, dtype: int64

### Expeditions

In [25]:
df_exped.head()

Unnamed: 0,EXPID,PEAKID,YEAR,SEASON,HOST,ROUTE1,ROUTE2,ROUTE3,ROUTE4,NATION,...,ACCIDENTS,ACHIEVMENT,AGENCY,COMRTE,STDRTE,PRIMRTE,PRIMMEM,PRIMREF,PRIMID,CHKSUM
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


In [26]:
df_exped.shape

(11323, 66)

In [27]:
# Lower-case every column name of the expeditions table.
df_exped = df_exped.rename(columns=str.lower)

In [None]:
# Columns of the expeditions table to keep.
# NOTE: this rebinds `col_list`, which held the peaks selection before.
# NOTE(review): the list has 45 entries, yet the shape shown after the three
# traverse/ski/parapente columns are dropped reports 46 columns, and this cell
# carries no execution count. The saved outputs appear to predate this list;
# restart the kernel and run all cells to refresh them.
col_list = [
  # key columns
  'expid', 'peakid', 'year', 'season', 'host',
  'route1', 'route2', 'route3', 'route4',
  'nation', 'leaders', 'sponsor',
  'success1', 'success2', 'success3', 'success4',
  'claimed', 'disputed',
  'countries', 'approach', 'highpoint',
  'traverse', 'ski', 'parapente',
  'camps', 'rope',
  'totmembers', 'smtmembers', 'tothired', 'smthired', 'nohired',
  'o2used', 'o2none', 'o2climb', 'o2descent',
  'o2sleep', 'o2medical', 'o2taken', 'o2unkwn',
  'othersmts', 'campsites', 'routememo',
  'agency', 'comrte', 'stdrte',
]

In [30]:
# Keep all rows, but only the columns named in col_list (in that order).
df_exped = df_exped.loc[:, col_list]

In [31]:
# Keep only the expeditions whose traverse, ski and parapente flags all equal
# False; a missing flag value does not equal False, so such rows are excluded.
df_exped = df_exped.loc[
  df_exped[['traverse', 'ski', 'parapente']].eq(False).all(axis=1)
]

In [32]:
# The three flags were used as the row filter just above, so they are False in
# every remaining row and can go. Rebind the result instead of using
# inplace=True: the frame comes from a .loc row selection, and mutating such a
# frame in place is discouraged (no speed-up, possible SettingWithCopyWarning).
df_exped = df_exped.drop(columns=['traverse', 'ski', 'parapente'])

In [33]:
df_exped.shape

(10997, 46)