In [372]:
import numpy as np
import pandas as pd
from dbfread import DBF

## Data Import

In [373]:
def read_dbf(filename: str) -> pd.DataFrame:
  return pd.DataFrame(DBF(filename))

In [374]:
# MEMBERS
df_members = read_dbf('./data/members.DBF')

# EXPEDITIONS
df_exped = read_dbf('./data/exped.DBF')

# PEAKS
df_peaks = read_dbf('./data/peaks.DBF')

## Data Cleaning

In [375]:
def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
  df.columns = df.columns.str.lower()

### Members

In [376]:
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [377]:
standardize_colnames(df_members)

In [378]:
column_list = df_members.columns.to_list()

In [379]:
column_list

['expid',
 'membid',
 'peakid',
 'myear',
 'mseason',
 'fname',
 'lname',
 'sex',
 'age',
 'birthdate',
 'yob',
 'calcage',
 'citizen',
 'status',
 'residence',
 'occupation',
 'leader',
 'deputy',
 'bconly',
 'nottobc',
 'support',
 'disabled',
 'hired',
 'sherpa',
 'tibetan',
 'msuccess',
 'mclaimed',
 'mdisputed',
 'msolo',
 'mtraverse',
 'mski',
 'mparapente',
 'mspeed',
 'mhighpt',
 'mperhighpt',
 'msmtdate1',
 'msmtdate2',
 'msmtdate3',
 'msmttime1',
 'msmttime2',
 'msmttime3',
 'mroute1',
 'mroute2',
 'mroute3',
 'mascent1',
 'mascent2',
 'mascent3',
 'mo2used',
 'mo2none',
 'mo2climb',
 'mo2descent',
 'mo2sleep',
 'mo2medical',
 'mo2note',
 'death',
 'deathdate',
 'deathtime',
 'deathtype',
 'deathhgtm',
 'deathclass',
 'ams',
 'weather',
 'injury',
 'injurydate',
 'injurytime',
 'injurytype',
 'injuryhgtm',
 'deathnote',
 'membermemo',
 'necrology',
 'msmtbid',
 'msmtterm',
 'hcn',
 'mchksum',
 'msmtnote1',
 'msmtnote2',
 'msmtnote3',
 'deathrte']

In [380]:
col_list = [
  # ID COLUMNS
  'expid',
  'membid',
  'peakid',
  # EXPEDITION START TIME
  'myear',
  'mseason',
  # MEMBER DETAILS
  # 'fname',
  # 'lname',
  'sex',
  # 'age',
  # 'birthdate',
  # 'yob',
  'calcage',
  # 'citizen',
  'status',
  # 'residence',
  'occupation',
  # MEMBER ROLE
  'leader',
  'deputy',
  'bconly',
  'nottobc',
  'support',
  'disabled',
  'hired',
  'sherpa',
  'tibetan',
  # EXPEDITION OUTCOME
  'msuccess',
  # 'mclaimed',
  # 'mdisputed',
  # EXPEDITION TYPE
  'msolo',
  'mtraverse',
  'mski',
  'mparapente',
  'mspeed',
  # HIGHPOINT
  # 'mhighpt', # expedition highpoint reached
  # 'mperhighpt', # personal highpoint
  # SUMMIT DATE/TIME
  # 'msmtdate1',
  # 'msmtdate2',
  # 'msmtdate3',
  # 'msmttime1',
  # 'msmttime2',
  # 'msmttime3',
  # ASCENT ROUTE
  # 'mroute1',
  # 'mroute2',
  # 'mroute3',
  # 'mascent1',
  # 'mascent2',
  # 'mascent3',
  # OXYGEN USE
  # 'mo2used',
  # 'mo2none',
  # 'mo2climb',
  # 'mo2descent',
  # 'mo2sleep',
  # 'mo2medical',
  # 'mo2note',
  # CLIMBER DEATH
  # 'death',
  # 'deathdate',
  # 'deathtime',
  # 'deathtype',
  # 'deathhgtm',
  # 'deathclass',
  # 'ams',
  # 'weather',
  # CLIMBER INJURY
  # 'injury',
  # 'injurydate',
  # 'injurytime',
  # 'injurytype',
  # 'injuryhgtm',
  # 'deathnote',
  # 'membermemo',
  # 'necrology',
  # SUMMIT BID
  'msmtbid',
  'msmtterm',
  # 'hcn',
  # 'mchksum',
  # 'msmtnote1',
  # 'msmtnote2',
  # 'msmtnote3',
  # 'deathrte'
]

In [381]:
df_members = df_members[col_list]

In [382]:
df_members.shape

(87156, 26)

In [383]:
df_members = df_members.loc[
  (df_members.bconly == False) &
  (df_members.nottobc == False) &
  (df_members.support == False) &
  (df_members.hired == False) &
  (df_members.sherpa == False) &
  (df_members.tibetan == False)
]

In [384]:
df_members.drop(['bconly', 'nottobc', 'support', 'disabled', 'hired', 'sherpa', 'tibetan'], axis=1, inplace=True)

In [385]:
df_members.shape

(60585, 19)

In [386]:
df_members = df_members.loc[
  (df_members.mtraverse == False) &
  (df_members.mski == False) &
  (df_members.mparapente == False) &
  (df_members.mspeed == False),
  :
]

In [387]:
df_members.drop(['mtraverse', 'mski', 'mparapente', 'mspeed'], axis=1, inplace=True)

In [388]:
df_members.shape

(25731, 15)

In [389]:
df_members.head()

Unnamed: 0,expid,membid,peakid,myear,mseason,sex,calcage,status,occupation,leader,deputy,msuccess,msolo,msmtbid,msmtterm
56,AMAD79303,3,AMAD,1979,3,M,35,Climber,Alpine guide,False,False,False,False,1,14
192,AMAD85101,2,AMAD,1985,1,M,20,Climber,"Student, commercial school",False,False,True,False,5,1
209,AMAD85301,7,AMAD,1985,3,M,28,Climber,Alpine guide,False,False,True,False,5,1
213,AMAD85303,2,AMAD,1985,3,M,41,Climber,Alpine guide,False,False,True,False,5,1
458,AMAD90301,6,AMAD,1990,3,F,39,Climber,Secretary,False,False,False,False,1,4


In [390]:
df_members.status = df_members.status.str.lower()

In [391]:
df_members = df_members.loc[
  (df_members.status.str.contains('climb')) |
  (df_members.status.str.contains('lead')),
  :
]

In [392]:
df_members = df_members.loc[
  (~df_members.status.str.contains('non-climber')) &
  (~df_members.status.str.contains('ski')) &
  (~df_members.status.str.contains('paraglider')) &
  (~df_members.status.str.contains('camera')) &
  (~df_members.status.str.contains('photo')) &
  (~df_members.status.str.contains('film')) &
  (~df_members.status.str.contains('reporter')) &
  (~df_members.status.str.contains('journalist')) &
  (~df_members.status.str.contains('coach')) &
  (~df_members.status.str.contains('advisor')) &
  (~df_members.status.str.contains('instructor')) &
  (~df_members.status.str.contains('support')) &
  (~df_members.status.str.contains('guide')) &
  (~df_members.status.str.contains('only'))
]

In [393]:
df_members.status.unique()

array(['climber', 'climbing leader', 'leader', 'dep climbing leader',
       'xclimber', 'leader (n)', 'climbing ldr (n)', 'climber (n)',
       'team leader (n)', 'leader (s)', 'climber (s)', 'team leader (s)',
       'climbing ldr (s)', 'army leader (s)', 'co-leader',
       'deputy ldr/climbing ldr', 'deputy leader', 'climber (s face)',
       'climber (torch)', 'climber (torchbear 4)',
       'climb ldr (torchbearer 2)', 'dep climbing ldr (s)',
       'climber/bc mgr', 'leader/exp doctor', 'climber (guest)',
       'climber (n col)', 'climber (trainee)', 'assistant leader',
       'leader (nominal)', 'leader?', 'climber (group b)',
       'co-leader (group b)', 'climber (group a)', 'co-leader (group a)',
       'lhotse climber', 'climbe', 'leader (nomimal)', 'climber (pvt)',
       'climber (lhotse)', 'climber (grp 1)', 'climber (grp 2)',
       'climber (mm)', 'climber (css)', 'deputry leader', 'climber (xfr)',
       'climbev', 'climber/camerman'], dtype=object)

In [394]:
df_members.drop(['status'], axis=1, inplace=True)

### Peaks

### Expeditions