In [482]:
import numpy as np
import pandas as pd
from dbfread import DBF

## Data Import

In [483]:
def read_dbf(filename: str) -> pd.DataFrame:
  return pd.DataFrame(DBF(filename))

In [484]:
# MEMBERS
df_members = read_dbf('./data/members.DBF')

# EXPEDITIONS
df_exped = read_dbf('./data/exped.DBF')

# PEAKS
df_peaks = read_dbf('./data/peaks.DBF')

## Data Cleaning

In [485]:
def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
  df.columns = df.columns.str.lower()

### Members

In [486]:
df_members.head()

Unnamed: 0,EXPID,MEMBID,PEAKID,MYEAR,MSEASON,FNAME,LNAME,SEX,AGE,BIRTHDATE,...,MEMBERMEMO,NECROLOGY,MSMTBID,MSMTTERM,HCN,MCHKSUM,MSMTNOTE1,MSMTNOTE2,MSMTNOTE3,DEATHRTE
0,AMAD78301,1,AMAD,1978,3,Jean Robert,Clemenson,M,0,,...,,,1,4,0,2426937,,,,
1,AMAD78301,2,AMAD,1978,3,Bernard,Dufour,M,0,,...,,,1,4,0,2426501,,,,
2,AMAD78301,3,AMAD,1978,3,Philippe,Gerard,M,0,,...,,,1,4,0,2431569,,,,
3,AMAD78301,4,AMAD,1978,3,Eric,Lasserre,M,0,,...,,,1,4,0,2426809,,,,
4,AMAD78301,5,AMAD,1978,3,Guy,Peters,M,0,,...,,,1,4,0,2429215,,,,


In [487]:
standardize_colnames(df_members)

In [488]:
col_list = [
  # ID COLUMNS
  'expid',
  'membid',
  'peakid',
  # EXPEDITION START TIME
  'myear',
  'mseason',
  # MEMBER DETAILS
  'sex',
  'calcage',
  'status',
  'occupation',
  # MEMBER ROLE
  'leader',
  'deputy',
  'bconly',
  'nottobc',
  'support',
  'disabled',
  'hired',
  'sherpa',
  'tibetan',
  # EXPEDITION OUTCOME
  'msuccess',
  # EXPEDITION TYPE
  'msolo',
  'mtraverse',
  'mski',
  'mparapente',
  'mspeed',
  # SUMMIT BID
  'msmtbid',
  'msmtterm'
]

In [489]:
df_members = df_members[col_list]

In [490]:
df_members.shape

(87156, 26)

In [491]:
df_members = df_members.loc[
  (df_members.bconly == False) &
  (df_members.nottobc == False) &
  (df_members.support == False) &
  (df_members.hired == False) &
  (df_members.sherpa == False) &
  (df_members.tibetan == False)
]

In [492]:
df_members.drop(['bconly', 'nottobc', 'support', 'disabled', 'hired', 'sherpa', 'tibetan'], axis=1, inplace=True)

In [493]:
df_members.shape

(60585, 19)

In [494]:
df_members = df_members.loc[
  (df_members.mtraverse == False) &
  (df_members.mski == False) &
  (df_members.mparapente == False) &
  (df_members.mspeed == False),
  :
]

In [495]:
df_members.drop(['mtraverse', 'mski', 'mparapente', 'mspeed'], axis=1, inplace=True)

In [496]:
df_members.status = df_members.status.str.lower()

In [497]:
df_members = df_members.loc[
  (df_members.status.str.contains('climb')) |
  (df_members.status.str.contains('lead')),
  :
]

In [498]:
df_members = df_members.loc[
  (~df_members.status.str.contains('non-climber')) &
  (~df_members.status.str.contains('ski')) &
  (~df_members.status.str.contains('paraglider')) &
  (~df_members.status.str.contains('camera')) &
  (~df_members.status.str.contains('photo')) &
  (~df_members.status.str.contains('film')) &
  (~df_members.status.str.contains('reporter')) &
  (~df_members.status.str.contains('journalist')) &
  (~df_members.status.str.contains('coach')) &
  (~df_members.status.str.contains('advisor')) &
  (~df_members.status.str.contains('instructor')) &
  (~df_members.status.str.contains('support')) &
  (~df_members.status.str.contains('guide')) &
  (~df_members.status.str.contains('torch')) &
  (~df_members.status.str.contains('only'))
]

In [499]:
df_members.status.unique()

array(['climber', 'climbing leader', 'leader', 'dep climbing leader',
       'xclimber', 'leader (n)', 'climbing ldr (n)', 'climber (n)',
       'team leader (n)', 'leader (s)', 'climber (s)', 'team leader (s)',
       'climbing ldr (s)', 'army leader (s)', 'co-leader',
       'deputy ldr/climbing ldr', 'deputy leader', 'climber (s face)',
       'dep climbing ldr (s)', 'climber/bc mgr', 'leader/exp doctor',
       'climber (guest)', 'climber (n col)', 'climber (trainee)',
       'assistant leader', 'leader (nominal)', 'leader?',
       'climber (group b)', 'co-leader (group b)', 'climber (group a)',
       'co-leader (group a)', 'lhotse climber', 'climbe',
       'leader (nomimal)', 'climber (pvt)', 'climber (lhotse)',
       'climber (grp 1)', 'climber (grp 2)', 'climber (mm)',
       'climber (css)', 'deputry leader', 'climber (xfr)', 'climbev',
       'climber/camerman'], dtype=object)

In [500]:
df_members.drop(['status'], axis=1, inplace=True)

### Peaks

In [501]:
df_peaks.shape

(479, 25)

In [502]:
standardize_colnames(df_peaks)

In [None]:
col_list = [
  # ID COLUMN
  'peakid',
  # PEAK INFO
  # 'pkname',
  # 'pkname2',
  'location',
  'heightm',
  # 'heightf',
  'himal',
  'region',
  # PEAK STATUS
  'open',
  'unlisted',
  'trekking',
  # 'trekyear',
  # 'restrict',
  'phost',
  'pstatus',
  # 'peakmemo',
  # 'pyear',
  # 'pseason',
  # 'pexpid',
  # 'psmtdate',
  # 'pcountry',
  # 'psummiters',
  # 'psmtnote',
  # 'refermemo',
  # 'photomemo'
]

In [504]:
df_peaks = df_peaks[col_list]

In [505]:
df_peaks = df_peaks.loc[df_peaks.trekking == False]
df_peaks.drop('trekking', axis=1, inplace=True)

In [506]:
df_peaks.shape

(448, 11)

In [507]:
df_peaks.head()

Unnamed: 0,peakid,location,heightm,himal,region,open,unlisted,trekyear,restrict,phost,pstatus
0,AMAD,Khumbu Himal,6814,12,2,True,False,,,1,2
1,AMPG,Khumbu Himal (N of Ama Dablam),5630,12,2,True,False,,Opened in 2002,1,2
2,ANN1,Annapurna Himal,8091,1,5,True,False,,,1,2
3,ANN2,Annapurna Himal,7937,1,5,True,False,,,1,2
4,ANN3,Annapurna Himal,7555,1,5,True,False,,,1,2


### Expeditions

In [508]:
standardize_colnames(df_exped)

In [509]:
df_exped.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204
