# Survival Analysis in Python

SEER Survival Data

Allen B. Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [None]:
# Configure Jupyter so figures appear in the notebook
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'white' style to the figures drawn in this notebook.
sns.set(style='white')

import utils
from utils import decorate
from empyrical_dist import Pmf, Cdf, Surv, Hazard

In [None]:
# Fixed-width SEER incidence file; the path is relative to the notebook's
# working directory.
dat_file = 'SEER/SEER_1975_2016_TEXTDATA/incidence/yr1975_2016.seer9/OTHER.TXT'

# Fields to extract, as (column name, first character, last character),
# using 1-based inclusive character positions within each record.
field_layout = [
    ('SEX', 24, 24),
    ('AGE_DX', 25, 27),
    ('YEAR_DX', 39, 42),
    ('PRIMSITE', 43, 46),
    ('GRADE', 58, 58),
    ('STAT_REC', 265, 265),
    ('SRV_TIME_MON', 301, 304),
    ('SRV_TIME_MON_FLAG', 305, 305),
]

names = [name for name, _first, _last in field_layout]

# read_fwf expects 0-based, half-open (start, stop) offsets, so only the
# first position is shifted down by one.
colspecs = [(first - 1, last) for _name, first, last in field_layout]

# header=None makes pandas read the first line as data; `names` supplies the
# column labels. For a quick experiment on a subset, add nrows=10000; to read
# a gzipped copy of the file, add compression='gzip'.
df = pd.read_fwf(dat_file, colspecs=colspecs, names=names, header=None)

In [None]:
# Size of the loaded table as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the parsed table.
df.head()

In [None]:
# Tally the records by STAT_REC code.
# NOTE(review): presumably the SEER vital-status recode -- confirm the code
# meanings against the SEER data dictionary.
df['STAT_REC'].value_counts()

In [None]:
# Tally the records by PRIMSITE code; a later cell selects the codes that
# start with 'C71'.
df['PRIMSITE'].value_counts()

In [None]:
# Treat 9999 in SRV_TIME_MON as missing so it does not distort the
# distribution below (presumably SEER's code for "unknown" -- confirm against
# the SEER data dictionary).
#
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['SRV_TIME_MON']: that form operates on a
# selected Series, triggers a chained-assignment warning in pandas 2.x, and
# leaves df unchanged when Copy-on-Write is enabled.
df['SRV_TIME_MON'] = df['SRV_TIME_MON'].replace(9999, np.nan)

# Build a Cdf from the cleaned survival times and plot it.
Cdf.from_seq(df['SRV_TIME_MON']).plot()

In [None]:
# Boolean mask selecting the records whose PRIMSITE code starts with 'C71'.
# na=False makes missing PRIMSITE values count as "no match"; without it the
# mask would contain NaN, and indexing df with such a mask raises an error.
brain = df['PRIMSITE'].str.startswith('C71', na=False)

In [None]:
# Number of records selected by the mask (each True counts as 1).
brain.sum()

In [None]:
# Save only the rows selected by the `brain` mask to an HDF5 file, stored
# under the key 'brain'. `key` is passed by keyword because newer pandas
# releases deprecate, and then reject, positional arguments after the path.
df[brain].to_hdf('brain.hd5', key='brain')

In [None]:
# Shell escape: list the working directory (presumably to check that
# brain.hd5 was written and see its size).
!ls -l

In [None]:
# Read the saved subset back from disk.
# NOTE: this rebinds `brain`, which earlier held the boolean mask, to the
# table stored in the file.
brain = pd.read_hdf('brain.hd5', key='brain')

In [None]:
# Preview the first rows of the table read back from brain.hd5.
brain.head()

In [None]:
# Build a Cdf from the YEAR_DX column of the reloaded subset and plot it.
year_dx_cdf = Cdf.from_seq(brain['YEAR_DX'])
year_dx_cdf.plot()