In [1]:
#########################################################################################
# Data Science Training Workshop: Using Jupyter Notebook and R (with a little Spark)
# Presenters: Leslie McIntosh, PhD, MPH; Connie Zabarovskaya, MITM; Lorinette Wirth, MPH
# 
# This script reads BREAST.txt data and parses into "original_data", which has 483,489 rows.
# It limits data to diagnosis years 1973-2002 and only 21 variables seen in data_dictionary 
# The subset is saved as data_sample.
# The script then creates the classification variable (survived) and "cleans" up the 
# rest of the variables for analysis. During the cleanup we also create one more 
# predictor variable based on Number of Positive Nodes (PN_PRESENT)
#########################################################################################

# SEER Research Data 1973-2013 -- ASCII Text Data:  Surveillance, Epidemiology, and End Results
# (SEER) Program (www.seer.cancer.gov) Research Data (1973-2013), National Cancer Institute,
# DCCPS, Surveillance Research Program, Surveillance Systems Branch, released April 2016,
# based on the November 2015 submission.
# see https://seer.cancer.gov/data/documentation.html

In [2]:
# The three txt files below were prepared using Python, however, they can be prepared using another tool.
# Create vector of var names and widths from Python output then cobine as data frame 
names<- scan("name.txt", what="",sep=",")
width<-scan("width.txt",what="",sep=",")
start<-scan("start.txt",what="",sep=",")

start<-as.numeric(start)
width<-as.numeric(width)
names<-as.character(names)
info<-data.frame(names,start,width)
head(info)

names,start,width
PUBCSNUM,1,8
REG,9,10
MAR_STAT,19,1
RACE1V,20,2
NHIADE,23,1
SEX,24,1


In [3]:
# read in breast.txt where each line is one record 

read <- proc.time()
raw_data <- readLines("BREAST.TXT")
paste0("read time: ")
proc.time() - read

trans <- proc.time()
# function to split each line per start and end of each variable
parseFixedWidthFile <- function(data, data_dictionary){
  empty_df<- as.data.frame(matrix(0, ncol = 134, nrow = length(data)))
  for (i in 1:nrow(data_dictionary)) {
    newdata<-unlist(lapply(data,substr,data_dictionary$start[i],data_dictionary$start[i]+data_dictionary$width[i]-1))
    empty_df[i]<-newdata
  }
  empty_df
}

original_data <- parseFixedWidthFile(raw_data, info)
# rename columns
names(original_data) <- info$names
paste0("transformation time: ")
proc.time() - trans

save(original_data, file = "original_data.rdata" )

   user  system elapsed 
  7.950  14.460  25.746 

   user  system elapsed 
 791.77    0.14  792.92 

In [4]:
load("original_data.rdata")

names(original_data)<-trimws(names(original_data))
# Checking for empty string values
paste0("Checking for empty string values")
table(original_data$YEAR_DX)
original_data$YEAR_DX <- as.numeric(original_data$YEAR_DX)
# limit to case dx between 1973 and 2002 
data_sample <- original_data[original_data$YEAR_DX >= 1973 & original_data$YEAR_DX <= 2002,]
# data_sample is 483489 rows


 1973  1974  1975  1976  1977  1978  1979  1980  1981  1982  1983  1984  1985 
 7563 10016 10237 10029  9964 10127 10517 10745 11332 11537 12320 13247 14639 
 1986  1987  1988  1989  1990  1991  1992  1993  1994  1995  1996  1997  1998 
15413 16965 16924 16593 17685 18284 18666 18578 19302 20167 20715 22070 23526 
 1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009  2010  2011 
23874 23608 24429 24417 23349 23961 24057 24560 25565 26273 27207 26520 27770 
 2012  2013 
27937 28573 

In [5]:
keepVar<-c(
  "MAR_STAT"     ,
  "RACE1V"       ,
  "AGE_DX"       ,
  "PRIMSITE"     ,
  "GRADE"        ,
  "EOD10_SZ"     ,
  "EOD10_EX"     , 
  "EOD10_ND"     ,
  "EOD10_PN"     ,  
  "EOD10_NE"     , 
  "RADIATN"      , 
  "ADJAJCCSTG"   ,  
  "SEQ_NUM"      ,
  "HISTO3V"      ,
  "BEHO3V"       ,
  "SS_SURG"      ,
  "SURGPRIF"     ,
  "srv_time_mon" ,
  "STAT_REC"     ,
  "CODPUB"       ,
  "PUBCSNUM"     ,
  "REC_NO"       
) 
data_sample<-data_sample[,names(data_sample) %in% keepVar ]

save(data_sample,file="data_sample.rdata")

In [6]:
# ON DOCKER: If you're not using conniez/all-spark-notebook image, uncomment the install command. 
# This needs to be run only once per container run (so the package is installed)
# suppressWarnings(suppressMessages(install.packages("dplyr", lib="/opt/conda/lib/R/library", repo="http://cran.us.r-project.org")))
# ON ANACONDA: If you haven't yet installed dplyr package, uncomment this command
# suppressWarnings(suppressMessages(install.packages("dplyr", repo="http://cran.us.r-project.org")))

In [7]:
suppressWarnings(suppressMessages(library(dplyr)))
load("data_sample.rdata")

# Recoding Survivability Variable

# Cause of Death Codes downloaded with description and cancer, vs. non-cancer relatedness from here:
# https://seer.cancer.gov/codrecode/1969+_d09172004/index.txt

data_sample$srv_time_mon <- as.integer(data_sample$srv_time_mon)
# Cause of Death - Breast Cancer code is 26000
# Vital Status Recode (STAT_REC) Alive code is 1

# assign survived variable value 1 - survived, 0 - not survived, NA - to be ignored
data_sample <- mutate(data_sample, survived = ifelse(srv_time_mon >= 60 & srv_time_mon != 9999 & STAT_REC == '1', 1,
                                                     ifelse(srv_time_mon < 60 & CODPUB == '26000', 0,NA)))
# Display number of records where survived is not missing (NA)
paste0("Number of rows in data_sample")
nrow(data_sample[!is.na(data_sample$survived),])
# records with survival 1 and 0 for breast cancer - 238457, the rest should be ignored.

# Display survivability breakdown
paste0("Distribution of survived variable")
table(data_sample$survived)

# Drop NA values
data_sample <- data_sample[!is.na(data_sample$survived),]


     0      1 
 67749 170708 

In [8]:
str(data_sample)

'data.frame':	238457 obs. of  23 variables:
 $ PUBCSNUM    : chr  "07000003" "07000112" "07000156" "07000198" ...
 $ MAR_STAT    : chr  "2" "2" "5" "2" ...
 $ RACE1V      : chr  "01" "02" "01" "01" ...
 $ AGE_DX      : chr  "060" "045" "076" "058" ...
 $ SEQ_NUM     : chr  "02" "02" "02" "02" ...
 $ PRIMSITE    : chr  "C505" "C509" "C508" "C504" ...
 $ HISTO3V     : chr  "8500" "8500" "8500" "8500" ...
 $ BEHO3V      : chr  "3" "3" "3" "3" ...
 $ GRADE       : chr  "9" "9" "9" "3" ...
 $ EOD10_SZ    : chr  "008" "   " "021" "   " ...
 $ EOD10_EX    : chr  "10" "  " "10" "  " ...
 $ EOD10_ND    : chr  "0" " " "6" " " ...
 $ EOD10_PN    : chr  "00" "  " "02" "  " ...
 $ EOD10_NE    : chr  "15" "  " "08" "  " ...
 $ SURGPRIF    : chr  "  " "  " "  " "  " ...
 $ RADIATN     : chr  "0" "0" "0" "0" ...
 $ SS_SURG     : chr  "20" "90" "50" "90" ...
 $ REC_NO      : chr  "01" "01" "01" "01" ...
 $ CODPUB      : chr  "00000" "00000" "26000" "26000" ...
 $ STAT_REC    : chr  "1" "1" "4" "4" ...


In [9]:
# Identifying and Recoding Missing Values for Predictor Variables

# Tumor Size
# Remove values of tumor size >200mm in accordance with Paper by Delen et al.
# Tumor size in Paper by Bellaachia and Guven: mean=20, sd=16, range=0-200
# Checking for empty string values
sort(unique(data_sample$EOD10_SZ))
# There are values consisting of 3 spaces
data_sample$EOD10_SZ[data_sample$EOD10_SZ %in% c('   ')] <- NA
data_sample$EOD10_SZ <- as.integer(data_sample$EOD10_SZ)
data_sample$EOD10_SZ[data_sample$EOD10_SZ > 200]<-NA
EOD10_SZrange <- range(data_sample$EOD10_SZ,na.rm = T)
paste0("Range: ",paste(EOD10_SZrange, collapse = "-"))
EOD10_SZmean <- round(mean(data_sample$EOD10_SZ,na.rm = T), 2)
paste0("Mean: ",EOD10_SZmean)
EOD10_SZsd <- round(sd(data_sample$EOD10_SZ,na.rm = T),2)
paste0("Standard Deviation: ",EOD10_SZsd)
paste0("Number of records with missing or invalid tumor size")
table(is.na(data_sample$EOD10_SZ))


 FALSE   TRUE 
150208  88249 

In [10]:
# Age
# Age in Paper by Bellaachia and Guven:mean=58, sd=13, range=10-110
# Checking for empty string values
sort(unique(data_sample$AGE_DX))
data_sample$AGE_DX[data_sample$AGE_DX %in% c(999, ' ')] <- NA
data_sample$AGE_DX<- as.integer(data_sample$AGE_DX)
AGE_DXrange <- range(data_sample$AGE_DX,na.rm = T)
paste0("Range: ",paste(AGE_DXrange, collapse = "-"))
AGE_DXmean <- round(mean(data_sample$AGE_DX,na.rm = T),2)
paste0("Mean: ",AGE_DXmean)
AGE_DXsd <- round(sd(data_sample$AGE_DX,na.rm = T),2)
paste0("Standard Deviation: ",AGE_DXsd)
paste0("Number of records with missing or invalid AGE_DX")
table(is.na(data_sample$AGE_DX))


 FALSE   TRUE 
238451      6 

In [11]:
# Presense of Positive Nodes
# This is a variable calculated based on No. of Positive Nodes
# See No. of Positive Nodes codes for details
# 00, 95 - code as 0, "No positive nodes present"
# 01-90 - code as 1, "Positive nodes present"
# 98, 99 - code as NA, "No nodes were examined OR Unknown whether nodes are positive" 
data_sample <- mutate(data_sample, PN_PRESENT = ifelse(EOD10_PN %in% c("95","00"), 0,
                                                ifelse(EOD10_PN %in% c("98","99"), NA, 1)))
data_sample$PN_PRESENT <- as.factor(data_sample$PN_PRESENT)
table(data_sample$PN_PRESENT)
PN_PRESENTlev <- nlevels(data_sample$PN_PRESENT)
paste0("Number of records with missing or invalid PN_PRESENT")
table(is.na(data_sample$PN_PRESENT))


     0      1 
 86941 102258 


 FALSE   TRUE 
189199  49258 

In [12]:
# No. of Positive Nodes
# This field has some weird values
# Code Description
# 00 All nodes examined are negative
# 01-89 Exact number of nodes positive
# 90 90 or more nodes are positive
# 95 Positive aspiration of lymph node(s) was performed
# 97 Positive nodes are documented, but number is unspecified
# 98 No nodes were examined
# 99 Unknown whether nodes are positive; not applicable; not
# stated in patient record

# Checking for empty string values
sort(unique(data_sample$EOD10_PN))
# Based on that dictionary, consider renaming all values >90 to be NA
data_sample$EOD10_PN <- as.integer(data_sample$EOD10_PN)
# this also creates NAs if value was empty string
table(data_sample$EOD10_PN[data_sample$EOD10_PN > 90])
data_sample$EOD10_PN[data_sample$EOD10_PN > 90]<-NA
EOD10_PNrange <- range(data_sample$EOD10_PN,na.rm=T)
paste0("Range: ", paste(EOD10_PNrange, collapse="-"))
EOD10_PNmean <- round(mean(data_sample$EOD10_PN,na.rm=T),2)
paste0("Mean: ",EOD10_PNmean)
EOD10_PNsd <- round(sd(data_sample$EOD10_PN,na.rm=T),2)
paste0("Standard Deviation: ",EOD10_PNsd)
paste0("Number of records with missing or invalid EOD10_PN")
table(is.na(data_sample$EOD10_PN))


   95    97    98    99 
    1   899 48148  1110 


 FALSE   TRUE 
127395 111062 

In [13]:
# Number of Nodes Examined (Number of Nodes)
# Code Description
# 00 No nodes were examined
# 01-89 Exact number of nodes examined
# 90 90 or more nodes were examined
# 95 No regional nodes were removed, but aspiration of regional
# nodes was performed
# 96 Regional lymph node removal was documented as a
# sampling, and the number of nodes is unknown/not stated
# 97 Regional lymph node removal was documented as a
# dissection, and the number of nodes is unknown/not stated
# 98 Regional lymph nodes were surgically removed, but the
# number of lymph nodes is unknown/not stated and not
# documented as a sampling or dissection; nodes were
# examined, but the number is unknown
# 99 Unknown whether nodes were examined; not applicable or
# negative; not stated in patient record

# Checking for empty string values
sort(unique(data_sample$EOD10_NE))
# Based on that dictionary, consider renaming all values >90 to be NA
data_sample$EOD10_NE<-as.integer(data_sample$EOD10_NE)
table(data_sample$EOD10_NE[data_sample$EOD10_NE > 90])
data_sample$EOD10_NE[data_sample$EOD10_NE > 90]<-NA
EOD10_NErange <- range(data_sample$EOD10_NE,na.rm=T)
paste0("Range: ",paste(EOD10_NErange, collapse="-"))
EOD10_NEmean <- round(mean(data_sample$EOD10_NE,na.rm=T), 2)
paste0("Mean: ",EOD10_NEmean)
EOD10_NEsd <- round(sd(data_sample$EOD10_NE,na.rm=T), 2)
paste0("Standard Deviation: ",EOD10_NEsd)
paste0("Number of records with missing or invalid EOD10_NE")
table(is.na(data_sample$EOD10_NE))


  95   96   97   98   99 
  74   28  328 1923 1034 


 FALSE   TRUE 
174166  64291 

In [14]:
# Number of primaries
# In Situ/Malignant as Federally Required based on Diagnosis Year
# Code Description
# 00 One primary only in the patient's lifetime
# 01 First of two or more primaries
# 02 Second of two or more primaries
# .. (Actual number of this primary)
# 41 Forty-first of forty-one or more primaries
# 99 Unspecified or unknown sequence number of Federally required in situ or malignant
# tumors. Sequence number 99 can be used if there is a malignant tumor and its sequence
# number is unknown. (If there is known to be more than one malignant tumor, then the
#                     tumors must be sequenced.)
# Non-malignant Tumor as Federally Required based on Diagnosis Year
# Code Description
# 60 Only one non-malignant tumor or central registry-defined neoplasm
# 61 First of two or more non-malignant tumors or central registry-defined neoplasms
# 62 Second of two or more non-malignant tumors or central registry-defined neoplasms
# .. ..
# 87 Twenty-seventh of twenty-seven
# 88 Unspecified or unknown sequence number of non-malignant tumor or central-registry
# defined neoplasms. (Sequence number 88 can be used if there is a non-malignant tumor
#                     and its sequence number is unknown. If there is known to be more than one nonmalignant
#                     tumor, then the tumors must be sequenced.)

# Checking for empty string values
as.data.frame(table(data_sample$SEQ_NUM))
# should not convert to numeric as is, because 0 actually means a single primary.
# perhaps leave this as character var, or convert to factor, if decision tree requires factors (which it sometimes does)
# however, based on Paper by Bellaachia and Guven, perhaps we could rename 0 to 1
data_sample$SEQ_NUM[data_sample$SEQ_NUM %in% c(99,88,' ')]<-NA
data_sample$SEQ_NUM<-as.integer(data_sample$SEQ_NUM)
data_sample$SEQ_NUM[data_sample$SEQ_NUM == 0] <- 1
SEQ_NUMrange <- range(data_sample$SEQ_NUM,na.rm=T)
paste0("Range: ",paste(SEQ_NUMrange, collapse = "-"))
SEQ_NUMmean <- round(mean(data_sample$SEQ_NUM,na.rm=T),2)
paste0("Mean: ",SEQ_NUMmean)
SEQ_NUMsd <- round(sd(data_sample$SEQ_NUM,na.rm=T),2)
paste0("Standard Deviation: ",SEQ_NUMsd)
paste0("Number of records with missing or invalid Number of primaries")
table(is.na(data_sample$SEQ_NUM))

Var1,Freq
0,170874
1,39253
2,26007
3,2138
4,161
5,17
6,1
99,6



 FALSE   TRUE 
238451      6 

In [15]:
# Marital Status
# Code Description
# 1 Single (never married)
# 2 Married (including common law)
# 3 Separated
# 4 Divorced
# 5 Widowed
# 6 Unmarried or domestic partner (same sex or opposite sex or
#                                  unregistered)
# 9 Unknown

# Use sort and unique to check for empty string values
sort(unique(data_sample$MAR_STAT))
# Assign NA where MAR_STAT equals 9
data_sample$MAR_STAT[data_sample$MAR_STAT %in% c(9)]<-NA
# Assign 1 where MAR_STAT is 1 or 6 (combine the two categories together)
# Unmarried but living with part combined =with single
data_sample$MAR_STAT[data_sample$MAR_STAT %in% c(1,6)]<-1 
# Convert MAR_STAT to a factor
data_sample$MAR_STAT<-as.factor(data_sample$MAR_STAT)
# Display distribution of values in MAR_STAT
table(data_sample$MAR_STAT)
# Create variable MAR_STATlev to store the number of levels in MAR_STAT variable
MAR_STATlev <- nlevels(data_sample$MAR_STAT)
paste0("Number of factor levels: ",MAR_STATlev)
paste0("Number of records with missing or invalid MAR_STAT")
# Display breakdown of NA and non-missing values in MAR_STAT
table(is.na(data_sample$MAR_STAT))


     1      2      3      4      5 
 25731 150715   2791  22390  29956 


 FALSE   TRUE 
231583   6874 

In [16]:
# Race
# Checking for empty string values
sort(unique(data_sample$RACE1V))

# We combine races that have small representation (< 300) into category "Other"
table(as.factor(data_sample$RACE1V))
race_dist <- as.data.frame(table(as.factor(data_sample$RACE1V)))
data_sample$RACE1V[data_sample$RACE1V %in% race_dist[race_dist$Freq < 300, "Var1"]]<-98
data_sample$RACE1V[data_sample$RACE1V %in% c(99)]<-NA
data_sample$RACE1V<-as.factor(data_sample$RACE1V)
table(data_sample$RACE1V)
RACE1Vlev <- nlevels(data_sample$RACE1V)
paste0("Number of factor levels: ",RACE1Vlev)
table(is.na(data_sample$RACE1V))


    01     02     03     04     05     06     07     08     10     11     12 
201084  20115    980   3330   4865   3086   1693    527    308     31      1 
    13     14     15     16     17     20     21     22     25     26     27 
    24     42    326    117     15     14      4     16     10      2     98 
    28     30     31     32     96     97     98     99 
    38      3     13      4    763     15    267    666 


    01     02     03     04     05     06     07     08     10     15     96 
201084  20115    980   3330   4865   3086   1693    527    308    326    763 
    98 
   714 


 FALSE   TRUE 
237791    666 

In [17]:
# Primary Site code
# Checking for empty string values
sort(unique(data_sample$PRIMSITE))
data_sample$PRIMSITE<-as.factor(data_sample$PRIMSITE)
PRIMSITElev <- nlevels(data_sample$PRIMSITE)
paste0("Number of factor levels: ",PRIMSITElev)
table(data_sample$PRIMSITE)
table(is.na(data_sample$PRIMSITE))


 C500  C501  C502  C503  C504  C505  C506  C508  C509 
 2118 13602 19755 11017 78434 14277  1917 46048 51289 


 FALSE 
238457 

In [18]:
# Histologic type
# Checking for empty string values
# Recode values to reduce number of levels
# Based on these sources, we used the following mapping
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2361680/
# http://jamanetwork.com/journals/jamainternalmedicine/fullarticle/216158
# 
# ".... grouped according to the histologic categories of their tumors as defined 
# by the International Classification of Diseases for Oncology (ICD-O) codes: 
# "0001" - ductal (ICD-O code 8500), 
# "0002" - lobular (8520 and 8522), 
# "0003" - mucinous (8480), 
# "0004" - comedocarcinoma (8501), 
# "0005" - medullary (8510), 
# "0006" - tubular (8211 and 8201), 
# "0007" - and papillary (8050 and 8503)"
# "0008" - Code all others as "Other."

table(data_sample$HISTO3V)
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8500")] <- "0001"
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8520","8522")] <- "0002"
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8480")] <- "0003"
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8501")] <- "0004"
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8510")] <- "0005"
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8211","8201")] <- "0006" 
data_sample$HISTO3V[data_sample$HISTO3V %in% c("8050","8503")] <- "0007"
data_sample$HISTO3V[!data_sample$HISTO3V %in% c("0001","0002","0003","0004","0005","0006","0007")] <- "0008"
data_sample$HISTO3V<-as.factor(data_sample$HISTO3V)
HISTO3Vlev <- nlevels(data_sample$HISTO3V)
paste0("Number of factor levels: ",HISTO3Vlev)
table(is.na(data_sample$HISTO3V))


  8000   8001   8003   8004   8010   8012   8020   8021   8022   8031   8032 
  1110     54      2      4   6641     39    150    137     25      3     46 
  8033   8041   8046   8050   8070   8071   8074   8082   8140   8141   8154 
     7     49      4    533    107     11      7      2   7686    865      1 
  8190   8200   8201   8210   8211   8230   8231   8240   8246   8251   8260 
     2    143   1542      1   2606    791     10      7     16      1    171 
  8310   8314   8315   8323   8345   8400   8401   8430   8440   8470   8480 
    27      1      4      9      1      2    246      1      3      1   3249 
  8481   8490   8500   8501   8502   8503   8504   8507   8508   8510   8512 
   196    106 158292   9593     37   1873    304    382      1   3750    186 
  8513   8514   8520   8521   8522   8523   8524   8530   8540   8541   8542 
    64      5  18846    334  10927   2076    152   2177    233    981      1 
  8543   8550   8560   8562   8570   8571   8572   8573   8575 


 FALSE 
238457 

In [19]:
# Behavior
# Checking for empty string values
sort(unique(data_sample$BEHO3V))
data_sample$BEHO3V<-as.factor(data_sample$BEHO3V)
BEHO3Vlev <- nlevels(data_sample$BEHO3V)
paste0("Number of factor levels: ",BEHO3Vlev)
table(data_sample$BEHO3V)
table(is.na(data_sample$BEHO3V))


     2      3 
 37762 200695 


 FALSE 
238457 

In [20]:
# Grade
# Code Description
# 1 Grade I; grade i; grade 1; well differentiated; differentiated, NOS
# 2 Grade II; grade ii; grade 2; moderately differentiated; moderately differentiated; intermediate differentiation
# 3 Grade III; grade iii; grade 3; poorly differentiated; differentiated
# 4 Grade IV; grade iv; grade 4; undifferentiated; anaplastic
# 5 T-cell; T-precursor
# 6 B-cell; Pre-B; B-Precursor
# 7 Null cell; Non T-non B;
# 8 N K cell (natural killer cell)
# 9 cell type not determined, not stated or not applicable

# Checking for empty string values
sort(unique(data_sample$GRADE))
data_sample$GRADE[data_sample$GRADE %in% c(9)]<-NA
data_sample$GRADE<-as.factor(data_sample$GRADE)
GRADElev <- nlevels(data_sample$GRADE)
paste0("Number of factor levels: ",GRADElev)
table(data_sample$GRADE)
table(is.na(data_sample$GRADE))


    1     2     3     4 
22828 54097 57167  8174 


 FALSE   TRUE 
142266  96191 

In [21]:
# Extension of Tumor
# Allowable values = 00-99.
# Recode child nodes to parent nodes according to this source
# Page 120 of https://seer.cancer.gov/archive/manuals/EOD10Dig.pub.pdf
# Checking for empty string values
sort(unique(data_sample$EOD10_EX))
# There are multiple 2-space values
data_sample$EOD10_EX_num <- as.numeric(data_sample$EOD10_EX)
data_sample <- mutate(data_sample, 
                       EOD10_EX_str = ifelse(EOD10_EX_num >= 10 & EOD10_EX_num < 20, "10",
                                      ifelse(EOD10_EX_num >= 20 & EOD10_EX_num < 30, "20",
                                      ifelse(EOD10_EX_num >= 30 & EOD10_EX_num < 40, "30",
                                      ifelse(EOD10_EX_num == 0, "00", 
                                      ifelse(EOD10_EX_num == 5, "05",EOD10_EX_num))))))
data_sample$EOD10_EX <- data_sample$EOD10_EX_str
data_sample$EOD10_EX[data_sample$EOD10_EX %in% c(99)]<-NA
data_sample$EOD10_EX<-as.factor(data_sample$EOD10_EX)
EOD10_EXlev <- nlevels(data_sample$EOD10_EX)
paste0("Number of factor levels: ",EOD10_EXlev)
table(is.na(data_sample$EOD10_EX))


 FALSE   TRUE 
173537  64920 

In [22]:
# Lymph node involvement
# Checking for empty string values
# Allowable values = 0-9.
# 0 No lymph node involvement
# 9 UNKNOWN; not stated
sort(unique(data_sample$EOD10_ND))
data_sample$EOD10_ND[data_sample$EOD10_ND %in% c(9,' ')]<-NA
data_sample$EOD10_ND<-as.factor(data_sample$EOD10_ND)
EOD10_NDlev <- nlevels(data_sample$EOD10_ND)
paste0("Number of factor levels: ",EOD10_NDlev)
table(is.na(data_sample$EOD10_ND))


 FALSE   TRUE 
164746  73711 

In [23]:
# Site Specific Surgery Code....priotize SURGPRIF
table(data_sample$SS_SURG)
# Considering https://seer.cancer.gov/seerstat/variables/seer/surgery/
# Code 	Description
# 00 	No surgical procedure
# 09 	Unknown if surgery done
# 90 	Surgery, NOS
# See other code definitions here: https://seer.cancer.gov/archive/manuals/historic/AppendD.pdf



         00    01    02    03    05    07    09    10    18    20    28    30 
81861  3014   321  2103     1     9     3  6820 17186    63 30721    90   535 
   38    40    48    50    58    60    68    70    78    80    88    90    98 
   71  4736  1134 55235  5483   410    14    16     2   388     3 28232     6 

In [24]:
# SURGPRIF - Surgery Primary Syte
# Code Description
# 00 None; no surgical procedure of primary site; diagnosed at
# autopsy only
# 10-19 Site-specific codes. Tumor destruction; no pathologic
# specimen or unknown whether there is a pathologic
# specimen
# 20-80 Site-specific codes. Resection; pathologic specimen
# 90 Surgery, NOS. A surgical procedure to the primary site
# was done, but no information on the type of surgical
# procedure is provided.
# 98 Special codes for hematopoietic, reticuloendothelial,
# immunoproliferative, myeloproliferative diseases; illdefined
# sites; and unknown primaries (See site-specific
#                               codes for the sites and histologies), except death
# certificate only
# 99 Unknown if surgery performed; death certificate only
# See detailed code descriptions here: https://seer.cancer.gov/archive/manuals/AppendC.pdf


In [25]:
# Radiation
# Code  Description
# 0  None; diagnosed at autopsy
# 1  Beam radiation
# 2  Radioactive implants
# 3  Radioisotopes
# 4  Combination of 1 with 2 or 3
# 5  Radiation, NOS - method or source not specified
# 6  Other radiation (1973-1987 cases only)
# 7  Patient or patient's guardian refused radiation therapy
# 8  Radiation recommended, unknown if administered
# 9  Unknown if radiation administered

# Checking for empty string values
sort(unique(data_sample$RADIATN))
data_sample$RADIATN[data_sample$RADIATN %in% c(9)]<-NA
data_sample$RADIATN<-as.factor((data_sample$RADIATN))
RADIATNlev <- nlevels(data_sample$RADIATN)
paste0("Number of factor levels: ",RADIATNlev)
table(is.na(data_sample$RADIATN))


 FALSE   TRUE 
237918    539 

In [26]:
# Stage of Cancer
# Code Description
# 88 N/A
# 90 OCCULT
# 99 UNK Stage
# Checking for empty string values
sort(unique(data_sample$ADJAJCCSTG))
# There are values that have 2 blank spaces (not 1)
data_sample$ADJAJCCSTG[data_sample$ADJAJCCSTG %in% c(88,'  ',99)]<-NA
data_sample$ADJAJCCSTG<-as.factor(data_sample$ADJAJCCSTG)
ADJAJCCSTGlev <- nlevels(data_sample$ADJAJCCSTG)
paste0("Number of factor levels: ",ADJAJCCSTGlev) 
table(is.na(data_sample$ADJAJCCSTG))


 FALSE   TRUE 
163157  75300 

In [27]:
# save this version of data_sample as clean_data_sample if needed to load later
clean_data_sample <- data_sample
save(clean_data_sample, file = "clean_data_sample.rdata")

In [28]:
load("clean_data_sample.rdata")

# data_sample <- clean_data_sample

# Recode SS_SURG and SURGPRIF to a common Site-Specific Surgery Code
ss_surg_surgprif_map <- read.csv("ss_surg_surgprif_map.csv", colClasses=c("character","character"))
str(ss_surg_surgprif_map)

data_sample <- merge(data_sample, ss_surg_surgprif_map, by.x = "SS_SURG", by.y = "CODE",
                     all.x = TRUE)
data_sample <- merge(data_sample, ss_surg_surgprif_map, by.x = "SURGPRIF", by.y = "CODE",
                     all.x = TRUE)
# recoding into one var
data_sample <- mutate(data_sample, SURGCODE = ifelse(SURGCODE.x=="", SURGCODE.y, SURGCODE.x))
# recoding "Unknown" codes into NAs
# 99 - Uknown if surgery was done
data_sample$SURGCODE[data_sample$SURGCODE %in% c("99")] <- NA

data_sample$SURGCODE <- as.factor(data_sample$SURGCODE)
SURGCODElev <- nlevels(data_sample$SURGCODE)

'data.frame':	60 obs. of  2 variables:
 $ CODE    : chr  "00" "01" "02" "03" ...
 $ SURGCODE: chr  "00" "00" "00" "00" ...


In [29]:
# Dataset with NAs included
full_surgcode_data_sample <- data_sample[,c(
  "MAR_STAT"     ,
  "RACE1V"       ,
  "AGE_DX"       ,
  "PRIMSITE"     ,
  "GRADE"        ,
  "EOD10_SZ"     ,
  "EOD10_EX"     , 
  "EOD10_ND"     ,
  "EOD10_PN"     ,
  "PN_PRESENT"   ,
  "EOD10_NE"     , 
  "RADIATN"      , 
  "ADJAJCCSTG"   ,  
  "SEQ_NUM"      ,
  "HISTO3V"      ,
  "BEHO3V"       ,
  "SURGCODE"      ,
  "survived"
)]
paste0("Number of rows in Dataset with NAs included")
nrow(full_surgcode_data_sample)
save(full_surgcode_data_sample, file = "full_surgcode_data_sample.rdata")

In [30]:
#####################################################################################
# Dataset with complete cases (no NAs included)
clean_surgcode_data_sample<-na.omit(data_sample[,c(
  "MAR_STAT"     ,
  "RACE1V"       ,
  "AGE_DX"       ,
  "PRIMSITE"     ,
  "GRADE"        ,
  "EOD10_SZ"     ,
  "EOD10_EX"     , 
  "EOD10_ND"     ,
  "EOD10_PN"     ,
  "PN_PRESENT"   ,
  "EOD10_NE"     , 
  "RADIATN"      , 
  "ADJAJCCSTG"   ,  
  "SEQ_NUM"      ,
  "HISTO3V"      ,
  "BEHO3V"       ,
  "SURGCODE"      ,
  "survived"
)])

paste0("Number of rows in Dataset with complete cases")
nrow(clean_surgcode_data_sample)
save(clean_surgcode_data_sample, file = "clean_surgcode_data_sample.rdata")

In [31]:
# Table 1 - describing the full dataset
data.frame(Nominal_Variable_Name = c("Race", "Marital status", "Primary site code", "Histologic type", 
                                    "Behavior code", "Grade", "Extension of tumor", "Lymph node involvement",
                                    "Presense of positive nodes", "Site specific surgery code", "Radiation", 
                                     "Stage of Cancer"),
          Number_of_Distinct_Values = c(RACE1Vlev, MAR_STATlev, PRIMSITElev,HISTO3Vlev, BEHO3Vlev,
                                       GRADElev, EOD10_EXlev, EOD10_NDlev,PN_PRESENTlev, 
                                        SURGCODElev,RADIATNlev, ADJAJCCSTGlev))

data.frame(Numeric_Variable_Name = c("Age", "Tumor Size", "No of positive nodes", "Number of nodes", "Number of primaries"),
           Mean = c(AGE_DXmean, EOD10_SZmean, EOD10_PNmean, EOD10_NEmean, SEQ_NUMmean),
           StdDev = c(AGE_DXsd, EOD10_SZsd, EOD10_PNsd, EOD10_NEsd, SEQ_NUMsd),
           Range = c(paste(AGE_DXrange, collapse = "-"), paste(EOD10_SZrange, collapse = "-"),
                    paste(EOD10_PNrange, collapse = "-"),paste(EOD10_NErange, collapse="-"),
                    paste(SEQ_NUMrange, collapse="-"))
          )

Nominal_Variable_Name,Number_of_Distinct_Values
Race,12
Marital status,5
Primary site code,9
Histologic type,8
Behavior code,2
Grade,4
Extension of tumor,11
Lymph node involvement,9
Presense of positive nodes,2
Site specific surgery code,8


Numeric_Variable_Name,Mean,StdDev,Range
Age,55.8,13.13,10-106
Tumor Size,20.0,18.63,0-200
No of positive nodes,1.47,3.81,0-75
Number of nodes,9.79,8.79,0-90
Number of primaries,1.13,0.37,1-6
