In [2]:
import pandas as pd
import numpy as np


In [3]:
#Read the sample phenotype table; the first CSV column becomes the row index
phenotype_path = '/Users/carternorton/Desktop/RCC/KIRC_Phenotype.csv'
data = pd.read_csv(phenotype_path, index_col=0, header=0, sep=',')

In [4]:
#Show every column label available in the phenotype table
column_labels = data.columns
print(column_labels)

Index(['Mutation Data?', 'Institute', 'Systemic Treatment', 'TC',
       'Age at Diagnosis', 'Race', 'Ethnicity', 'Year of Collection',
       'PriorTumor (Y/N)', 'Presentation', 'Mets at Time of Surgery',
       'Location Mets_time of Surgery', 'Laterality', 'Tumor Grade',
       'Max Tumor Dimension', 'T Stage', 'Pathologic Lymph Nodes',
       'Pathologic Distant Metastasis', 'Tumor Stage',
       'Tumor Status-Tissue Collection', 'Vital Status-Enrollment',
       'Tumor Status-Last Followup', 'Vital Status Followup',
       'Days to New Tumor Event-Followup', 'Recurrence Location(s)',
       'Death From RCC?', 'Days to Last followup',
       'Days to Death-Last Followup', 'KPS', 'ECOG PS', 'HTN', 'DM',
       'Hyperchol', 'BMI', 'Smoking Status', 'CPD', 'Smoking Duration',
       'Year Quit Smoking', 'Cancer Hx (Type)', 'RCC Family Hx', 'Creatinine',
       'CPK-EPI', 'LDH', 'Erythrocyte Sed Rate', 'Calcium', 'White Cell Count',
       'Hemoglobin', 'Platelets', 'Sample Weight', 'T

In [8]:
#Let's isolate bmi (together with the TC and age-at-diagnosis columns)
bmi = data[['BMI', "TC", "Age at Diagnosis"]]

#Rename Age at Diagnosis
#rename() returns a new frame rather than modifying in place, so the result
#must be assigned back or the column keeps its old name
bmi = bmi.rename(columns={"Age at Diagnosis": "Age"})

#How many samples are missing BMI data?
print(bmi["BMI"].isnull().sum(), "samples are missing BMI data")

#Let's drop those samples
#NOTE(review): dropna() with no subset also drops rows that are missing TC or
#Age, not only BMI - confirm that this is intended
bmi = bmi.dropna()


52 samples are missing BMI data


In [9]:
#Define BMI categories
#Rules are checked top to bottom and the first matching one wins;
#anything that matches no rule falls through to "obese"
bmi_values = bmi["BMI"].to_numpy()
category_rules = [
    (bmi_values < 18.5, "underweight"),
    (bmi_values < 25, "normal"),
    (bmi_values < 30, "overweight"),
]
bmi["bmi"] = np.select(
    [matches for matches, _ in category_rules],
    [label for _, label in category_rules],
    default="obese",
)

#The numeric column is no longer needed once the category exists
bmi = bmi.drop(columns=['BMI'])


In [10]:
#Let's select for MRNA data from these patients
#Tab-separated count matrix; the first column becomes the row index
mrna = pd.read_csv('/Users/carternorton/Desktop/RCC/xena/TCGA-KIRC.htseq_counts.tsv', sep='\t', header=0, index_col=0)

#Remove any columns that don't end in -01A
sample_suffix = '-01A'
mrna = mrna.loc[:, mrna.columns.str.endswith(sample_suffix)]

#Now remove the trailing -01A from the column names.
#Slicing strips only the suffix, whereas str.replace would also remove any
#earlier occurrence and depends on the pandas regex default
mrna.columns = mrna.columns.str[:-len(sample_suffix)]

In [11]:
#Keep only the samples that appear in both tables, in the order they occur in bmi.
#The column labels are put in a set once so each membership test is a hash
#lookup instead of rebuilding and scanning a list for every sample
mrna_sample_ids = set(mrna.columns)
samples = [sample for sample in bmi.index.tolist() if sample in mrna_sample_ids]

print(len(samples), "samples have both BMI and mRNA data")

#Restrict both tables to the shared samples, in the same order
mrna = mrna[samples]

bmi = bmi.loc[samples]


325 samples have both BMI and mRNA data


In [12]:
#Tidy the mrna table
#Drop repeated row labels, keeping the first occurrence of each
is_repeat = mrna.index.duplicated(keep='first')
mrna = mrna[~is_repeat]

#Report the shape, discard rows that contain no values at all, then report again
n_rows, n_cols = mrna.shape
print("There are", n_rows, "rows and", n_cols, "columns in the mrna dataframe")
mrna = mrna.dropna(how='all')
n_rows, n_cols = mrna.shape
print("There are now", n_rows, "rows and", n_cols, "columns in the mrna dataframe")

There are 60488 rows and 325 columns in the mrna dataframe
There are now 60488 rows and 325 columns in the mrna dataframe


In [13]:
#Write the bmi table to disk
#Sample ids get every "-" swapped for "." before saving
dotted_ids = bmi.index.str.replace('-', '.')
bmi.index = dotted_ids
bmi_csv_path = '/Users/carternorton/Desktop/RCC/KIRC_BMI.csv'
bmi.to_csv(bmi_csv_path, index=True, header=True, sep=',')

In [14]:
#Let's prepare this data for saving
#Remove duplicated mrna indices and report how many were dropped
gene_count = len(mrna.index)
mrna = mrna[~mrna.index.duplicated(keep='first')]
print("There were", gene_count - len(mrna.index), "duplicated gene indices")

#Remove any index that starts with __
#(presumably summary rows rather than genes - confirm against the source file)
mrna = mrna[~mrna.index.str.startswith('__')]

#this data was log2 (x+1) transformed, so let's reverse that
#NOTE: this cell is not idempotent - running it twice applies the inverse twice
mrna = np.power(2, mrna) - 1
#Let's convert to integers
#Round first: astype(int) truncates toward zero, so a value that floating-point
#error leaves just below a whole number (e.g. 5.999999) would lose one count
mrna = mrna.round().astype(int)


There were 0 duplicated gene indices


In [15]:
#Write the mrna table to CSV, keeping the header row and the row index
mrna_csv_path = '/Users/carternorton/Desktop/RCC/KIRC_mRNA_BMI.csv'
mrna.to_csv(mrna_csv_path, index=True, header=True, sep=',')