# Creating Sample File For Toxicity Endpoints 

In [None]:
#import libraries and cohort csv files 
import pandas as pd
import numpy as np 
# Show up to 100 rows when a dataframe is displayed
pd.set_option('display.max_rows', 100)

# ARTDECO: ID lookup table (joined later on 'Trial Identifier'; holds the 'RAPPER      No. ' column)
ad_ids_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/ArtDecoResults/ART DECO-RAPPER IDs.csv')
# NIMRAD: ID lookup table (holds the 'Trial ID' and 'Sample ID' columns used below)
nim_ids_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/NIMRADResults/NIMRAD IDs_May 5th 21.csv')
# Endpoint values per patient for ARTDECO and NIMRAD (keyed by PATIENT_ID)
ad_covariables_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/ArtDecoResults/ADFinalEndpointValues.csv')
nim_covariables_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/NIMRADResults/NIMRADFinalEndpointValues.csv')
# VoxTox endpoint values (already contain a RAPPER_ID column)
voxtox_residuals = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/VoxToxResults/FinalVoxToxEndpointValues.csv')
# Patient ID lists taken from the genotyping files (ID columns 'RAP3300' and 'SampleName')
genotyping_orders = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/GeneticsFiles/PatientOrderGeneticsFiles.csv')
genotypingorders2 = pd.read_csv("/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/NIMRADResults/chromosone10patientID's.csv")

In [None]:
# Compare the patient IDs of two independent chromosome txt files (10 & 22).
# NOTE: isin() only tests membership -- this keeps the rows of genotyping_orders whose 'RAP3300'
# value also occurs in genotypingorders2['SampleName']; it does not compare the row order.
test = genotyping_orders[genotyping_orders['RAP3300'].isin(genotypingorders2['SampleName'])]
test

## ARTDECO Cohort 
- for each patient in the AD cohort: 
    - convert trial identifier into RAPPER ID used in genotyping
    - add variable to identify this cohort 

In [None]:
# Tidy the ARTDECO covariables: discard the leftover 'Unnamed: 0' column and relabel
# PATIENT_ID as 'Trial Identifier'
ad_covariables_df.drop(columns=['Unnamed: 0'], inplace=True)
ad_covariables_df.rename(columns={'PATIENT_ID': 'Trial Identifier'}, inplace=True)

In [None]:
ad_covariables_df

In [None]:
# Merge the covariables with the RAPPER IDs of the ARTDECO patients (outer join on
# 'Trial Identifier', so patients present in only one of the two tables are kept).
# Replace missing values with the string 'NAN'.
# NOTE(review): replace() is applied to the whole dataframe, so missing values in every column
# (not only the RAPPER No. column) become the string 'NAN'.
final_AD_residuals = pd.merge(ad_covariables_df, ad_ids_df, how='outer', on=['Trial Identifier'])
final_AD_residuals = final_AD_residuals.replace(np.nan,'NAN')
final_AD_residuals

In [None]:
final_AD_residuals.columns

In [None]:
# Reformat the RAPPER IDs
# Remove the placeholder 0 after RAP in the RAPPER IDs

def fix_ad_ids(rapperids):
    """Return the RAPPER ID in genotyping format, or NaN when the ID is missing.

    A placeholder '0' in the fourth position (directly after the 'RAP' prefix) is
    removed. IDs without the placeholder are returned cut to their first eight
    characters.

    Parameters
    ----------
    rapperids : str or missing value
        Raw RAPPER ID. Non-string input (None, float NaN), the empty string and any
        text starting with 'N' (such as the 'NAN' placeholder) count as missing.

    Returns
    -------
    str or float
        The reformatted ID, or ``np.nan`` for a missing ID.
    """
    # Real missing values and empty strings cannot be indexed; report them as missing
    # instead of raising TypeError / IndexError.
    if not isinstance(rapperids, str) or not rapperids:
        return np.nan

    if rapperids[0] == 'N':
        return np.nan

    # Slices never raise, so IDs shorter than eight characters are handled as well.
    if rapperids[3:4] == '0':
        return rapperids[:3] + rapperids[4:8]

    return rapperids[:8]

# Derive PATIENT_ID by passing every raw RAPPER number through fix_ad_ids
final_AD_residuals['PATIENT_ID'] = final_AD_residuals['RAPPER      No. '].map(fix_ad_ids)

In [None]:
final_AD_residuals

In [None]:
# Keep the genotyping ID, the four endpoints and the trial identifier (in this order),
# then tag every row with the centre code used for this trial
ad_columns = ['PATIENT_ID', 'STAT_SCORE', 'DYS_AUC', 'XERO_AUC', 'MUCO_AUC', 'Trial Identifier']
final_AD_residuals = final_AD_residuals.loc[:, ad_columns].copy()
final_AD_residuals['center'] = 2
final_AD_residuals

## NIMRAD Cohort 
- for each patient in the NIMRAD cohort: 
    - adjust the trial identifier into RAPPER ID used in genotyping
    - add variable to identify this cohort 


In [None]:
# Create function to edit the NIMRAD Trial ID to match the IDs in the covariables spreadsheet 

def edit_trial_id(trial_id):
    """Convert a NIMRAD trial ID into the integer ID used in the covariables table.

    The character at position 2 is always skipped. When the ID starts with '0',
    that leading character is skipped too. The remaining characters (up to
    position 5) are joined and converted with ``int``.
    """
    chars = list(trial_id)

    # Positions to keep: drop the leading '0' when there is one
    keep = (1, 3, 4, 5) if chars[0] == '0' else (0, 1, 3, 4, 5)

    return int(''.join(chars[pos] for pos in keep))

# Store the numeric form of every NIMRAD trial ID in a new 'new_id' column
nim_ids_df['new_id'] = nim_ids_df['Trial ID'].map(edit_trial_id)
nim_ids_df

In [None]:
nim_covariables_df

In [None]:
# Discard the leftover 'Unnamed: 0' column and relabel PATIENT_ID as 'new_id'
nim_covariables_df.drop(columns=['Unnamed: 0'], inplace=True)
nim_covariables_df.rename(columns={'PATIENT_ID': 'new_id'}, inplace=True)

In [None]:
# Combine the NIMRAD residuals with the ID table (outer join on 'new_id')
final_NIM_residuals = nim_covariables_df.merge(nim_ids_df, how='outer', on=['new_id'])

In [None]:
final_NIM_residuals

In [None]:
# Add the centre covariable for this trial and keep only the columns needed downstream.
# .copy() makes the selection an independent dataframe (as is done for the ARTDECO frame), so the
# in-place rename that follows acts on this frame itself and not on a slice of the merged one
# (avoids pandas' SettingWithCopyWarning).
final_NIM_residuals['center'] = 3
final_NIM_residuals = final_NIM_residuals[['Sample ID','STAT_SCORE','DYS_AUC',
                                           'XERO_AUC','MUCO_AUC','new_id','center']].copy()
final_NIM_residuals

In [None]:
# Give both cohorts the same column names before they are concatenated.
# NOTE(review): the ARTDECO frame was already reduced to a column list that does not contain
# 'RAPPER      No. ' (it has a PATIENT_ID column instead), so when the cells run in order this
# first rename finds nothing to rename.
final_AD_residuals.rename({'RAPPER      No. ': 'PATIENT_ID'}, axis=1, inplace=True)
# NIMRAD: 'Sample ID' becomes PATIENT_ID and the numeric 'new_id' becomes 'Trial Identifier'
final_NIM_residuals.rename({'Sample ID': 'PATIENT_ID', 'new_id':'Trial Identifier'}, axis=1, inplace=True)

In [None]:
final_AD_residuals

In [None]:
# Keep only the first row for each 'Trial Identifier'
final_AD_residuals = final_AD_residuals.drop_duplicates(subset='Trial Identifier', keep='first')
final_AD_residuals

In [None]:
final_NIM_residuals

In [None]:
# Stack the ARTDECO rows on top of the NIMRAD rows; the original row labels are kept,
# so index values may repeat
NIM_AD_df = pd.concat([final_AD_residuals, final_NIM_residuals], axis=0)

In [None]:
# Save the combined table for manual checking (relative path: written to the current working directory)
NIM_AD_df.to_csv('checkNIMandAD.csv')

In [None]:
# Preview only: rows without any missing value. The result is not assigned, so NIM_AD_df is unchanged.
NIM_AD_df.dropna()

## VoxTox Patients with RAPPER IDs
- for each patient in the VoxTox cohort: 
    - convert trial identifier into RAPPER ID used in genotyping, and reformat by removing placeholder 0 
    - add variable to identify this cohort 

In [None]:
# Drop the leftover 'Unnamed: 0' column and keep the ID and endpoint columns in the order used
# for the other cohorts.
# .copy() makes the selection an independent dataframe: the following cells rename its columns
# in place and assign new columns to it, which on a bare selection triggers pandas'
# SettingWithCopyWarning.
voxtox_residuals.drop(['Unnamed: 0'], axis=1, inplace=True)
voxtox_residuals = voxtox_residuals[['RAPPER_ID', 'STAT_SCORE', 'DYS_AUC', 'XERO_AUC',
                                    'MUCO_AUC', 'PATIENT_ID']].copy()

In [None]:
# Use the same identifier names as the NIMRAD and ARTDECO frames: the trial's own PATIENT_ID
# becomes 'Trial Identifier', and RAPPER_ID takes over the PATIENT_ID name
voxtox_id_names = {'PATIENT_ID': 'Trial Identifier', 'RAPPER_ID': 'PATIENT_ID'}
voxtox_residuals.rename(columns=voxtox_id_names, inplace=True)
voxtox_residuals

In [None]:
# Convert the voxtox rapper ID to the correct format by removing the placeholder 0

def convert_voxtox_rap(patient_id):
    """Return the VoxTox RAPPER ID without the placeholder '0' in the fourth position.

    Parameters
    ----------
    patient_id : str or missing value
        Raw RAPPER ID. Non-string input (for example a float NaN) is returned
        unchanged.

    Returns
    -------
    str or the unchanged input
        With a placeholder: the first three characters followed by characters 5-8.
        Without a placeholder: the first eight characters of the ID.
    """
    # A missing ID cannot be indexed; hand it back as is instead of raising.
    if not isinstance(patient_id, str):
        return patient_id

    if patient_id[3:4] == '0':
        return patient_id[:3] + patient_id[4:8]

    # IDs without the placeholder used to leave the result undefined (UnboundLocalError);
    # return them in the same eight-character form that fix_ad_ids produces.
    return patient_id[:8]

# Store the reformatted RAPPER ID of every VoxTox patient in a temporary 'new_id' column
voxtox_residuals['new_id'] = voxtox_residuals['PATIENT_ID'].map(convert_voxtox_rap)

In [None]:
# Overwrite PATIENT_ID with the reformatted IDs held in 'new_id'
voxtox_residuals['PATIENT_ID'] = voxtox_residuals['new_id']

In [None]:
# The temporary 'new_id' column is no longer needed
voxtox_residuals.drop(columns=['new_id'], inplace=True)

In [None]:
# Add the center variable for VoxTox patients.
# Note: written as the float 1.0, whereas the other two cohorts use the integers 2 and 3.
voxtox_residuals['center'] = 1.0
voxtox_residuals

In [None]:
# Create the final dataframe by stacking the VoxTox rows on top of the combined NIMRAD/ARTDECO
# rows. This is a row-wise concatenation (axis=0); the original row labels are kept.
all_patients_residuals_df = pd.concat([voxtox_residuals, NIM_AD_df], axis=0)
all_patients_residuals_df

In [None]:
# Give the ID column of both genotyping files the common name PATIENT_ID
genotyping_orders.rename(columns={'RAP3300': 'PATIENT_ID'}, inplace=True)
genotypingorders2.rename(columns={'SampleName': 'PATIENT_ID'}, inplace=True)

# Retain only the patients whose ID is listed in the genotyping file
is_genotyped = all_patients_residuals_df['PATIENT_ID'].isin(genotypingorders2['PATIENT_ID'])
all_patients_residuals_df = all_patients_residuals_df[is_genotyped]
all_patients_residuals_df

In [None]:
# Attach the endpoint data to the genotyping patient list.
# A left join keeps the rows in the order of genotypingorders2. An outer join is documented to
# sort the join keys lexicographically, which would break the alignment with the genotype file.
# The set of rows is the same either way, because all_patients_residuals_df has already been
# restricted to IDs that occur in genotypingorders2.
ordered_df = pd.merge(genotypingorders2, all_patients_residuals_df, how='left', on=['PATIENT_ID'])
ordered_df

In [None]:
# Remove the leftover 'Unnamed: 1' column
ordered_df.drop(columns=['Unnamed: 1'], inplace=True)

In [None]:
ordered_df

In [None]:
#ordered_df.to_csv('patientsingeneticsorder.csv')

## Add the Principal Components 
- The principal components adjust for population substructure and are included in the final analysis 
- Add the PCs for each patient identifier for all three trials 
- Also add the CEU column from genotyping 

In [None]:
# Genotyping ID table
onc_ids_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/GeneticsFiles/GenotypingIDs.csv')
# Ethnicity table (provides the 'Onc_ID' and 'CEU' columns used below)
eth_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/GeneticsFiles/ethnicities.csv')
# Principal components per sample (provides 'Sample_id', 'SampleName' and the pc columns)
pc_df = pd.read_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/GeneticsFiles/rapper_nimrad_euro_pcs_withIDs.csv')

In [None]:
# Name the ethnicity key 'Sample_id' and keep just that key together with the CEU column
eth_df.rename(columns={'Onc_ID': 'Sample_id'}, inplace=True)
eur_eth_df = eth_df.loc[:, ['Sample_id', 'CEU']].copy()
eur_eth_df

In [None]:
# Cast the merge key to str -- presumably so that its dtype matches Sample_id in the
# ethnicity table; TODO confirm
pc_df['Sample_id'] = pc_df['Sample_id'].astype(str)
pc_df

In [None]:
# Add the CEU column to the principal components (outer join on 'Sample_id')
pc_df = pc_df.merge(eur_eth_df, how='outer', on=['Sample_id'])
pc_df

In [None]:
# Keep only the PC rows whose SampleName occurs among the PATIENT_IDs of ordered_df;
# .copy() detaches the result from the merged frame
pc_df = pc_df[pc_df['SampleName'].isin(ordered_df['PATIENT_ID'])].copy()

In [None]:
pc_df

In [None]:
pc_df.columns

In [None]:
# Remove the sample metadata and the components beyond pc10
unused_pc_columns = ['Sample_id', 'Subject_id', 'Missing', 'Gender', 'Consortium', 'SampleWell',
                     'SamplePlate', 'pc11', 'pc12', 'pc13', 'pc14', 'pc15']
pc_df.drop(columns=unused_pc_columns, inplace=True)

In [None]:
# Use PATIENT_ID as the key name so the PCs can be joined to the ordered patient table
pc_df.rename(columns={'SampleName': 'PATIENT_ID'}, inplace=True)

In [None]:
pc_df

In [None]:
# Add the PC and CEU columns to the ordered patient table.
# A left join preserves the row order of ordered_df; an outer join is documented to sort the
# join keys lexicographically, which would lose the genotyping-file order. No rows are lost,
# because pc_df has already been filtered to IDs that occur in ordered_df.
inc_pc_df = pd.merge(ordered_df, pc_df, how='left', on=['PATIENT_ID'])

In [None]:
# Turn the 'NAN' placeholder strings back into real missing values
inc_pc_df = inc_pc_df.replace('NAN', np.nan)

In [None]:
inc_pc_df

## Check Ethnicity >80% European for this H&N GWAS 
- for this GWAS study only patients with >80% European ancestry are included, to limit population substructure 
- use the CEU variable to exclude any patients with CEU < 0.8 
- blank out the values in these rows (set them to missing) rather than deleting the rows 

In [None]:
# Move PATIENT_ID into the index so that it is not overwritten by the row-wise assignment below
inc_pc_df.set_index('PATIENT_ID', inplace=True, drop=True)
# Set every column to NaN for patients with CEU < 0.8; the rows themselves stay in place.
# Rows with a missing CEU are not affected, because the comparison NaN < 0.8 is False.
inc_pc_df.loc[inc_pc_df['CEU'] < 0.8] = np.nan

In [None]:
inc_pc_df

In [None]:
# Turn the PATIENT_ID index back into a regular column
inc_pc_df.reset_index(inplace=True)

In [None]:
inc_pc_df

In [None]:
# Preview only: rows without any missing value. The result is not assigned, so inc_pc_df is unchanged.
inc_pc_df.dropna()

## Create Final Sample File 
- Remove Identifiers (for github) 
- include all four endpoints, PCs, center variable 

In [None]:
# Write missing values as the literal string "NA"
inc_pc_df.fillna("NA", inplace=True)
# Build the identifier columns of the sample file: PATIENT_ID becomes ID_1, ID_2 repeats ID_1
inc_pc_df.rename({'PATIENT_ID':'ID_1'}, axis=1, inplace=True)
inc_pc_df['ID_2'] = inc_pc_df['ID_1']
# 'missing' column, set to 0 for every row
inc_pc_df['missing'] = 0

In [None]:
# Write a de-identified copy for sharing.
# The result of drop() is kept in its own variable so that inc_pc_df retains ID_1 / ID_2:
# the SNPTEST sample files built afterwards still select those columns from inc_pc_df.
example_sample_df = inc_pc_df.drop(['ID_1', 'ID_2', 'Trial Identifier'], axis=1)
example_sample_df.to_csv('/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/GeneticsFiles/FinalSampleFiles/ExampleEndpointSampleFile.csv')
example_sample_df

## For Final Sample File in SNPTEST
- include an initial row that tells you the variable type
- include the patient identifier to align with the patient identifiers in the genotyping .txt file 
- split into four independent dfs and save to csv files for each endpoints (STAT, dysphagia, mucositis, xerostomia)

In [None]:
#Insert row at the top of the df (not column header) to indicate variable type 

def Insert_row_(row_number, df, row_value):
    """Return a copy of *df* with *row_value* inserted at position *row_number*.

    Parameters
    ----------
    row_number : int
        Zero-based position of the new row (0 puts it first).
    df : pandas.DataFrame
        Source dataframe; it is not modified.
    row_value : list
        One value per column of *df*, in column order.

    Returns
    -------
    pandas.DataFrame
        New dataframe containing the extra row, with the index renumbered from 0.
    """
    # Build the new row as its own one-row frame. Writing it into the upper slice with
    # ``.loc[row_number] = ...`` overwrites an existing row whenever the index already
    # holds that label (any non-default index) and triggers SettingWithCopyWarning.
    new_row = pd.DataFrame([row_value], columns=df.columns)

    # Positional split: rows before the insertion point, and the rest
    upper = df.iloc[:row_number]
    lower = df.iloc[row_number:]

    # ignore_index renumbers the rows 0..n-1 in the result
    return pd.concat([upper, new_row, lower], ignore_index=True)


# Position of the new row and the SNPTEST variable-type code for each column
row_number = 0
row_value = ['0', 'P', 'P', 'P', 'P', 'D', 'D','D', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', '0', '0']

# First position past the end of the dataframe; anything beyond it is rejected
first_invalid_position = inc_pc_df.index.max()+1

if row_number > first_invalid_position:
    print("Invalid row_number")
else:
    # Place the variable-type row at the requested position (the top of the dataframe)
    inc_pc_df = Insert_row_(row_number, inc_pc_df, row_value)

# Show the updated dataframe
inc_pc_df

In [None]:
# Columns shared by all four sample files: the identifier columns come first, then the
# endpoint, then the centre and the first ten principal components
id_columns = ['ID_1', 'ID_2', 'missing']
covariate_columns = ['center', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5',
                     'pc6', 'pc7', 'pc8', 'pc9', 'pc10']

# One dataframe per endpoint
statscore_sample_df = inc_pc_df[id_columns + ['STAT_SCORE'] + covariate_columns]
dys_endpoint_sample_df = inc_pc_df[id_columns + ['DYS_AUC'] + covariate_columns]
xero_endpoint_sample_df = inc_pc_df[id_columns + ['XERO_AUC'] + covariate_columns]
muco_endpoint_sample_df = inc_pc_df[id_columns + ['MUCO_AUC'] + covariate_columns]

In [None]:
# Write the STAT endpoint sample file: space-separated, without the dataframe index
statscore_sample_df.to_csv("/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/Final/SampleFiles/endpointstatacutephenotypes.sample", sep=' ', index=False)

In [None]:
# Write the dysphagia, xerostomia and mucositis sample files in the same format
# (space-separated, without the dataframe index)
dys_endpoint_sample_df.to_csv("/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/Final/SampleFiles/endpointdysphagiaphenotypes.sample", sep=' ', index=False)
xero_endpoint_sample_df.to_csv("/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/Final/SampleFiles/endpointxerostomiaphenotypes.sample", sep=' ', index=False)
muco_endpoint_sample_df.to_csv("/Users/ceilidhwelsh/Documents/CambridgePhD/GWAS/Final/SampleFiles/endpointmucositisphenotypes.sample", sep=' ', index=False)