# Useful Pandas DF Toolkit

### Splits a string Series(col name of df) into substrings.
Series.str.split(pat=None, n=-1, expand=False)

pat = delimiter (string or regex) to split on; the default (None) splits on whitespace.

n = # of splits; None, 0 or -1 will split everything.

expand = True (optional - returns a df with a separate col per split, new col names will be int not string!).

https://www.geeksforgeeks.org/python-pandas-split-strings-into-two-list-columns-using-str-split/

### Renames a column or row

df.rename(columns={'Old Col 1': 'New Col 1', 'Old Col 2' : 'New Col 2'}, inplace= True/False) 

inplace = modify the df directly (True) / do not modify original df (False)

### Insert a column

df.insert(loc, column, value)

loc = column location

column = label of the inserted column

value = Scalar, Series, or Array-like (column from different df)

# Library Loading

In [1]:
import pandas as pd
import numpy as np

# Raw Data -> Edit Data

Protein Designations from the MCLP were used as the naming nomenclature. 
All datasets were manually changed in Excel to match the MCLP nomenclature: uniformity.xlsx.
Note: MCLP dataset from Raw -> Edit was Unchanged but was renamed for consistency purposes.

# Edit Data -> Pandas Data Frame

### CCLE Data (Test Set - 1)

In [2]:
# Load the edited CCLE table. A forward slash is used as the path separator so
# the path resolves on every OS and does not rely on '\E' (an invalid string
# escape sequence) being passed through literally.
eCCLE = pd.read_csv('Edit Data/Edit_CCLE.csv')
print(eCCLE.columns)
print('Shape =', eCCLE.shape)

# Splitting and Renaming Columns
# "Unnamed: 0" holds identifiers of the form "<Sample>_<Cancer Type>".
# n=1 splits on the first underscore only, so multi-word cancer types such as
# LARGE_INTESTINE stay intact in the second piece.
Split = eCCLE["Unnamed: 0"].str.split("_", n=1, expand=True)
eCCLE.insert(1, "Sample", Split[0])
eCCLE.insert(2, "Cancer Type", Split[1])

# Drop the combined identifier; "Sample" and "Cancer Type" become columns 0 and 1.
eCCLE = eCCLE.drop(columns="Unnamed: 0")
eCCLE.head()
#eCCLE.to_csv("CCLE.csv")

Index(['Unnamed: 0', '1433BETA', '1433EPSILON', '1433ZETA', '4EBP1',
       '4EBP1_pS65', '4EBP1_pT37T46', '4EBP1_pT70', '53BP1', 'ARAF_pS299',
       ...
       'TUBERIN_pT1462', 'VAV1', 'VEGFR2', 'VHL', 'XBP1', 'XRCC1', 'YAP',
       'YAP_pS127', 'YB1', 'YB1_pS102'],
      dtype='object', length=215)
Shape = (899, 215)


Unnamed: 0,Sample,Cancer Type,1433BETA,1433EPSILON,1433ZETA,4EBP1,4EBP1_pS65,4EBP1_pT37T46,4EBP1_pT70,53BP1,...,TUBERIN_pT1462,VAV1,VEGFR2,VHL,XBP1,XRCC1,YAP,YAP_pS127,YB1,YB1_pS102
0,DMS53,LUNG,-0.104888,0.060414,0.309068,-0.075506,0.230359,0.198304,-0.030541,0.455889,...,-0.099433,-0.486715,-1.147858,0.133876,-0.075812,-0.144388,-1.090303,-2.109324,0.178104,0.246541
1,SW1116,LARGE_INTESTINE,0.358504,-0.180291,-0.041237,-0.286629,-0.877406,-1.026948,-0.462761,-0.011197,...,-0.109777,0.34933,0.770148,0.984297,-0.168138,-0.004905,0.189294,-0.283593,0.255972,-0.121134
2,NCIH1694,LUNG,0.028738,0.071902,-0.094847,0.285069,1.321551,0.620703,-0.439484,0.195007,...,0.154344,-0.478189,-1.18553,1.273013,-0.240413,0.476633,-1.367465,-2.525695,-0.13788,-0.451282
3,P3HR1,HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.120039,-0.066802,-0.128007,-0.552081,-0.292428,-1.415935,-0.138858,-0.066122,...,0.040106,5.92383,-3.893832,-2.499188,0.632758,0.025639,-1.18918,-3.056863,0.025997,-0.465205
4,HUT78,HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,-0.268997,-0.060281,-0.137881,-0.398729,-0.095622,-0.533905,0.054245,-0.573022,...,-0.466919,5.47588,-0.561973,-0.500953,-0.261494,0.358679,-0.951686,-3.247388,-0.151424,-0.145426


### TCGA Data (Test Set - 2)

In [4]:
# Load the edited TCGA pan-cancer table. A forward slash is used as the path
# separator so the path resolves on every OS and does not rely on '\E' (an
# invalid string escape sequence) being passed through literally.
eTCGA = pd.read_csv('Edit Data/Edit_TCGA-PANCAN32.csv')

# Rename Sample_ID -> Sample and Cancer_Type -> Cancer Type so the identifier
# columns carry the same labels as in the CCLE frame.
eTCGA = eTCGA.rename(columns={'Sample_ID': 'Sample', 'Cancer_Type': 'Cancer Type'})
print(eTCGA.columns)
print('Shape =', eTCGA.shape)
##eTCGA.to_csv("TCGA.csv")

Index(['Sample', 'Cancer Type', 'Sample_Type', '1433EPSILON', '4EBP1',
       '4EBP1_pS65', '4EBP1_pT37T46', '53BP1', 'ACC_pS79', 'ACC1',
       ...
       'PARPAB3', 'THYMIDILATESYNTHASE', 'TTF1', 'CHROMOGRANINANTERM', 'CK5',
       'NAPSINA', 'P63', 'RET_pY905', 'SYNAPTOPHYSIN', 'ALPHACATENIN'],
      dtype='object', length=261)
Shape = (7694, 261)


### MCLP Data (Training / Validation Set)

In [None]:
# No changes in protein names were made going from the raw to the edited MCLP data;
# MCLP is used as the naming template.
# The edited file is only a copy, stored alongside the new CCLE and TCGA datasets.
# Forward slashes keep the paths portable and avoid the invalid '\E' / '\M'
# string escape sequences.
eMCLP = pd.read_csv('Edit Data/Edit_MCLP.csv')
rMCLP = pd.read_csv('Raw Data/MCLP-v1.1-Level4.csv')

# Index.equals gives one True/False answer and, unlike the element-wise `==`,
# does not raise when the two files have a different number of columns.
eMCLP.columns.equals(rMCLP.columns)

The MCLP dataset was modified:
1) Renamed the sample_name column -> Sample
2) Added a Cancer Type column and filled it with the CCLE classification
    a) Ran the code below to fill in the MCLP cell lines that match CCLE cell lines
    b) Manually filled in the rest using Cellosaurus afterwards


In [None]:
# Designating Cancer Types based on CCLE Classifications
# Each MCLP sample name (column 0) is looked up among the CCLE sample names (column 0).
# On a match, the MCLP cancer type (column 1) is replaced with the CCLE cancer type
# (column 1); samples without a match are labelled "Undefined".
# NOTE(review): assumes eMCLP already has Sample at position 0 and Cancer Type at
# position 1, as eCCLE does after its split cell -- confirm against Edit_MCLP.csv.

# Sample -> cancer type lookup built from the first two CCLE columns.
ccle_pairs = eCCLE.iloc[:, [0, 1]].copy()
ccle_pairs.columns = ["sample", "cancer_type"]
ccle_lookup = (
    ccle_pairs
    .dropna(subset=["sample"])                      # a missing name never equals anything
    .drop_duplicates(subset="sample", keep="first")  # first CCLE occurrence wins
    .set_index("sample")["cancer_type"]
)

mclp_samples = eMCLP.iloc[:, 0]
has_match = mclp_samples.isin(ccle_lookup.index)

# Matched rows take the CCLE value as-is; everything else becomes "Undefined".
eMCLP[eMCLP.columns[1]] = (
    mclp_samples.map(ccle_lookup).astype(object).where(has_match, "Undefined")
)

#converted back to csv using pd.to_csv and manually input the rest of the undefined tumor indications

In [5]:
# Load the finalized MCLP table, then report its shape and the number of
# samples per cancer type.
MCLP = pd.read_csv('MCLP.csv')
print(MCLP.shape, MCLP['Cancer Type'].value_counts(), sep='\n')

(651, 454)
Cancer Type
LUNG                                  126
HAEMATOPOIETIC_AND_LYMPHOID_TISSUE     96
BREAST                                 58
OVARY                                  50
UPPER_AERODIGESTIVE_TRACT              47
SKIN                                   44
LARGE_INTESTINE                        36
ENDOMETRIUM                            29
KIDNEY                                 27
BONE                                   27
Undefined                              24
PANCREAS                               22
SOFT_TISSUE                            18
LIVER                                  17
URINARY_TRACT                          11
STOMACH                                10
CENTRAL_NERVOUS_SYSTEM                  5
PROSTATE                                2
OESOPHAGUS                              2
Name: count, dtype: int64


# Unifying Datasets to CCLE
The CCLE dataset will be used to unify the data, as it has the fewest protein expression measurements in the RPPA experiment.