# Data Processing Pipeline
## Processes COSMIC data into a form usable for analysis

In [1]:
import os

import pandas as pd

In [34]:
def getFreq(fileName, fill):
    """Count mutations per genome position and write the counts to a CSV.

    Reads the CSV at ``fileName``, drops rows that have no value in the
    ' Mutation genome position' column, and reduces each remaining value to
    the text between the first ':' and the following '-' (presumably the
    start coordinate of a ``chrom:start-end`` string -- confirm against the
    COSMIC export format). Each distinct position is then counted.

    The result is written as a Series indexed by 'Position' to
    ``'cleanedData/' + <part of fileName before the first '_'>`` followed by
    ``'_filled.csv'`` when ``fill`` is true and ``'.csv'`` otherwise. The
    output directory is created if it does not exist yet.

    Args:
        fileName: Path of the input CSV; the text before its first '_' is
            reused as the output file stem.
        fill: When true, add a (position, 0) entry for every integer
            position lying strictly between two observed positions.

    Returns:
        None. The only result is the CSV written to disk.
    """
    df = pd.read_csv(fileName)
    df = df.dropna(subset=[' Mutation genome position'])  # Remove any rows w/o position data
    positions = df[' Mutation genome position'].apply(
        lambda x: x.split(':')[1].split('-')[0])

    # Count occurrences; keys stay in order of first appearance.
    freqDic = {}
    for pos in positions.tolist():
        freqDic[pos] = freqDic.get(pos, 0) + 1

    if fill:
        # The keys are strings, so they must be ordered by numeric value:
        # a plain sort is lexicographic ('101' < '98') and would leave gaps
        # unfilled whenever positions differ in digit count.
        sortedPos = sorted(int(pos) for pos in freqDic)

        # Pairing each position with its successor also copes with zero or
        # one observed position (nothing to fill, nothing to index).
        for lastPos, currPos in zip(sortedPos, sortedPos[1:]):
            for i in range(lastPos + 1, currPos):
                freqDic[str(i)] = 0

    # Output CSV
    s = pd.Series(freqDic)
    s.index.name = 'Position'

    suffix = '_filled.csv' if fill else '.csv'
    outPath = 'cleanedData/' + fileName.split('_')[0] + suffix

    # to_csv does not create missing directories, so make sure it exists.
    outDir = os.path.dirname(outPath)
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

    s.to_csv(outPath)

In [36]:
# Produce the (position, frequency) CSV files for every gene:
# first the plain counts for all inputs, then the zero-filled variants.
for fillGaps in (False, True):
    for dataFile in ('ARID1A_data.csv', 'ARID1B_data.csv', 'ARID2_data.csv'):
        getFreq(dataFile, fillGaps)