# Data Parsing

Takes the project data set 'dataset0.json' and formats it for model usage.

## Library

In [1]:
import json
import pandas as pd
import os
from io import StringIO

## Import

In [2]:
# Input files are resolved against the current working directory, so both
# 'dataset0.json' and 'data.info.labelled' must be present in that directory.
cwd = os.getcwd()

path, labels_path = (
    os.path.join(cwd, file_name)
    for file_name in ('dataset0.json', 'data.info.labelled')
)


In [3]:
# The .json file holds several JSON objects, one per line, rather than a single
# document, so the raw text is cut on newlines: one JSON string per list element.
with open(path) as json_file:
    data = json_file.read()

split_data = data.split("\n")

split_data

['{"ENST00000000233":{"244":{"AAGACCA":[[0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1],[0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9],[0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6],[0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4],[0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2],[0.0103,3.83,123.0,0.00598,6.45,126.0,0.0153,1.09,74.8],[0.00398,3.75,126.0,0.00332,4.3,129.0,0.00299,1.93,81.9],[0.00498,3.93,127.0,0.00398,2.51,131.0,0.0111,3.47,79.4],[0.0139,4.69,106.0,0.0136,6.21,124.0,0.00531,10.6,85.5],[0.00631,3.5,126.0,0.0222,5.38,128.0,0.00332,1.72,79.3],[0.0061,3.99,121.0,0.0121,7.27,122.0,0.00232,1.27,78.9],[0.00299,1.99,128.0,0.00427,4.85,124.0,0.00332,3.18,80.5],[0.0186,3.62,124.0,0.00428,2.25,129.0,0.00554,2.78,80.1],[0.0093,3.12,125.0,0.00398,8.84,129.0,0.00361,1.86,82.0],[0.00365,2.92,126.0,0.00698,3.7,126.0,0.00467,3.23,80.2],[0.0123,6.68,126.0,0.00854,11.9,123.0,0.00232,1.37,78.4],[0.0123,5.04,106.0,0.0136,9.34,126.0,0.00399,3.28,79.7

In [4]:
# Read the labels file as raw text first, then let pandas parse that text as CSV.
with open(labels_path) as labels_file:
    labels = labels_file.read()

labels_buffer = StringIO(labels)
labels_df = pd.read_csv(labels_buffer)
labels_df

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0
3,ENSG00000004059,ENST00000000233,332,0
4,ENSG00000004059,ENST00000000233,368,0
...,...,...,...,...
121833,ENSG00000167747,ENST00000641834,1348,1
121834,ENSG00000167747,ENST00000641834,1429,0
121835,ENSG00000167747,ENST00000641834,1531,1
121836,ENSG00000167747,ENST00000641834,1537,0


## Parsing

`split_data`: List of JSON String Objects

JSON Object structure is as follows:
{ Transcript ID : { Middle Position :  { Combined_Nucleotide : [ Read 1, Read 2, ..., Read n ] } } }

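Each read consists of 3 sets of values, one set per overlapping 5-mer within the combined 7-mer nucleotide. Each set contains:
- Length of the direct RNA-Seq signal for the 5-mer (dwelling time)
- Standard deviation (s.d.) of the direct RNA-Seq signal
- Mean of the signal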

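Objective data structure should be as follows (one row per read, for each JSON object):
Transcript ID (str) | Position (int) | Nucleotide 1 (str) | N1 Length, N1 SD, N1 Mean (float) | Nucleotide 2 (str) | N2 Length, N2 SD, N2 Mean (float) | Nucleotide 3 (str) | N3 Length, N3 SD, N3 Mean (float)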




In [5]:
# Flatten every JSON line into one row per read.
#
# Each line is a nested mapping:
#   { transcript ID : { position : { 7-letter nucleotide string : [ read, ... ] } } }
# and each read is a flat list of 9 numbers: (length, SD, mean) for each of the
# three overlapping 5-mers contained in the 7-letter string.
#
# Resulting row layout (14 values):
#   transcript ID, position,
#   5-mer 1, length, SD, mean,
#   5-mer 2, length, SD, mean,
#   5-mer 3, length, SD, mean
rows = []

for dp in split_data:
    if not dp.strip():
        continue  # Skip blank lines (e.g. the one produced by a trailing newline).

    obj = json.loads(dp)  # Converts the JSON string into a nested dictionary.

    # Walk every key at every level instead of taking only the first one, so no
    # data is silently dropped if a line holds more than one transcript,
    # position or nucleotide string.
    for transcript_id, positions in obj.items():
        for position_key, kmers in positions.items():
            position = int(position_key)  # JSON object keys are always strings.

            for cmbd_nucleotide, reads in kmers.items():
                # The three overlapping 5-mers inside the 7-letter string.
                nucleotide1 = cmbd_nucleotide[0:5]
                nucleotide2 = cmbd_nucleotide[1:6]
                nucleotide3 = cmbd_nucleotide[2:7]

                for read in reads:
                    rows.append([
                        transcript_id, position,
                        nucleotide1, *read[0:3],
                        nucleotide2, *read[3:6],
                        nucleotide3, *read[6:9],
                    ])

print("Completed!")

Completed!


In [6]:
# Build the per-read DataFrame. Each of the three 5-mers contributes four
# columns: its sequence followed by the length, SD and mean of its signal.
per_nucleotide_columns = [
    column
    for n in (1, 2, 3)
    for column in (f'Nucleotide {n}', f'N{n} Length', f'N{n} SD', f'N{n} Mean')
]
data_df = pd.DataFrame(rows, columns=['Transcript ID', 'Position', *per_nucleotide_columns])
print(data_df)

            Transcript ID  Position Nucleotide 1  N1 Length  N1 SD  N1 Mean  \
0         ENST00000000233       244        AAGAC    0.00299   2.06    125.0   
1         ENST00000000233       244        AAGAC    0.00631   2.53    125.0   
2         ENST00000000233       244        AAGAC    0.00465   3.92    109.0   
3         ENST00000000233       244        AAGAC    0.00398   2.06    125.0   
4         ENST00000000233       244        AAGAC    0.00664   2.92    120.0   
...                   ...       ...          ...        ...    ...      ...   
11027101  ENST00000641834      1693        TTGAC    0.00418   7.49    108.0   
11027102  ENST00000641834      1693        TTGAC    0.00664   1.91    109.0   
11027103  ENST00000641834      1693        TTGAC    0.00721   4.58    105.0   
11027104  ENST00000641834      1693        TTGAC    0.00266   2.33    109.0   
11027105  ENST00000641834      1693        TTGAC    0.00564   3.13    110.0   

         Nucleotide 2  N2 Length  N2 SD  N2 Mean Nu

## Aggregating Features

In [14]:
# Split Read into Dwelling Time, SD, Mean
#
# NOTE(review): this step is presumably left over from an earlier layout in
# which every row carried a single 'Read' list column. data_df as built above
# has no 'Read' column (the per-5-mer Length/SD/Mean values are already separate
# columns), so the unguarded lookup raised KeyError on a fresh Restart & Run All.
# The split now runs only when a 'Read' column is actually present.
#
# aggregated_df is an alias of data_df, not a copy: when the split does run,
# the new columns are added to data_df as well.
aggregated_df = data_df

if 'Read' in aggregated_df.columns:
    aggregated_df[['Dwelling Time', 'SD', 'Mean']] = pd.DataFrame(
        aggregated_df['Read'].tolist(), index=aggregated_df.index
    )

In [8]:
# Create new DataFrame. Features are aggregated by average.
aggregated_df = data_df.groupby(['Transcript ID', 'Position', 'Nucleotide 1', 'Nucleotide 2', 'Nucleotide 3']).agg({
    'N1 Length': ['mean', 'min', 'max', 'median'],
    'N1 SD': ['mean', 'min', 'max', 'median'],
    'N1 Mean': ['mean', 'min', 'max', 'median'],
    'N2 Length': ['mean', 'min', 'max', 'median'],
    'N2 SD': ['mean', 'min', 'max', 'median'],
    'N2 Mean': ['mean', 'min', 'max', 'median'],
    'N3 Length': ['mean', 'min', 'max', 'median'],
    'N3 SD': ['mean', 'min', 'max', 'median'],
    'N3 Mean': ['mean', 'min', 'max', 'median']
}).reset_index()

In [9]:
# Show the aggregated frame: one row per group, with two-level (feature, statistic) columns.
aggregated_df

Unnamed: 0_level_0,Transcript ID,Position,Nucleotide 1,Nucleotide 2,Nucleotide 3,N1 Length,N1 Length,N1 Length,N1 Length,N1 SD,...,N3 Length,N3 Length,N3 SD,N3 SD,N3 SD,N3 SD,N3 Mean,N3 Mean,N3 Mean,N3 Mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,min,max,median,mean,...,max,median,mean,min,max,median,mean,min,max,median
0,ENST00000000233,244,AAGAC,AGACC,GACCA,0.008264,0.00199,0.0339,0.00697,4.223784,...,0.0329,0.005980,4.386989,0.773,15.50,3.440,80.570270,73.1,88.3,80.50
1,ENST00000000233,261,CAAAC,AAACT,AACTG,0.006609,0.00199,0.0222,0.00564,3.216424,...,0.0262,0.006790,3.016599,0.715,14.10,2.660,94.290698,88.6,103.0,94.10
2,ENST00000000233,316,GAAAC,AAACA,AACAG,0.007570,0.00232,0.0299,0.00631,2.940541,...,0.0266,0.006310,2.087146,0.630,6.85,1.910,89.364324,84.4,96.2,89.20
3,ENST00000000233,332,AGAAC,GAACA,AACAT,0.010620,0.00232,0.0370,0.00902,6.476350,...,0.0214,0.004980,2.236520,0.884,6.49,2.135,89.154000,81.4,95.7,89.90
4,ENST00000000233,368,AGGAC,GGACA,GACAA,0.010701,0.00199,0.0478,0.00896,6.415051,...,0.0485,0.008695,4.260253,1.040,8.81,4.160,85.178788,77.6,90.5,85.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGAC,GGACA,GACAT,0.009594,0.00232,0.0432,0.00817,3.294164,...,0.0242,0.004650,4.005616,1.030,12.50,3.640,82.004110,74.0,92.8,82.10
121834,ENST00000641834,1429,CTGAC,TGACA,GACAC,0.008393,0.00232,0.0262,0.00618,4.511014,...,0.0149,0.004820,3.644638,1.580,6.50,3.440,80.497101,75.0,90.6,80.60
121835,ENST00000641834,1531,TGGAC,GGACA,GACAC,0.008161,0.00232,0.0315,0.00697,3.918438,...,0.0260,0.005790,2.181563,1.040,7.19,1.925,84.190625,78.2,88.1,84.50
121836,ENST00000641834,1537,CTGAC,TGACC,GACCA,0.008044,0.00232,0.0483,0.00660,3.191228,...,0.0144,0.006310,2.540877,1.190,6.50,2.330,82.289474,77.6,87.4,82.00


## Append Labels

In [13]:
# Attach the gene ID and label to every aggregated site.
#
# The labels are joined on their (transcript, position) key rather than being
# pasted on side by side: a positional concat only lines up if both frames
# happen to have the same row order and length, and otherwise attaches the
# wrong labels without raising any error.
feature_columns = [
    'Transcript ID', 'Position',
    'N1', 'N2', 'N3',
    'N1 Avg Length', 'N1 Min Length', 'N1 Max Length', 'N1 Median Length',
    'N1 Avg SD', 'N1 Min SD', 'N1 Max SD', 'N1 Median SD',
    'N1 Avg Mean', 'N1 Min Mean', 'N1 Max Mean', 'N1 Median Mean',
    'N2 Avg Length', 'N2 Min Length', 'N2 Max Length', 'N2 Median Length',
    'N2 Avg SD', 'N2 Min SD', 'N2 Max SD', 'N2 Median SD',
    'N2 Avg Mean', 'N2 Min Mean', 'N2 Max Mean', 'N2 Median Mean',
    'N3 Avg Length', 'N3 Min Length', 'N3 Max Length', 'N3 Median Length',
    'N3 Avg SD', 'N3 Min SD', 'N3 Max SD', 'N3 Median SD',
    'N3 Avg Mean', 'N3 Min Mean', 'N3 Max Mean', 'N3 Median Mean',
]

# Flatten the two-level (feature, statistic) columns into single names. Work on
# a copy so aggregated_df keeps its original columns and this cell can be re-run.
features_df = aggregated_df.copy()
features_df.columns = feature_columns

# Rename the label columns so the join keys share names with the feature frame.
labels_keyed = labels_df.rename(columns={
    'gene_id': 'Gene ID',
    'transcript_id': 'Transcript ID',
    'transcript_position': 'Position',
    'label': 'Label',
})[['Gene ID', 'Transcript ID', 'Position', 'Label']]

# validate='one_to_one' raises if either side has a duplicated key, so a bad
# join cannot silently multiply or mislabel rows.
final_df = features_df.merge(
    labels_keyed,
    on=['Transcript ID', 'Position'],
    how='left',
    validate='one_to_one',
)

assert final_df['Label'].notna().all(), \
    "Some aggregated sites have no matching row in the labels file."

# Same column order as before: gene ID first, label last.
final_df = final_df[['Gene ID', *feature_columns, 'Label']]
final_df

Unnamed: 0,Gene ID,Transcript ID,Position,N1,N2,N3,N1 Avg Length,N1 Min Length,N1 Max Length,N1 Median Length,...,N3 Median Length,N3 Avg SD,N3 Min SD,N3 Max SD,N3 Median SD,N3 Avg Mean,N3 Min Mean,N3 Max Mean,N3 Median Mean,Label
0,ENSG00000004059,ENST00000000233,244,AAGAC,AGACC,GACCA,0.008264,0.00199,0.0339,0.00697,...,0.005980,4.386989,0.773,15.50,3.440,80.570270,73.1,88.3,80.50,0
1,ENSG00000004059,ENST00000000233,261,CAAAC,AAACT,AACTG,0.006609,0.00199,0.0222,0.00564,...,0.006790,3.016599,0.715,14.10,2.660,94.290698,88.6,103.0,94.10,0
2,ENSG00000004059,ENST00000000233,316,GAAAC,AAACA,AACAG,0.007570,0.00232,0.0299,0.00631,...,0.006310,2.087146,0.630,6.85,1.910,89.364324,84.4,96.2,89.20,0
3,ENSG00000004059,ENST00000000233,332,AGAAC,GAACA,AACAT,0.010620,0.00232,0.0370,0.00902,...,0.004980,2.236520,0.884,6.49,2.135,89.154000,81.4,95.7,89.90,0
4,ENSG00000004059,ENST00000000233,368,AGGAC,GGACA,GACAA,0.010701,0.00199,0.0478,0.00896,...,0.008695,4.260253,1.040,8.81,4.160,85.178788,77.6,90.5,85.40,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENSG00000167747,ENST00000641834,1348,GGGAC,GGACA,GACAT,0.009594,0.00232,0.0432,0.00817,...,0.004650,4.005616,1.030,12.50,3.640,82.004110,74.0,92.8,82.10,1
121834,ENSG00000167747,ENST00000641834,1429,CTGAC,TGACA,GACAC,0.008393,0.00232,0.0262,0.00618,...,0.004820,3.644638,1.580,6.50,3.440,80.497101,75.0,90.6,80.60,0
121835,ENSG00000167747,ENST00000641834,1531,TGGAC,GGACA,GACAC,0.008161,0.00232,0.0315,0.00697,...,0.005790,2.181563,1.040,7.19,1.925,84.190625,78.2,88.1,84.50,1
121836,ENSG00000167747,ENST00000641834,1537,CTGAC,TGACC,GACCA,0.008044,0.00232,0.0483,0.00660,...,0.006310,2.540877,1.190,6.50,2.330,82.289474,77.6,87.4,82.00,0


## Export 

In [14]:
# Save the final frame as a pickle so other Python code can load it back as a pd.DataFrame.
output_path = os.path.join(cwd, 'parsed_data.pkl')
pd.to_pickle(final_df, output_path)

In [15]:
# Also write a CSV copy (row index included) so the result can be inspected by eye.
output_path = os.path.join(cwd, 'parsed_data.csv')
final_df.to_csv(path_or_buf=output_path, index=True)