# Data Parsing

Takes the project data set 'dataset0.json' and formats it for model usage.

## Library

In [1]:
import json
import pandas as pd
import os

## Import

In [2]:
cwd = os.getcwd() # Gets current working directory; ensure that .json file is in the same folder as this script.
path = os.path.join(cwd, 'dataset0.json')

In [3]:
with open(path) as r:
    data = r.read()
    split_data = data.split("\n") # .json file contains multiple JSON objects. Separate by newline.

## Parsing

`split_data`: List of JSON String Objects

JSON Object structure is as follows:
{ Transcript ID : { Middle Position :  { Combined_Nucleotide : [ Read 1, Read 2, ..., Read n ] } } }

Each read consists of 3 sets of: 
- Length of direct RNA-Seq signal of 5-mer nucleotides (dwelling time)
- s.d. of direct RNA-Seq signal
- mean of signal

Objective data structure should be as follows (for each JSON object):
Transcript ID (str) | Nucleotide (str) | Position (str) | Read (list of int)
Each JSON object should give 3 positions; each read is divided into 3 sets of 3 (total 9 values).




In [4]:
rows = []

count = 0
total = len(split_data)

for dp in split_data:
    if dp.strip() == '':
        continue

    obj = json.loads(dp) # Converts into a JSON (dictionary) object. 
    
    transcript_id = next(iter(obj))
    
    position2 = int(next(iter(obj[transcript_id])))
    position1 = position2 - 1
    position3 = position2 + 1

    cmbd_nucleotide = next(iter(obj[transcript_id][str(position2)]))

    nucleotide1 = cmbd_nucleotide[0:5]
    nucleotide2 = cmbd_nucleotide[1:6]
    nucleotide3 = cmbd_nucleotide[2:7]

    for read in obj[transcript_id][str(position2)][cmbd_nucleotide]:
        pos1_read = read[0:3]
        pos2_read = read[3:6]
        pos3_read = read[6:9]

        entry1 = [transcript_id, nucleotide1, position1, pos1_read]
        entry2 = [transcript_id, nucleotide2, position2, pos2_read]
        entry3 = [transcript_id, nucleotide3, position3, pos3_read]

        rows.extend([entry1, entry2, entry3])

    count += 1
    print(f"Object {count} completed. {total - count} left.")

print("Completed!")

Object 1 completed. 121838 left.
Object 2 completed. 121837 left.
Object 3 completed. 121836 left.
Object 4 completed. 121835 left.
Object 5 completed. 121834 left.
Object 6 completed. 121833 left.
Object 7 completed. 121832 left.
Object 8 completed. 121831 left.
Object 9 completed. 121830 left.
Object 10 completed. 121829 left.
Object 11 completed. 121828 left.
Object 12 completed. 121827 left.
Object 13 completed. 121826 left.
Object 14 completed. 121825 left.
Object 15 completed. 121824 left.
Object 16 completed. 121823 left.
Object 17 completed. 121822 left.
Object 18 completed. 121821 left.
Object 19 completed. 121820 left.
Object 20 completed. 121819 left.
Object 21 completed. 121818 left.
Object 22 completed. 121817 left.
Object 23 completed. 121816 left.
Object 24 completed. 121815 left.
Object 25 completed. 121814 left.
Object 26 completed. 121813 left.
Object 27 completed. 121812 left.
Object 28 completed. 121811 left.
Object 29 completed. 121810 left.
Object 30 completed. 12

In [5]:
# Converts to Pandas Dataframe
data_df = pd.DataFrame(rows, columns=['Transcript ID', 'Nucleotide', 'Position', 'Read'])
print(data_df)

            Transcript ID Nucleotide  Position                    Read
0         ENST00000000233      AAGAC       243  [0.00299, 2.06, 125.0]
1         ENST00000000233      AGACC       244   [0.0177, 10.4, 122.0]
2         ENST00000000233      GACCA       245    [0.0093, 10.9, 84.1]
3         ENST00000000233      AAGAC       243  [0.00631, 2.53, 125.0]
4         ENST00000000233      AGACC       244  [0.00844, 4.67, 126.0]
...                   ...        ...       ...                     ...
33081313  ENST00000641834      TGACA      1693  [0.00913, 10.4, 108.0]
33081314  ENST00000641834      GACAT      1694   [0.00664, 4.44, 76.8]
33081315  ENST00000641834      TTGAC      1692  [0.00564, 3.13, 110.0]
33081316  ENST00000641834      TGACA      1693  [0.00303, 9.98, 118.0]
33081317  ENST00000641834      GACAT      1694    [0.0193, 1.79, 76.2]

[33081318 rows x 4 columns]


## Aggregating Features

In [8]:
# Split Read into Dwelling Time, SD, Mean
aggregated_df = data_df

aggregated_df[['Dwelling Time', 'SD', 'Mean']] = pd.DataFrame(aggregated_df['Read'].tolist(), index=aggregated_df.index)

In [9]:
# Create new DataFrame. Features are aggregated by average.
aggregated_df = aggregated_df.groupby(['Transcript ID', 'Position']).agg({
    'Dwelling Time': 'mean',
    'SD': 'mean',
    'Mean': 'mean'
}).reset_index()

In [10]:
aggregated_df

Unnamed: 0,Transcript ID,Position,Dwelling Time,SD,Mean
0,ENST00000000233,243,0.008264,4.223784,123.702703
1,ENST00000000233,244,0.009373,7.382162,125.913514
2,ENST00000000233,245,0.007345,4.386989,80.570270
3,ENST00000000233,260,0.006609,3.216424,109.681395
4,ENST00000000233,261,0.006813,3.226535,107.889535
...,...,...,...,...,...
365509,ENST00000641834,1537,0.007419,6.552982,123.263158
365510,ENST00000641834,1538,0.006472,2.540877,82.289474
365511,ENST00000641834,1692,0.008788,4.090577,105.807692
365512,ENST00000641834,1693,0.006908,8.702885,113.134615


## Export 

In [11]:
# To create a pkl file, for use in other Python as pd.DataFrame.
output_path = os.path.join(cwd, 'parsed_data.pkl')
aggregated_df.to_pickle(output_path)

In [12]:
# Output as CSV file for visual analysis.
output_path = os.path.join(cwd, 'parsed_data.csv')
aggregated_df.to_csv(output_path)