# Data Parsing
Takes the project data sets `dataset1.json` and `dataset2.json` and formats them for model usage.

In [5]:
import json
import pandas as pd
import os
from io import StringIO
import gzip
import json
import numpy as np
from decimal import Decimal, getcontext
# Parse the gzipped JSON-lines dataset into one row per
# (transcript, position, sequence) entry, summarising every per-read feature
# by its mean, median, min and max across reads.

# The input file must sit in the current working directory (normally the
# folder this notebook is run from).
cwd = os.getcwd()
path = os.path.join(cwd, 'dataset2.json.gz')

# Names of the nine per-read features, in the order they are indexed within
# each read. Defined once here because the list never changes between entries.
# NOTE(review): the N1/N2/N3 prefixes presumably correspond to the three
# 5-mers extracted below — confirm against the dataset description.
feature_names = [
    'N1 Length', 'N1 SD', 'N1 Mean',
    'N2 Length', 'N2 SD', 'N2 Mean',
    'N3 Length', 'N3 SD', 'N3 Mean'
]

# Column-suffix -> reduction applied along the read axis. The insertion order
# here fixes the per-feature column order: Avg, Median, Min, Max.
aggregations = {
    'Avg': np.mean,
    'Median': np.median,
    'Min': np.min,
    'Max': np.max,
}

# Prepare a list to collect rows (one dict per output row).
aggregated_data = []

# Read the gzipped JSON file line by line. JSON text is UTF-8, so decode
# explicitly instead of relying on the platform's default locale encoding.
with gzip.open(path, 'rt', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file), which
        # would otherwise make json.loads raise.
        if not line.strip():
            continue

        # Each line is a JSON object nested as
        # {transcript_id: {position: {sequence: [read, read, ...]}}}.
        data = json.loads(line)
        for transcript_id, positions in data.items():
            for pos, flanking_data in positions.items():
                for sequence, reads in flanking_data.items():
                    # Stack the reads into an array so each feature column can
                    # be reduced across reads (axis 0).
                    features_array = np.array(reads)

                    # Per-feature summary statistics across all reads.
                    stats = {
                        label: reduce(features_array, axis=0).tolist()
                        for label, reduce in aggregations.items()
                    }

                    # Identify the site and split the sequence into its three
                    # overlapping 5-character windows (offsets 0, 1 and 2);
                    # this assumes a 7-character sequence.
                    row = {
                        'Transcript ID': transcript_id,
                        'Position': int(pos),
                        'Nucleotide 1': sequence[0:5],
                        'Nucleotide 2': sequence[1:6],
                        'Nucleotide 3': sequence[2:7]
                    }

                    # Flatten the statistics into "<feature> <stat>" columns,
                    # grouped by feature.
                    for i, feature_name in enumerate(feature_names):
                        for label, values in stats.items():
                            row[f'{feature_name} {label}'] = values[i]

                    aggregated_data.append(row)

# Convert rows to a DataFrame in a single step.
data_df = pd.DataFrame(aggregated_data)

# Display the DataFrame
print(data_df)


     Transcript ID  Position Nucleotide 1 Nucleotide 2 Nucleotide 3  \
0          tx_id_0         0        AAAAC        AAACC        AACCT   
1          tx_id_0        10        TGGAC        GGACC        GACCC   
2          tx_id_0        20        GGGAC        GGACT        GACTA   
3          tx_id_0        30        TGGAC        GGACC        GACCA   
4          tx_id_0        40        TAGAC        AGACT        GACTA   
...            ...       ...          ...          ...          ...   
1318       tx_id_6      1840        CGAAC        GAACC        AACCT   
1319       tx_id_6      1850        CAAAC        AAACA        AACAG   
1320       tx_id_6      1860        CGGAC        GGACA        GACAG   
1321       tx_id_6      1870        CAGAC        AGACA        GACAA   
1322       tx_id_6      1880        CGAAC        GAACA        AACAA   

      N1 Length Avg  N1 Length Median  N1 Length Min  N1 Length Max  \
0          0.007331           0.00571        0.00166         0.0345   
1    

In [6]:
# Persist the parsed table as CSV in the working directory, next to the input.
# to_csv keeps its default index=True, so the row index is written as the
# first (unnamed) column of the file.
output_path = os.path.join(cwd, 'parsed_data_2.csv')
data_df.to_csv(path_or_buf=output_path)
