In [30]:
import json
import pandas as pd
import os
from io import StringIO
import gzip
import json
import numpy as np
from decimal import Decimal, getcontext
# Input locations. Both files are expected in the current working directory
# (the folder the notebook kernel was started from).
cwd = os.getcwd()

path = os.path.join(cwd, 'dataset0.json.gz')
labels_path = os.path.join(cwd, 'data.info.labelled')  # replace with your actual file path

# Names of the nine per-read features, in the order they are indexed inside each read.
# Defined once here because the list does not change between iterations.
feature_names = [
    'N1 Length', 'N1 SD', 'N1 Mean',
    'N2 Length', 'N2 SD', 'N2 Mean',
    'N3 Length', 'N3 SD', 'N3 Mean'
]

# One dict per (transcript, position, sequence) entry; converted to a DataFrame at the end.
aggregated_data = []

# Read the gzipped JSON file line by line.
# Fix: the original opened `file_path`, which is never defined; the input path is `path`.
with gzip.open(path, 'rt') as f:
    for line in f:
        # Each line is a JSON object shaped as
        # {transcript_id: {position: {sequence: [read, read, ...]}}}.
        data = json.loads(line)
        for transcript_id, positions in data.items():
            for pos, flanking_data in positions.items():
                for sequence, reads in flanking_data.items():
                    # Stack the reads into a 2-D array (one row per read) so the
                    # statistics below can be taken column-wise.
                    # Assumes every read carries at least nine numeric values,
                    # matching feature_names - TODO confirm against the data spec.
                    features_array = np.array(reads)

                    # Split the sequence into three overlapping 5-character windows,
                    # each shifted by one character (presumably a 7-mer - verify).
                    nucleotide1 = sequence[0:5]
                    nucleotide2 = sequence[1:6]
                    nucleotide3 = sequence[2:7]

                    # Per-feature mean, median, min and max across all reads of this site.
                    aggregated = {
                        'mean': np.mean(features_array, axis=0).tolist(),
                        'median': np.median(features_array, axis=0).tolist(),
                        'min': np.min(features_array, axis=0).tolist(),
                        'max': np.max(features_array, axis=0).tolist()
                    }

                    # Identifying columns for this site.
                    row = {
                        'Transcript ID': transcript_id,
                        'Position': int(pos),  # JSON object keys are strings
                        'Nucleotide 1': nucleotide1,
                        'Nucleotide 2': nucleotide2,
                        'Nucleotide 3': nucleotide3
                    }

                    # Flatten the four statistics of each feature into separate columns.
                    for i, feature_name in enumerate(feature_names):
                        row[f'{feature_name} Avg'] = aggregated['mean'][i]
                        row[f'{feature_name} Median'] = aggregated['median'][i]
                        row[f'{feature_name} Min'] = aggregated['min'][i]
                        row[f'{feature_name} Max'] = aggregated['max'][i]

                    aggregated_data.append(row)

# Convert the collected rows to a DataFrame (one row per site).
data_df = pd.DataFrame(aggregated_data)

# Display the DataFrame
print(data_df)


          Transcript ID  Position Nucleotide 1 Nucleotide 2 Nucleotide 3  \
0       ENST00000000233       244        AAGAC        AGACC        GACCA   
1       ENST00000000233       261        CAAAC        AAACT        AACTG   
2       ENST00000000233       316        GAAAC        AAACA        AACAG   
3       ENST00000000233       332        AGAAC        GAACA        AACAT   
4       ENST00000000233       368        AGGAC        GGACA        GACAA   
...                 ...       ...          ...          ...          ...   
121833  ENST00000641834      1348        GGGAC        GGACA        GACAT   
121834  ENST00000641834      1429        CTGAC        TGACA        GACAC   
121835  ENST00000641834      1531        TGGAC        GGACA        GACAC   
121836  ENST00000641834      1537        CTGAC        TGACC        GACCA   
121837  ENST00000641834      1693        TTGAC        TGACA        GACAT   

        N1 Length Avg  N1 Length Median  N1 Length Min  N1 Length Max  \
0            0

In [32]:
# Load the labels table. The original cell used `labels_df` without it being defined
# anywhere in the visible notebook, while `labels_path` was defined but never read.
# NOTE(review): assumes 'data.info.labelled' is comma-separated with a header row that
# includes the 'gene_id' and 'label' columns used below - confirm against the file.
labels_df = pd.read_csv(labels_path)

# The labels are attached by row position, not by key, so both tables must have the
# same number of rows in the same order; otherwise concat(axis=1) would silently
# pad the shorter side with NaN.
assert len(labels_df) == len(data_df), (
    f"Row count mismatch: {len(labels_df)} label rows vs {len(data_df)} feature rows"
)

# Gene ID first, then the aggregated features, then the label as the last column.
final_df = (
    pd.concat([labels_df['gene_id'], data_df, labels_df['label']], axis=1)
    .rename(columns={'gene_id': 'Gene ID', 'label': 'Label'})
)
final_df

Unnamed: 0,Gene ID,Transcript ID,Position,Nucleotide 1,Nucleotide 2,Nucleotide 3,N1 Length Avg,N1 Length Median,N1 Length Min,N1 Length Max,...,N3 Length Max,N3 SD Avg,N3 SD Median,N3 SD Min,N3 SD Max,N3 Mean Avg,N3 Mean Median,N3 Mean Min,N3 Mean Max,Label
0,ENSG00000004059,ENST00000000233,244,AAGAC,AGACC,GACCA,0.008264,0.00697,0.00199,0.0339,...,0.0329,4.386989,3.440,0.773,15.50,80.570270,80.50,73.1,88.3,0
1,ENSG00000004059,ENST00000000233,261,CAAAC,AAACT,AACTG,0.006609,0.00564,0.00199,0.0222,...,0.0262,3.016599,2.660,0.715,14.10,94.290698,94.10,88.6,103.0,0
2,ENSG00000004059,ENST00000000233,316,GAAAC,AAACA,AACAG,0.007570,0.00631,0.00232,0.0299,...,0.0266,2.087146,1.910,0.630,6.85,89.364324,89.20,84.4,96.2,0
3,ENSG00000004059,ENST00000000233,332,AGAAC,GAACA,AACAT,0.010620,0.00902,0.00232,0.0370,...,0.0214,2.236520,2.135,0.884,6.49,89.154000,89.90,81.4,95.7,0
4,ENSG00000004059,ENST00000000233,368,AGGAC,GGACA,GACAA,0.010701,0.00896,0.00199,0.0478,...,0.0485,4.260253,4.160,1.040,8.81,85.178788,85.40,77.6,90.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENSG00000167747,ENST00000641834,1348,GGGAC,GGACA,GACAT,0.009594,0.00817,0.00232,0.0432,...,0.0242,4.005616,3.640,1.030,12.50,82.004110,82.10,74.0,92.8,1
121834,ENSG00000167747,ENST00000641834,1429,CTGAC,TGACA,GACAC,0.008393,0.00618,0.00232,0.0262,...,0.0149,3.644638,3.440,1.580,6.50,80.497101,80.60,75.0,90.6,0
121835,ENSG00000167747,ENST00000641834,1531,TGGAC,GGACA,GACAC,0.008161,0.00697,0.00232,0.0315,...,0.0260,2.181562,1.925,1.040,7.19,84.190625,84.50,78.2,88.1,1
121836,ENSG00000167747,ENST00000641834,1537,CTGAC,TGACC,GACCA,0.008044,0.00660,0.00232,0.0483,...,0.0144,2.540877,2.330,1.190,6.50,82.289474,82.00,77.6,87.4,0


In [34]:
# Write the combined table to the working directory.
# The DataFrame index is written as well (to_csv default), so the file's first
# column is the unnamed row index.
csv_name = 'parsed_data_final.csv'
output_path = os.path.join(cwd, csv_name)
final_df.to_csv(path_or_buf=output_path)
