# Libraries and Packages

In [1]:
import json
import pandas as pd
import gzip

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import xgboost as xgb
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from imblearn.under_sampling import RandomUnderSampler

# Parse data

In [2]:
# Locations of the three gzip-compressed JSON-lines inputs.
# NOTE: `dataset1` / `dataset2` hold path strings here; the same names are
# rebound to DataFrames further down, so this cell must run before the parsing cell.
file_path = "./Data/dataset0.json.gz"
dataset1, dataset2 = "./Data/dataset1.json.gz", "./Data/dataset2.json.gz"

In [3]:
def _read_nested_records(path):
    """Flatten a gzip-compressed JSON-lines file into a list of 4-item records.

    Each line is expected to be a JSON object nested three levels deep
    (``{outer: {middle: {inner: value}}}``). Every innermost entry becomes one
    ``[outer, middle, inner, value]`` record; these four slots are later named
    Transcript_ID, Position, Base_seq and Sample_reads.

    Lines that are not valid JSON are reported on stdout and skipped.

    Parameters
    ----------
    path : str
        Path to the ``.json.gz`` file.

    Returns
    -------
    list[list]
        One record per innermost entry, in file order.
    """
    records = []
    # Read and decompress the file, parsing each line as a separate JSON object
    with gzip.open(path, 'rt', encoding='utf-8') as gzip_file:
        for line in gzip_file:
            try:
                payload = json.loads(line)
            except json.JSONDecodeError:
                print(f"Error parsing JSON on line: {line}")
                continue
            for outer_key, middle in payload.items():
                for middle_key, inner in middle.items():
                    for inner_key, value in inner.items():
                        records.append([outer_key, middle_key, inner_key, value])
    return records


# One flat record list per input file (the three path variables come from the cell above).
parsed_data_list = _read_nested_records(file_path)
dataset1_parsed = _read_nested_records(dataset1)
dataset2_parsed = _read_nested_records(dataset2)

In [4]:
# Wrap each flat record list in a DataFrame. Columns are still the default
# integer labels at this point; they get descriptive names in the cleaning step.
# `dataset1` / `dataset2` stop being path strings from here on.
data, dataset1, dataset2 = (
    pd.DataFrame(records)
    for records in (parsed_data_list, dataset1_parsed, dataset2_parsed)
)

# Companion CSV read from ./Data/data.info.
data_info = pd.read_csv("./Data/data.info")

# Cleaning data

In [9]:
# Give the four positional columns descriptive names, identically on all three frames.
data, dataset1, dataset2 = (
    frame.rename(
        columns={0: 'Transcript_ID', 1: 'Position', 2: "Base_seq", 3: "Sample_reads"}
    )
    for frame in (data, dataset1, dataset2)
)

In [10]:
# Calculate the per-column mean and std of a row's reads (stored in 'Compiled_reads')
def feature_engin1(rows):
    """Summarise one row's ``Sample_reads`` as column-wise mean and std.

    Parameters
    ----------
    rows : mapping (e.g. a DataFrame row)
        Must provide ``rows["Sample_reads"]``, a rectangular list of reads
        (one inner list per read).

    Returns
    -------
    list
        ``[means, stds]`` where each is a list with one value per read column.
        The standard deviation uses NumPy's default (``ddof=0``).
    """
    matrix = np.array(rows["Sample_reads"])
    # axis=0 collapses the reads, leaving one statistic per column.
    column_means = np.mean(matrix, axis=0)
    column_std = np.std(matrix, axis=0)
    return [list(column_means), list(column_std)]

# Attach the [means, stds] summary of every row as a new 'Compiled_reads' column.
for reads_frame in (data, dataset1, dataset2):
    reads_frame["Compiled_reads"] = reads_frame.apply(feature_engin1, axis=1)

# Align data_info's key column names with the parsed frames so they can be merged.
data_info = data_info.rename(
    columns={
        'transcript_id': 'Transcript_ID',
        "transcript_position": "Position",
    }
)

In [13]:
# Force 'Position' to int64 in every frame so the merge keys share a dtype.
for keyed_frame in (data, dataset1, dataset2, data_info):
    keyed_frame['Position'] = keyed_frame['Position'].astype('int64')

In [14]:
# Right-join data_info onto the parsed reads: every row of `data` is kept,
# with data_info's columns attached where (Transcript_ID, Position) match.
merged_right = data_info.merge(
    data,
    how='right',
    on=['Transcript_ID', "Position"],
)

In [18]:
# Splitting the compiled reads into separate columns
_MEAN_COLUMNS = [f'mean{k}' for k in range(1, 10)]  # mean1 .. mean9
_SD_COLUMNS = [f'sd{k}' for k in range(1, 10)]      # sd1 .. sd9


def _expand_compiled_reads(compiled_reads):
    """Expand a Series of ``[means, stds]`` pairs into 18 numeric columns.

    Parameters
    ----------
    compiled_reads : pd.Series
        Each element is a two-item list ``[means, stds]`` with nine values in
        each half, as produced by ``feature_engin1``.

    Returns
    -------
    pd.DataFrame
        Columns ``mean1..mean9`` followed by ``sd1..sd9``, sharing the index
        of ``compiled_reads`` so it can be concatenated back column-wise.
    """
    # Build each half in one constructor call instead of creating a Series per row.
    means = pd.DataFrame(
        [pair[0] for pair in compiled_reads],
        index=compiled_reads.index,
        columns=_MEAN_COLUMNS,
    )
    stds = pd.DataFrame(
        [pair[1] for pair in compiled_reads],
        index=compiled_reads.index,
        columns=_SD_COLUMNS,
    )
    return pd.concat([means, stds], axis=1)


new_columns = _expand_compiled_reads(merged_right['Compiled_reads'])
new_columns1 = _expand_compiled_reads(dataset1['Compiled_reads'])
new_columns2 = _expand_compiled_reads(dataset2['Compiled_reads'])

In [19]:
# Append the expanded mean/sd columns to each frame, then discard the two
# list-valued columns they were derived from.
merged_right_processed = pd.concat(
    [merged_right, new_columns], axis=1
).drop(columns=["Sample_reads", "Compiled_reads"])

merged_right_processed1 = pd.concat(
    [dataset1, new_columns1], axis=1
).drop(columns=["Sample_reads", "Compiled_reads"])

merged_right_processed2 = pd.concat(
    [dataset2, new_columns2], axis=1
).drop(columns=["Sample_reads", "Compiled_reads"])

In [22]:
# Role-based aliases: dataset0 (merged with data_info) is the training frame,
# dataset1 and dataset2 are the frames to be scored.
train_data, test_data1, test_data2 = (
    merged_right_processed,
    merged_right_processed1,
    merged_right_processed2,
)

# Training

In [39]:
def random_undersample(train_data, random_state=42):
    """Return a class-balanced copy of ``train_data`` via random undersampling.

    The ``label`` column is the target. The ``gene_id``, ``Transcript_ID`` and
    ``Base_seq`` columns are dropped before resampling, so they are absent
    from the result.

    Parameters
    ----------
    train_data : pd.DataFrame
        Must contain ``label``, ``gene_id``, ``Transcript_ID`` and ``Base_seq``.
    random_state : int or None, default 42
        Seed passed to ``RandomUnderSampler`` so the selected rows (and every
        model trained on them) are reproducible across runs. Pass ``None`` to
        get a different sample on each call.

    Returns
    -------
    pd.DataFrame
        The resampled feature columns with ``label`` appended as the last column.
    """
    rus = RandomUnderSampler(random_state=random_state)
    X = train_data.drop(columns=["label", "gene_id", "Transcript_ID", "Base_seq"])
    Y = train_data['label']
    X_resampled, y_resampled = rus.fit_resample(X, Y)
    output = pd.concat([X_resampled, y_resampled], axis=1)
    return output

In [None]:
# Balance the training frame with random_undersample; the result holds the
# feature columns plus 'label' and is what the models below are fitted on.
new_train_data = random_undersample(train_data)

In [47]:
# Target and features from the balanced training frame.
Y_train = new_train_data['label']
X_train = new_train_data.drop(columns='label')
# Score dataset1: remove the two identifier/text columns before prediction.
X_test = test_data1.drop(columns=["Transcript_ID", "Base_seq"])

# Shallow gradient-boosted trees with row and column subsampling.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100, max_depth=3, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
)
model.fit(X_train, Y_train)

# One probability column per class; column 1 is used as the score downstream.
Y_pred = model.predict_proba(X_test)

In [48]:
# Pair each (transcript, position) of dataset1 with its predicted probability for class 1.
result_df = (
    test_data1[['Transcript_ID', 'Position']]
    .rename(columns={'Transcript_ID': 'transcript_id', 'Position': 'transcript_position'})
    .assign(score=Y_pred[:, 1])
)

In [49]:
# Write the dataset1 scores to the working directory, omitting the DataFrame index.
result_df.to_csv('dataset1_score.csv', index=False)

In [50]:
# Target and features from the balanced training frame.
Y_train = new_train_data['label']
X_train = new_train_data.drop(columns='label')
# Score dataset2: remove the two identifier/text columns before prediction.
X_test = test_data2.drop(columns=["Transcript_ID", "Base_seq"])

# Same hyperparameters as the dataset1 run; the model is refitted from scratch here.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100, max_depth=3, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
)
model.fit(X_train, Y_train)

# One probability column per class; column 1 is used as the score downstream.
Y_pred = model.predict_proba(X_test)

In [51]:
# Pair each (transcript, position) of dataset2 with its predicted probability for class 1.
result_df = (
    test_data2[['Transcript_ID', 'Position']]
    .rename(columns={'Transcript_ID': 'transcript_id', 'Position': 'transcript_position'})
    .assign(score=Y_pred[:, 1])
)

In [52]:
# Write the dataset2 scores to the working directory, omitting the DataFrame index.
result_df.to_csv('dataset2_score.csv', index=False)

In [59]:
# Target and features from the balanced training frame.
Y_train = new_train_data['label']
X_train = new_train_data.drop(columns='label')
# Score the full (un-resampled) training frame itself; it still carries
# 'gene_id' and 'label', so those are removed along with the identifier columns.
X_test = train_data.drop(columns=["gene_id", "label", "Transcript_ID", "Base_seq"])

# Same hyperparameters as the other runs; the model is refitted from scratch here.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100, max_depth=3, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
)
model.fit(X_train, Y_train)

# One probability column per class; column 1 is used as the score downstream.
Y_pred = model.predict_proba(X_test)

In [60]:
# Pair each (transcript, position) of the training frame with its predicted probability for class 1.
result_df = (
    train_data[['Transcript_ID', 'Position']]
    .rename(columns={'Transcript_ID': 'transcript_id', 'Position': 'transcript_position'})
    .assign(score=Y_pred[:, 1])
)

In [61]:
# Write the dataset0 scores to the working directory, omitting the DataFrame index.
result_df.to_csv('dataset0_score.csv', index=False)