#### This jupyter notebook can be used to clean the data and preper the train, val, test split to be used by the dataloader 

In [1]:
### make the relative imports work in jp
##
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
##
import os
import pandas as pd
from math import isclose
from sklearn.model_selection import train_test_split
from models.config import AnnotationLabels
from data_loader.config import DataLoaderConfig

In [None]:
#### config

# input
raw_input_file_path = r".\data\AF50m_subset_REGEX_man_labels_5k.txt"

# output
output_dir = DataLoaderConfig.output_dir
train_filename = DataLoaderConfig.train_filename
val_filename = DataLoaderConfig.val_filename
test_filename = DataLoaderConfig.test_filename
all_data_filename = DataLoaderConfig.train_all_filename

# mapping of the label string in the raw file to our own label
labels_mapping = {
    "low informative": AnnotationLabels.label2id["Low"], 
    "low": AnnotationLabels.label2id["Low"],
    "proper": AnnotationLabels.label2id["Proper"],
    "uninformative": AnnotationLabels.label2id["Uninformative"]
}



In [3]:
# Load data
alldata = pd.read_csv(raw_input_file_path, sep="\t")

In [4]:
# Data sanity checks and some DE
print(f"NA in data:\n{alldata.isna().sum()}")

NA in data:
protein_annotation        2
regex_label               2
manual_label          17103
note                  21998
dtype: int64


In [5]:
### Data cleaning

# We only need the protein_annotation (i.e. X) and manual_label (i.e. y) columns
data_clean_tmp = alldata[["protein_annotation", "manual_label"]]

# Removing all NA values 
_pre_na_len = len(data_clean_tmp)
data_clean_tmp = data_clean_tmp.dropna()
print(f"removed {_pre_na_len - len(data_clean_tmp)} rows that had NA\n")

## Make sure all the labels in the raw data can be mapped 
# all labels in the data
unique_labels = set(data_clean_tmp["manual_label"].unique())
# all labels we have mapping for
mapped_labels = set(labels_mapping.keys())
assert unique_labels.issubset(mapped_labels), "Some labels in the raw files cannot be mapped!!"

# Map the label strings to our own labels
## this throws a FutureWarning coulldn't get rid of it even by adding infer_objects
data_clean_tmp["manual_label"] = data_clean_tmp["manual_label"].replace(labels_mapping).astype("int")
# keep the clean data
data_clean = data_clean_tmp

print(f"After cleaning \n{data_clean["manual_label"].value_counts()}\n")


removed 17103 rows that had NA

After cleaning 
manual_label
2    4371
0     437
1     192
Name: count, dtype: int64



  data_clean_tmp["manual_label"] = data_clean_tmp["manual_label"].replace(labels_mapping).astype("int")


In [6]:
# rename columns to fit convention 
data_clean.rename(columns={"manual_label": "label"}, inplace=True)
data_clean

Unnamed: 0,protein_annotation,label
0,NADPH-dependent 7-cyano-7-deazaguanine reducta...,2
1,Hydrogen peroxide-inducible genes activator > ...,2
2,Scoulerine-9-O-methyltransferase 1,2
3,PadR domain-containing protein,2
4,protein mono-ADP-ribosyltransferase PARP9 isof...,2
...,...,...
4995,Phage-related J or K,0
4996,DUF2156 domain-containing protein,1
4997,Peptidase C14,2
4998,D-arabinose 1-dehydrogenase-like Zn-dependent ...,2


In [7]:
# Split the data 3-way
train_df, val_test_df = train_test_split(
    data_clean,
    test_size=0.3,
    random_state=42,
    stratify=data_clean["label"],
)

val_df, test_df = train_test_split(
    val_test_df,
    test_size=1 / 3,
    random_state=42,
    stratify=val_test_df["label"],
)

def describe_split(df, name):
    counts = df["label"].value_counts().sort_index()
    proportions = (counts / len(df)).round(3)
    print(f"{name}: length {len(df)}")
    print(f"  Class distribution: {counts.values}")
    print(f"  Class proportions:  {proportions.values}\n")

describe_split(train_df, "Training")
describe_split(val_df, "Validation")
describe_split(test_df, "Test")


Training: length 3500
  Class distribution: [ 306  134 3060]
  Class proportions:  [0.087 0.038 0.874]

Validation: length 1000
  Class distribution: [ 87  39 874]
  Class proportions:  [0.087 0.039 0.874]

Test: length 500
  Class distribution: [ 44  19 437]
  Class proportions:  [0.088 0.038 0.874]



In [None]:
### Save to files

# Define file paths
train_path = os.path.join(output_dir, train_filename)
val_path = os.path.join(output_dir, val_filename)
test_path = os.path.join(output_dir, test_filename)
all_data_path = os.path.join(output_dir, all_data_filename)

# Save DataFrames to tab-separated files
train_df.to_csv(train_path, sep="\t", index=False)
val_df.to_csv(val_path, sep="\t", index=False)
test_df.to_csv(test_path, sep="\t", index=False)
data_clean.to_csv(all_data_path, sep="\t", index=False)

print(f"Saved train/val/test files to: {output_dir}")

Saved train/val/test files to: c:\Users\caraj\Documents\Master_courses\IBP\data_loader\data
