#### This jupyter notebook can be used to clean the data and preper the train, val, test split to be used by the dataloader 

In [18]:
### make the relative imports work in jp
##
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
##
import os
import pandas as pd
from math import isclose
from sklearn.model_selection import train_test_split
from models import AnnotaionLabel

In [None]:
#### config

# input
raw_input_file_path = r".\data\AF50m_subset_REGEX_man_labels_5k.txt"

# output
output_dir = r".\data"
train_filename = r"train_split.tsv"
val_filename = r"val_split.tsv"
test_filename = r"test_split.csv"

## misc
possible_labels = [label.name for label in AnnotaionLabel]

# mapping of the label string in the raw file to our own label
labels_mapping = {
    "low informative": AnnotaionLabel.Low.value, 
    "low": AnnotaionLabel.Low.value,
    "proper": AnnotaionLabel.Proper.value,
    "uninformative": AnnotaionLabel.Uninformative.value
}



In [43]:
# Load data
alldata = pd.read_csv(raw_input_file_path, sep="\t")

In [44]:
# Data sanity checks and some DE
print(f"NA in data:\n{alldata.isna().sum()}")

NA in data:
protein_annotation        2
regex_label               2
manual_label          17103
note                  21998
dtype: int64


In [45]:
### Data cleaning

# We only need the protein_annotation (i.e. X) and manual_label (i.e. y) columns
data_clean_tmp = alldata[["protein_annotation", "manual_label"]]

# Removing all NA values 
_pre_na_len = len(data_clean_tmp)
data_clean_tmp = data_clean_tmp.dropna()
print(f"removed {_pre_na_len - len(data_clean_tmp)} rows that had NA\n")

## Make sure all the labels in the raw data can be mapped 
# all labels in the data
uniqu_labels = set(data_clean_tmp["manual_label"].unique())
# all labels we have mapping for
mapped_labels = set(labels_mapping.keys())
assert uniqu_labels.issubset(mapped_labels), "Some labels in the raw files cannot be mapped!!"

# Map the label strings to our own labels
data_clean_tmp["manual_label"] = data_clean_tmp["manual_label"].replace(labels_mapping)

# keep the clean data
data_clean = data_clean_tmp

print(f"After cleaning \n{data_clean["manual_label"].value_counts()}\n")


removed 17103 rows that had NA

After cleaning 
manual_label
2    4371
0     437
1     192
Name: count, dtype: int64



  data_clean_tmp["manual_label"] = data_clean_tmp["manual_label"].replace(labels_mapping)


In [47]:
# rename columns to fit convention 
data_clean.rename(columns={"protein_annotation" : "X", "manual_label": "y"}, inplace=True)
data_clean

Unnamed: 0,X,y
0,NADPH-dependent 7-cyano-7-deazaguanine reducta...,2
1,Hydrogen peroxide-inducible genes activator > ...,2
2,Scoulerine-9-O-methyltransferase 1,2
3,PadR domain-containing protein,2
4,protein mono-ADP-ribosyltransferase PARP9 isof...,2
...,...,...
4995,Phage-related J or K,0
4996,DUF2156 domain-containing protein,1
4997,Peptidase C14,2
4998,D-arabinose 1-dehydrogenase-like Zn-dependent ...,2


In [49]:
# Split the data 3-way
train_df, val_test_df = train_test_split(
    data_clean,
    test_size=0.3,
    random_state=42,
    stratify=data_clean["y"],
)

val_df, test_df = train_test_split(
    val_test_df,
    test_size=1 / 3,
    random_state=42,
    stratify=val_test_df["y"],
)

print(f"Training: {train_df.shape}")
print(f"Validation: {val_df.shape}")
print(f"Test: {test_df.shape}")


Training: (3500, 2)
Validation: (1000, 2)
Test: (500, 2)


In [50]:
### Save to files

# Define file paths
train_path = os.path.join(output_dir, train_filename)
val_path = os.path.join(output_dir, val_filename)
test_path = os.path.join(output_dir, test_filename)

# Save DataFrames to tab-separated files
train_df.to_csv(train_path, sep="\t", index=False)
val_df.to_csv(val_path, sep="\t", index=False)
test_df.to_csv(test_path, sep="\t", index=False)

print(f"Saved train/val/test files to: {output_dir}")

Saved train/val/test files to: .\data
