In [1]:
import malariagen_data
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report

In [2]:
# Open the Ag3 resource and keep only metadata rows that carry both a sample
# ID and a species call, since both are needed downstream.
ag3 = malariagen_data.Ag3()
df = ag3.sample_metadata().dropna(subset=["sample_id", "aim_species"])
species_list = df["aim_species"].unique().tolist()
N = 5  # samples per species (adjust as needed)

# Draw N rows per species with a fixed seed; a species with fewer than N rows
# is kept whole (and unshuffled) rather than being dropped.
sample_rows = []
for sp in species_list:
    sp_rows = df.loc[df["aim_species"] == sp]
    if len(sp_rows) >= N:
        picked = sp_rows.sample(N, random_state=42)
    else:
        picked = sp_rows
    sample_rows.append(picked)

# One frame indexed by sample ID; `sample_ids` and `labels` share its row order.
samples_df = pd.concat(sample_rows).set_index("sample_id")
sample_ids = samples_df.index.tolist()
labels = samples_df["aim_species"].values

                                     

In [3]:
# Contig names exposed by the Ag3 object; displayed so the reader can see
# which regions could be processed.
regions = ag3.contigs
regions

('2R', '2L', '3R', '3L', 'X')

In [4]:
# Restrict the run to the first contig only. Note that this rebinds `regions`
# from the full collection to a one-element list.
regions = [regions[0]]
regions

['2R']

In [5]:
# Per-region accumulators filled by the region loop: one encoded genotype
# matrix and one array of variant positions per processed region.
all_encoded = []
all_variant_positions = []

In [6]:
def is_biallelic_site(genos):
    """Return True when a site carries no allele code other than 0 or 1.

    Parameters
    ----------
    genos : numpy.ndarray
        Allele codes for one site (any shape); -1 is treated as a missing
        call and ignored.

    Returns
    -------
    bool
        True if every non-missing allele code is 0 or 1. Sites where only
        one of the two codes occurs, or where every call is missing, also
        return True.
    """
    observed_codes = set(genos.ravel().tolist())
    # Allowing -1 here is equivalent to discarding it before the subset test.
    return observed_codes <= {-1, 0, 1}

In [7]:
def encode_diploid(gt_slice):
    """Encode two-allele genotype calls as a count of allele 1.

    Parameters
    ----------
    gt_slice : numpy.ndarray
        Array whose first axis indexes calls and whose second axis holds the
        two allele codes of each call.

    Returns
    -------
    numpy.ndarray
        float32 vector with one entry per call: 0 for (0, 0), 1 for (0, 1)
        or (1, 0), 2 for (1, 1), and NaN for anything else (negative codes
        or allele codes above 1).
    """
    first, second = gt_slice[:, 0], gt_slice[:, 1]

    # A call can be encoded only when both of its alleles are 0 or 1.
    encodable = np.isin(first, (0, 1)) & np.isin(second, (0, 1))

    dosage = np.full(len(gt_slice), np.nan, dtype=np.float32)
    # With both alleles in {0, 1}, their sum is exactly the allele-1 count.
    dosage[encodable] = first[encodable] + second[encodable]
    return dosage

In [8]:
# For each region: load SNP calls for the selected samples, keep sites whose
# alleles are only 0/1 among those samples, and encode every sample's
# genotypes as an allele-1 count. Results are appended to the accumulators.
for region in regions:
    print(f"Processing region: {region}")
    # Load the region being iterated. This was previously hard-coded to "X",
    # so the loop variable was ignored and the printed region name was wrong.
    ds = ag3.snp_calls(region=region, sample_query=f"sample_id in {sample_ids}")
    variant_pos = ds['variant_position'].values
    call_genotype = ds['call_genotype'].values
    ds_sample_ids = ds['sample_id'].values
    del ds

    # `labels` follows the order of `sample_ids`, but a query selects samples
    # without promising to return them in that order. Map each requested ID
    # to its column in the dataset so matrix rows line up with the labels.
    sample_order = pd.Index(ds_sample_ids).get_indexer(sample_ids)
    if (sample_order < 0).any():
        missing = [sid for sid, pos in zip(sample_ids, sample_order) if pos < 0]
        raise ValueError(
            f"{len(missing)} requested sample(s) not found in SNP calls for "
            f"region {region}: {missing[:5]}"
        )

    # Biallelic filter
    biallelic_mask = np.array([
        is_biallelic_site(call_genotype[i, :, :])
        for i in range(call_genotype.shape[0])
    ])
    call_genotype_biallelic = call_genotype[biallelic_mask, :, :]
    variant_pos_biallelic = variant_pos[biallelic_mask]
    del call_genotype, variant_pos, biallelic_mask

    # Encode: one row per sample, in `sample_ids` order; one column per
    # retained variant.
    encoded = np.array([encode_diploid(call_genotype_biallelic[:, s, :])
                        for s in sample_order], dtype=np.float32)
    del call_genotype_biallelic

    all_encoded.append(encoded)
    all_variant_positions.append(variant_pos_biallelic)

Processing region: 2R
                                 

In [9]:
# Join the per-region matrices along the variant axis (axis=1); rows remain
# one per sample. Positions are concatenated in the same region order so that
# column j of X corresponds to variant_positions_all[j].
X = np.concatenate(all_encoded, axis=1)
variant_positions_all = np.concatenate(all_variant_positions)
# The accumulators are dropped to free memory; the region loop cannot be
# re-run afterwards without re-creating them first.
del all_encoded, all_variant_positions

In [10]:
# Convert species names to integer class codes for the classifier;
# species_names[code] recovers the original name.
le = LabelEncoder()
y = le.fit_transform(labels)
species_names = le.classes_

In [11]:
# Fit a gradient-boosted classifier on the full genotype matrix. There is no
# train/test split here, so the feature importances read later come from a
# model fitted to every sampled individual. X contains NaN for genotypes that
# could not be encoded; this relies on XGBoost treating NaN as missing
# (its default) -- confirm if the `missing` setting is ever changed.
clf = XGBClassifier(eval_metric='mlogloss', tree_method='auto', n_jobs=-1)
clf.fit(X, y)


In [12]:
# Rank SNPs by the fitted model's feature importance and save the top ones
# together with their genomic positions.
importances = clf.feature_importances_
# Never ask for more SNPs than there are features. The previous expression
# compared two constants and ignored the actual feature count.
top_n = min(5_000_000, len(importances))
# Indices of the top_n features, highest importance first.
important_snps_idx = np.argsort(importances)[::-1][:top_n]
variant_positions_top = variant_positions_all[important_snps_idx]
importances_top = importances[important_snps_idx]
df_top_snps = pd.DataFrame({
    "variant_position": variant_positions_top,
    "feature_importance": importances_top
})
print(len(df_top_snps))
# Use a single path for both the write and the log line so the message
# always names the file that was actually written.
output_path = "X_5million.csv"
df_top_snps.to_csv(output_path, index=False)
print(f"Saved {len(df_top_snps)} top SNPs and importances to {output_path}")

5000000
Saved 5000000 top SNPs and importances to top_million_snps_all_regions.csv


In [13]:
# Release the large arrays and frames that are no longer needed.
# `variant_pos_biallelic` is the variable left over from the last iteration
# of the region loop.
del X, importances, important_snps_idx, variant_pos_biallelic, df_top_snps