In [1]:
import pandas as pd
from glob import glob

In [7]:

chrom_files = {
    "2R": "2R_5million.csv",
    "2L": "2L_5million.csv",
    "3R": "3R_5million.csv",
    "3L": "3L_5million.csv",
    "X": "X_5million.csv",
}


In [8]:
chrom_to_sites = {}
for chrom, file in chrom_files.items():
    df = pd.read_csv(file)
    rel_snps = df[df["feature_importance"] > 0]["variant_position"].tolist()
    chrom_to_sites[chrom] = rel_snps

In [5]:
import malariagen_data
import numpy as np
import pandas as pd

ag3 = malariagen_data.Ag3()
df = ag3.sample_metadata().dropna(subset=["sample_id", "aim_species"])
species_list = df['aim_species'].unique().tolist()

# Pick 20 per species for train and 20 for test (or adjust for your data)
N_train = 20
N_test = 20

train_rows = []
test_rows = []
for sp in species_list:
    sp_df = df[df['aim_species'] == sp]
    train_rows.append(sp_df.sample(N_train, random_state=123))
    test_rows.append(sp_df.drop(train_rows[-1].index).sample(N_test, random_state=456))

train_df = pd.concat(train_rows).set_index("sample_id")
test_df = pd.concat(test_rows).set_index("sample_id")
train_ids = train_df.index.tolist()
test_ids = test_df.index.tolist()


                                     

In [9]:
def extract_genotypes_for_sites(ag3, region, sample_ids, snp_sites):
    ds = ag3.snp_calls(region=region, sample_query=f"sample_id in {sample_ids}")
    variant_positions = ds['variant_position'].values
    idx_map = {pos: i for i, pos in enumerate(variant_positions)}
    call_genotype = ds['call_genotype'].values  # (variants, samples, ploidy)
    
    # For only requested snp_sites (order matters)
    valid_sites = [pos for pos in snp_sites if pos in idx_map]
    feature_matrix = []
    for s_idx in range(len(sample_ids)):
        encoded = []
        for pos in valid_sites:
            i = idx_map[pos]
            g0, g1 = call_genotype[i, s_idx, 0], call_genotype[i, s_idx, 1]
            if (g0 < 0) or (g1 < 0):
                encoded.append(np.nan)
            elif (g0 == 0 and g1 == 0):
                encoded.append(0)
            elif (g0 == 1 and g1 == 1):
                encoded.append(2)
            elif (g0 in [0,1] and g1 in [0,1]):
                encoded.append(1)
            else:
                encoded.append(np.nan)
        feature_matrix.append(encoded)
    return np.array(feature_matrix), valid_sites



all_chroms = ['2R','2L','3R','3L','X']
train_X_parts, test_X_parts = [], []
chrom_site_lists = {}

for chrom in all_chroms:
    chrom_sites = chrom_to_sites.get(chrom, [])
    if not chrom_sites:
        continue
    print(f"{chrom}: extracting {len(chrom_sites)} SNPs")
    X_train, sites_train = extract_genotypes_for_sites(ag3, chrom, train_ids, chrom_sites)
    X_test, sites_test = extract_genotypes_for_sites(ag3, chrom, test_ids, chrom_sites)
    assert sites_train == sites_test
    train_X_parts.append(X_train)
    test_X_parts.append(X_test)
    chrom_site_lists[chrom] = sites_train



2R: extracting 20 SNPs
2L: extracting 10 SNPs           
3R: extracting 10 SNPs           
3L: extracting 10 SNPs           
                                 

In [10]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(train_df['aim_species'])
y_test = le.transform(test_df['aim_species'])

clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', tree_method='auto', n_jobs=-1)
clf.fit(X_train, y_train)

# Predict/test
y_pred = clf.predict(X_test)
print("Test set classification report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Test set classification report:
                                  precision    recall  f1-score   support

                      arabiensis       0.39      0.35      0.37        20
                        coluzzii       0.15      0.15      0.15        20
                         gambiae       0.30      0.35      0.33        20
intermediate_gambcolu_arabiensis       0.50      0.50      0.50        20
   intermediate_gambiae_coluzzii       0.47      0.45      0.46        20

                        accuracy                           0.36       100
                       macro avg       0.36      0.36      0.36       100
                    weighted avg       0.36      0.36      0.36       100



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# After training
clf.save_model("xgboost_snp_classifier.json")
print("Model saved as xgboost_snp_classifier.json")


Model saved as xgboost_snp_classifier.json
