In [1]:
import malariagen_data
import dask.array as da
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from dask.distributed import Client
from dask_ml.decomposition import PCA
from dask_ml.preprocessing import StandardScaler
from dask_ml.impute import SimpleImputer

In [2]:
# Start a Dask distributed client with default settings; every later
# computation in this notebook is scheduled through it.
client = Client()

# Echo the client so the notebook shows the cluster summary.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 5
Total threads: 15,Total memory: 115.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:45365,Workers: 5
Dashboard: http://127.0.0.1:8787/status,Total threads: 15
Started: Just now,Total memory: 115.00 GiB

0,1
Comm: tcp://127.0.0.1:38667,Total threads: 3
Dashboard: http://127.0.0.1:33769/status,Memory: 23.00 GiB
Nanny: tcp://127.0.0.1:36645,
Local directory: /tmp/dask-scratch-space/worker-zm_yh8h1,Local directory: /tmp/dask-scratch-space/worker-zm_yh8h1

0,1
Comm: tcp://127.0.0.1:44165,Total threads: 3
Dashboard: http://127.0.0.1:37295/status,Memory: 23.00 GiB
Nanny: tcp://127.0.0.1:41057,
Local directory: /tmp/dask-scratch-space/worker-hv5ozjjr,Local directory: /tmp/dask-scratch-space/worker-hv5ozjjr

0,1
Comm: tcp://127.0.0.1:36223,Total threads: 3
Dashboard: http://127.0.0.1:33483/status,Memory: 23.00 GiB
Nanny: tcp://127.0.0.1:35885,
Local directory: /tmp/dask-scratch-space/worker-2kuu0crq,Local directory: /tmp/dask-scratch-space/worker-2kuu0crq

0,1
Comm: tcp://127.0.0.1:43399,Total threads: 3
Dashboard: http://127.0.0.1:33657/status,Memory: 23.00 GiB
Nanny: tcp://127.0.0.1:40445,
Local directory: /tmp/dask-scratch-space/worker-99oyukbz,Local directory: /tmp/dask-scratch-space/worker-99oyukbz

0,1
Comm: tcp://127.0.0.1:38963,Total threads: 3
Dashboard: http://127.0.0.1:45427/status,Memory: 23.00 GiB
Nanny: tcp://127.0.0.1:44725,
Local directory: /tmp/dask-scratch-space/worker-4xahquxk,Local directory: /tmp/dask-scratch-space/worker-4xahquxk


In [3]:
# Handle on the Ag3 data resource, created with the library's default settings.
ag3 = malariagen_data.Ag3()

In [4]:
# Pull the identifier column out of the sample-set table as a plain list.
sample_set_table = ag3.sample_sets()
sample_sets = sample_set_table["sample_set"].tolist()

# Show the identifiers that will be analysed.
sample_sets

['AG1000G-AO',
 'AG1000G-BF-A',
 'AG1000G-BF-B',
 'AG1000G-BF-C',
 'AG1000G-CD',
 'AG1000G-CF',
 'AG1000G-CI',
 'AG1000G-CM-A',
 'AG1000G-CM-B',
 'AG1000G-CM-C',
 'AG1000G-FR',
 'AG1000G-GA-A',
 'AG1000G-GH',
 'AG1000G-GM-A',
 'AG1000G-GM-B',
 'AG1000G-GM-C',
 'AG1000G-GN-A',
 'AG1000G-GN-B',
 'AG1000G-GQ',
 'AG1000G-GW',
 'AG1000G-KE',
 'AG1000G-ML-A',
 'AG1000G-ML-B',
 'AG1000G-MW',
 'AG1000G-MZ',
 'AG1000G-TZ',
 'AG1000G-UG',
 'AG1000G-X',
 '1177-VO-ML-LEHMANN-VMF00004',
 '1188-VO-NIANG-NIEL-SN-2304-VMF00259',
 '1270-VO-MULTI-PAMGEN-VMF00244',
 '1330-VO-GN-LAMA-VMF00250',
 'fontaine-2015-rebuild',
 '1296-VO-BF-DIABATE-VMF00272',
 '1351-VO-SS-WEETMAN-VMF00282',
 '1338-VO-NG-ADEDAPO-VMF00268',
 '1324-VO-ET-GOLASSA-VMF00257',
 '1324-VO-ET-GOLASSA-VMF00275',
 '1339-VO-GH-AMENGA-ETEGO-VMF00302',
 '1177-VO-ML-LEHMANN-VMF00015',
 '1237-VO-BJ-DJOGBENOU-VMF00050',
 '1237-VO-BJ-DJOGBENOU-VMF00067',
 '1244-VO-GH-YAWSON-VMF00051',
 '1245-VO-CI-CONSTANT-VMF00054',
 '1253-VO-TG-DJOGBENOU-VMF00052

In [5]:
# Per-sample metadata for the selected sample sets.
meta_df = ag3.sample_metadata(sample_sets=sample_sets)

# Independent copy of the taxon column; used later to colour the PCA plot.
y = meta_df["taxon"].copy()

                                     

In [6]:
# Regions whose genotype calls are loaded, in the order they are concatenated.
chromosomes = [
    "2R",
    "2L",
    "3R",
    "3L",
    "X",
]

# Accumulator for one genotype array per region.
gt_dask_arrays = list()

In [7]:
# Collect the lazy genotype-call array for every region in `chromosomes`.
#
# The list is rebound here rather than appended to a list created in another
# cell: with a shared list, re-running this cell would append every region a
# second time and silently duplicate variants in the concatenated matrix.
gt_dask_arrays = []
for chrom in chromosomes:
    # Lazily open the SNP calls for this region across all selected sample sets.
    callset = ag3.snp_calls(region=chrom, sample_sets=sample_sets, chunks='auto')
    gt_dask_arrays.append(callset['call_genotype'])


                                 

In [8]:
# Join the per-region genotype arrays end to end along the first axis.
gt = da.concatenate(seq=gt_dask_arrays, axis=0)

In [9]:
# Build the (samples x variants) matrix of non-reference allele counts.
#
# Summing raw allele indices over the ploidy axis is not an allele count:
# - indices above 1 (additional alternate alleles) inflate the sum past 2,
#   which breaks the later `mean / 2` allele-frequency step;
# - a call with only one missing allele, e.g. (-1, 1), sums to 0 and is
#   indistinguishable from a homozygous-reference call.
# Instead, count alleles with index > 0 and flag any call that contains a
# missing (negative) allele with a negative sentinel, so that the subsequent
# `X < 0` masking step converts it to NaN.
# NOTE(review): assumes the usual genotype encoding (-1 missing, 0 reference,
# >0 alternate) — confirm against the malariagen_data documentation.
is_missing_call = (gt < 0).any(axis=2)
n_alt = (gt > 0).sum(axis=2, dtype='float16')
X = da.where(is_missing_call, np.float16(-1), n_alt).T

In [10]:
# Negative entries mark missing calls; turn them into NaN so the later
# missingness statistics and the imputer can recognise them.
negative_entries = X < 0
X = da.where(negative_entries, np.nan, X)

In [11]:
# Fraction of NaN entries in each column (axis 0 is reduced).
missing_variant = da.isnan(X).mean(axis=0)

# Keep only the columns whose missing fraction is below 5%.
variant_keep = missing_variant < 0.05
X = X[:, variant_keep]

In [12]:
# Fraction of NaN entries in each row, measured after the column filter.
missing_sample = da.isnan(X).mean(axis=1)

# Keep only the rows whose missing fraction is below 5%.
sample_keep = missing_sample < 0.05
X = X[sample_keep, :]

In [None]:
# Boolean-mask filtering leaves the chunk sizes unknown; resolve them now so
# that the array has a concrete shape for the steps that follow.
# This triggers computation of the masks.
X = X.compute_chunk_sizes()

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [None]:
# Collapse axis 1 into a single chunk (-1 means "one chunk spanning the axis").
X = X.rechunk(chunks={1: -1})

In [None]:
# Replace the remaining NaN entries with the mean of their column.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

In [None]:
# Minor-allele-frequency filter.
# Each entry is a per-sample allele count, so the column mean divided by 2
# is used as the allele frequency (this assumes two alleles per call).
allele_freqs = da.mean(X_imputed, axis=0) / 2

# Fold frequencies above 0.5 back so the value always refers to the rarer allele.
maf = da.minimum(allele_freqs, 1 - allele_freqs)

# Keep columns with MAF above 1%. Indexing with a lazy boolean mask leaves the
# number of columns unknown (NaN), which makes `X_scaled.shape` undefined and
# prevents the randomized PCA solver from sizing its projection. Resolve the
# chunk sizes here, as is already done after the missingness filters.
# This triggers computation of the mask.
X_maf_filtered = X_imputed[:, maf > 0.01].compute_chunk_sizes()

In [None]:
# Standardise each column with the scaler's default settings before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_maf_filtered)

In [None]:
# Sanity check: show the dimensions of the matrix that goes into PCA.
X_scaled.shape

In [None]:
# Number of principal components to extract (the plot below uses two).
n_components = 2

# The randomized SVD solver draws a random projection; fix the seed so that
# re-running the notebook reproduces the same components and figure.
pca = PCA(n_components=n_components, svd_solver='randomized', random_state=42)

In [None]:
# Fit the PCA model and project the data in one call, then materialise the
# lazy result into memory.
X_pca_lazy = pca.fit_transform(X_scaled)
X_pca = X_pca_lazy.compute()

In [None]:
# Align the taxon labels with the rows that survived the sample filter.
#
# The previous version referenced `missing_sample_mask` and `y_original`,
# neither of which is defined anywhere in the notebook, so it raised NameError
# on a fresh kernel. The names that exist are `missing_sample` (per-sample
# missing fraction) and `y` (taxon labels); the 0.05 threshold is the one
# applied when the rows of X were filtered.
# Materialising the mask re-evaluates the missingness statistics.
# NOTE(review): assumes the metadata rows are in the same order as the sample
# axis of the genotype calls — confirm.
final_sample_mask = (missing_sample < 0.05).compute()
y_filtered = y[final_sample_mask]

# Guard against a silent label/row misalignment before building the plot frame.
assert len(y_filtered) == X_pca.shape[0], "label count does not match PCA rows"

In [None]:
# Tabulate the two principal-component coordinates and attach the taxon label
# of each row positionally (the label index is discarded via `.values`).
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"]).assign(
    taxon=y_filtered.values
)

In [None]:
# Scatter plot of the first two principal components, coloured by taxon.
fig, ax = plt.subplots(figsize=(8, 6))

# `pca_df` has the columns PC1, PC2 and taxon; the previous hue='species'
# referred to a column that does not exist and made seaborn raise.
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='taxon',
    alpha=0.8,
    s=50,
    edgecolor='k',
    ax=ax,
)
ax.set_title('PCA of Anopheles SNP Data (Full Ag3.0 Dataset via Dask)')

# Label each axis with the share of variance explained by its component.
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
ax.legend(title='taxon')
fig.tight_layout()
plt.show()

In [None]:
# Disconnect the Dask client now that the analysis is finished.
client.close()