In [5]:
# Importing necessary libraries
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import numpy as np
import anndata as ad
import scanpy as sc
from scipy.stats import uniform, randint
from sklearn.model_selection import ParameterSampler
from sklearn.manifold import TSNE, trustworthiness
import umap
from sklearn.metrics import pairwise_distances
import itertools
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering, SpectralClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from gprofiler import GProfiler
import warnings
warnings.filterwarnings('ignore')

# Add project root to Python path
import sys
import os
project_dir = "/Users/elifhamali/scRNAseq-Pipeline-GastricCancer"
if project_dir not in sys.path:
    sys.path.append(project_dir)

# Import our processor
from src.preprocessing.data_processor import SingleCellProcessor

In [11]:
# Initialize the SingleCellProcessor object
data_path = "/Users/elifhamali/scRNAseq-Pipeline-GastricCancer/data/raw/GSE158631_count.csv"
analysis = SingleCellProcessor(data_path)

# Verify that the data was loaded
if analysis.adata is not None:
    print(f"Data loaded successfully:")
    print(f"Number of cells: {analysis.adata.n_obs}")
    print(f"Number of genes: {analysis.adata.n_vars}")

Data loaded successfully: 94 cells and 21196 genes
Data loaded successfully:
Number of cells: 94
Number of genes: 21196


In [12]:
print(analysis.adata) # checking the shape of the AnnData object (cells x genes)
analysis.adata.to_df() # displaying the AnnData object as a dataframe

AnnData object with n_obs × n_vars = 94 × 21196


Unnamed: 0,THY1,DCN,COL1A1,COL1A2,COL6A1,COL6A2,COL6A3,PECAM1,VWF,CDH5,...,RPS4Y2,RBMY2EP,TTTY13,TTTY6,TTTY5,RBMY2FP,DAZ1,DAZ4,DAZ2,DAZ3
GC1-TT1,0,0,0,1,0,0,1,10,11,0,...,0,0,0,0,0,0,0,0,0,1
GC1-TT2,0,0,0,1,0,0,0,1,3,0,...,0,0,0,0,2,0,0,0,0,1
GC1-TT3,0,0,0,1,0,0,12,0,14,0,...,0,0,2,0,0,0,1,0,0,0
GC1-TT4,0,0,0,10,0,0,1,2,0,4,...,0,1,0,0,0,3,0,1,0,1
GC1-TT5,0,0,0,5,0,0,13,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GC3-LN8,0,1,0,7,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,0,0
GC3-LN9,1,0,7,5,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
GC3-LN10,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
GC3-LN11,0,2,2,1,2,0,1,0,2,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Perform quality control
analysis.perform_qc()

# Now try displaying QC metrics
print("QC metrics:")
display(analysis.adata.obs)

QC metrics computed successfully
QC metrics:


Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_rp,log1p_total_counts_rp,pct_counts_rp,total_counts_hb,log1p_total_counts_hb,pct_counts_hb
GC1-TT1,4584,8.430545,15044,9.618801,0,0.0,0.0,145,4.983607,0.963839,63,4.158883,0.418772
GC1-TT2,5153,8.547528,24631,10.111802,0,0.0,0.0,106,4.672829,0.430352,92,4.532599,0.373513
GC1-TT3,5633,8.636575,19521,9.879297,0,0.0,0.0,81,4.406719,0.414938,85,4.454347,0.435429
GC1-TT4,4958,8.508959,20751,9.940398,0,0.0,0.0,57,4.060443,0.274686,89,4.499810,0.428895
GC1-TT5,3960,8.284252,16139,9.689056,0,0.0,0.0,161,5.087596,0.997583,174,5.164786,1.078134
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GC3-LN8,3760,8.232440,9270,9.134647,0,0.0,0.0,24,3.218876,0.258900,100,4.615121,1.078749
GC3-LN9,4663,8.447629,11077,9.312716,0,0.0,0.0,38,3.663562,0.343053,74,4.317488,0.668051
GC3-LN10,4178,8.337827,9912,9.201602,0,0.0,0.0,25,3.258097,0.252220,50,3.931826,0.504439
GC3-LN11,3590,8.186186,10841,9.291183,0,0.0,0.0,42,3.761200,0.387418,75,4.330733,0.691818


In [15]:
# Displaying the QC metrics for each cell
analysis.adata.obs

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_rp,log1p_total_counts_rp,pct_counts_rp,total_counts_hb,log1p_total_counts_hb,pct_counts_hb
GC1-TT1,4584,8.430545,15044,9.618801,0,0.0,0.0,145,4.983607,0.963839,63,4.158883,0.418772
GC1-TT2,5153,8.547528,24631,10.111802,0,0.0,0.0,106,4.672829,0.430352,92,4.532599,0.373513
GC1-TT3,5633,8.636575,19521,9.879297,0,0.0,0.0,81,4.406719,0.414938,85,4.454347,0.435429
GC1-TT4,4958,8.508959,20751,9.940398,0,0.0,0.0,57,4.060443,0.274686,89,4.499810,0.428895
GC1-TT5,3960,8.284252,16139,9.689056,0,0.0,0.0,161,5.087596,0.997583,174,5.164786,1.078134
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GC3-LN8,3760,8.232440,9270,9.134647,0,0.0,0.0,24,3.218876,0.258900,100,4.615121,1.078749
GC3-LN9,4663,8.447629,11077,9.312716,0,0.0,0.0,38,3.663562,0.343053,74,4.317488,0.668051
GC3-LN10,4178,8.337827,9912,9.201602,0,0.0,0.0,25,3.258097,0.252220,50,3.931826,0.504439
GC3-LN11,3590,8.186186,10841,9.291183,0,0.0,0.0,42,3.761200,0.387418,75,4.330733,0.691818


In [16]:
# Displaying the QC metrics for each gene
analysis.adata.var

Unnamed: 0,mt,rp,hb,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts
THY1,False,False,False,22,0.904255,0.644091,76.595745,85,4.454347
DCN,False,False,False,32,0.936170,0.660712,65.957447,88,4.488636
COL1A1,False,False,False,36,1.489362,0.912026,61.702128,140,4.948760
COL1A2,False,False,False,56,2.765957,1.326002,40.425532,260,5.564520
COL6A1,False,False,False,13,0.712766,0.538110,86.170213,67,4.219508
...,...,...,...,...,...,...,...,...,...
RBMY2FP,False,False,False,7,0.159574,0.148053,92.553191,15,2.772589
DAZ1,False,False,False,3,0.031915,0.031416,96.808511,3,1.386294
DAZ4,False,False,False,3,0.031915,0.031416,96.808511,3,1.386294
DAZ2,False,False,False,5,0.053191,0.051825,94.680851,5,1.791759


In [17]:
# Normalizing the data (TPM and log2 transformation)
analysis.normalize_adata()

AttributeError: 'SingleCellProcessor' object has no attribute 'normalize_adata'

In [18]:
print(analysis.adata) # checking the shape of the AnnData object (cells x genes)
analysis.adata.to_df() # displaying the AnnData object as a dataframe

AnnData object with n_obs × n_vars = 94 × 21196
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_rp', 'log1p_total_counts_rp', 'pct_counts_rp', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'mt', 'rp', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'


Unnamed: 0,THY1,DCN,COL1A1,COL1A2,COL6A1,COL6A2,COL6A3,PECAM1,VWF,CDH5,...,RPS4Y2,RBMY2EP,TTTY13,TTTY6,TTTY5,RBMY2FP,DAZ1,DAZ4,DAZ2,DAZ3
GC1-TT1,0,0,0,1,0,0,1,10,11,0,...,0,0,0,0,0,0,0,0,0,1
GC1-TT2,0,0,0,1,0,0,0,1,3,0,...,0,0,0,0,2,0,0,0,0,1
GC1-TT3,0,0,0,1,0,0,12,0,14,0,...,0,0,2,0,0,0,1,0,0,0
GC1-TT4,0,0,0,10,0,0,1,2,0,4,...,0,1,0,0,0,3,0,1,0,1
GC1-TT5,0,0,0,5,0,0,13,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GC3-LN8,0,1,0,7,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,0,0
GC3-LN9,1,0,7,5,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
GC3-LN10,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
GC3-LN11,0,2,2,1,2,0,1,0,2,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# Filtering the data based on the QC metrics
analysis.filter_adata(ribosomal_threshold=50, mitochondrial_threshold=20) # 50% threshold for ribosomal genes and 20% threshold for mitochondrial genes

AttributeError: 'SingleCellProcessor' object has no attribute 'filter_adata'

In [None]:
print(analysis.adata) # checking the shape of the AnnData object (cells x genes)
analysis.adata.to_df() # displaying the AnnData object as a dataframe

In [None]:
analysis.prepare_adata() # preparing the AnnData object for further analysis
analysis.adata.obs # displaying the AnnData cell metadata after preparation

In [None]:
# Visualizing the top 20 most highly expressed genes
sc.pl.highest_expr_genes(analysis.adata, n_top=20)

In [None]:
# Performing PCA on the preprocessed dataset with the computed optimal number of components
analysis.prepare_pca_reduced_adata(variance_threshold=0.90)

In [None]:
# Plotting the PCA results in a pairplot
analysis.plot_pca()

In [None]:
analysis.perform_tsne(optimize=False, perplexity=5, n_iter=5000, random_state=42) # performing t-SNE on the preprocessed dataset with our chosen parameters
analysis.plot_tsne() # plotting the t-SNE results

In [None]:
# Defining the clustering methods
methods = ['gmm', 'average_link', 'ward', 'spectral', 'louvain', 'leiden']