In [None]:
# wilcoxon using v3



import scanpy as sc
import anndata as ad
import pandas as pd
from scipy.sparse import csr_matrix, vstack
import random
import numpy as np
import random
from scipy.io import mmread, mmwrite
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go
#import igraph
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from itertools import combinations
import gzip

Read the GTF File

In this step, we open and read the compressed GTF file containing gene annotations using the gzip module. The data is loaded into a pandas DataFrame where each line represents an annotation entry. We specify that the file is tab-separated, comments start with #, and there is no header row.

In [None]:
# Read the GTF file
with gzip.open('D:/newgenes/data/Homo_sapiens.GRCh38.112.chr.gtf.gz', 'rt') as f:
    # Read it into a DataFrame
    df = pd.read_csv(f, sep='\t', comment='#', header=None)

# Filter for gene entries
genes = df[df[2] == 'gene']

# Extract relevant columns (chromosome, start, end, attributes)
gene_info = genes[[0, 3, 4, 8]]  # 0: chromosome, 3: start, 4: end, 8: attributes
gene_info['gene_id'] = gene_info[8].str.extract(r'gene_id "([^"]+)"')
gene_info['gene_name'] = gene_info[8].str.extract(r'gene_name "([^"]+)"')

# Final DataFrame with gene names, chromosome, start and end positions
final_genes = gene_info[[0, 'gene_name', 3, 4, "gene_id"]].drop_duplicates().reset_index(drop=True)

# Rename columns for clarity
final_genes = final_genes.rename(columns={0: "chromosome", 3: "start", 4: "end"})
final_genes = final_genes.rename(columns={"gene_name": "gene_short_name"})
final_genes["chromosome"] = final_genes["chromosome"].astype(str)

# Display the final DataFrame
print(final_genes)
just_loc = final_genes[["gene_short_name", "start", "end"]]

just_loc = just_loc.dropna(subset=['gene_short_name'])

# Step 1: Remove duplicates based on 'gene_short_name' and keep the first occurrence
just_loc = just_loc.drop_duplicates(subset='gene_short_name', keep='first')

  df = pd.read_csv(f, sep='\t', comment='#', header=None)


      chromosome gene_short_name    start      end          gene_id
0              1             NaN  2581560  2584533  ENSG00000228037
1              1          PRDM16  3069168  3438621  ENSG00000142611
2              1             NaN  5301928  5307394  ENSG00000284616
3              1           PEX10  2403964  2413797  ENSG00000157911
4              1             NaN  5492978  5494674  ENSG00000260972
...          ...             ...      ...      ...              ...
63081         MT          MT-ND6    14149    14673  ENSG00000198695
63082         MT           MT-TE    14674    14742  ENSG00000210194
63083         MT          MT-CYB    14747    15887  ENSG00000198727
63084         MT           MT-TT    15888    15953  ENSG00000210195
63085         MT           MT-TP    15956    16023  ENSG00000210196

[63086 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_info['gene_id'] = gene_info[8].str.extract(r'gene_id "([^"]+)"')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_info['gene_name'] = gene_info[8].str.extract(r'gene_name "([^"]+)"')


Load and Filter the Data

In this step, we load an H5AD file containing single-cell RNA-seq data using scanpy. The dataset is then filtered to exclude samples with the ID 115.010n. The code also reads a list of constitutively escaped genes from a CSV file, but in this case, the gene list is hardcoded as a Python list for further processing.

In [None]:
adata = ad.read_h5ad('D:/newgenes/data/scanorama_full_leiden_v3_fil_no_unk.h5ad')
adata = adata[adata.obs["sampleID"] != "115.010n"]

# Read the CSV file into a DataFrame
#cons_escape_genes = pd.read_csv('D:/newgenes/data/escape_genes_constitutive.csv', header=None)
cons_escape_genes = ['PLCXD1', 'GTPBP6', 'PPP2R3B', 'SHOX', 'CSF2RA', 'IL3RA', 'SLC25A6', 'ASMTL', 'P2RY8', 'AKAP17A', 'ASMT', 'DHRSX', 'ZBED1', 'CD99', 'GYG2', 'ARSD', 'MXRA5', 'PRKX', 'NLGN4X', 'STS', 'PUDP', 'PNPLA4', 'ANOS1', 'FAM9C', 'TCEANC', 'RAB9A', 'TRAPPC2', 'OFD1', 'GPM6B', 'GEMIN8', 'CA5B', 'ZRSR2', 'AP1S2', 'S100G', 'CTPS2', 'SYAP1', 'TXLNG', 'RBBP7', 'EIF1AX', 'EIF2S3', 'ZFX', 'CXorf38', 'USP9X', 'DDX3X', 'FUNDC1', 'KDM6A', 'UBA1', 'CDK16', 'KDM5C', 'IQSEC2', 'SMC1A', 'RPS4X', 'JPX', 'HTR2C', 'SPRY3', 'VAMP7', 'IL9R']
# Convert the DataFrame to a list
cons_eg = cons_escape_genes

# Now `values_list` contains the values from the CSV
print(cons_eg)

Make Gene Names Unique and Prepare Data

In this cell, we ensure that the gene names in the adata object are unique by using the var_names_make_unique() function. We also add a new column for the gene short names. Additionally, we load a CSV file containing variable information (dis_var_fil.csv), which includes gene names and chromosome data, and filter it to only retain the relevant columns: gene_short_name and chromosome.

In [None]:
adata.var_names_make_unique()
adata.var["gene_short_name"] = adata.var.index
var_test = pd.read_csv("D:/newgenes/data/dis_var_fil.csv")

var_test["gene_short_name"] = var_test["gene_short_name"].values
var_test = var_test[["gene_short_name", "chromosome"]]

Identify and Filter Duplicates

This cell identifies duplicate values in the gene_short_name column of the var_test DataFrame. First, we mark the duplicated gene names, keeping only the first occurrence. Then, we extract the gene names that appear exactly once (unique values). Finally, the DataFrame is filtered to retain only the unique values and the first occurrences of any duplicates.

In [None]:
# Identify duplicated values
duplicates = var_test.duplicated('gene_short_name', keep='first')

# Find non-duplicated values (unique values appearing exactly once)
value_counts = var_test['gene_short_name'].value_counts()
unique_values = value_counts[value_counts == 1].index

# Filter DataFrame to include unique values and first occurrences of duplicates
var_test = var_test[var_test['gene_short_name'].isin(unique_values) | ~duplicates]

Merge and Add Chromosome Information

In this step, the var_test DataFrame is merged with the adata.var DataFrame on the gene_short_name column to add chromosome information to the adata object. The merge is performed using a left join to ensure all entries in adata.var are retained. Duplicates are then removed, and the chromosome data is added to the adata.var DataFrame.

In [None]:
test_123 = pd.merge(adata.var, var_test, on = 'gene_short_name', how='left')
test_123 = test_123.drop_duplicates()
adata.var["chromosome"] = test_123["chromosome"].values

Create and Merge Gene List

In this cell, a new DataFrame par_df is created using the list of constitutively escaped genes (cons_eg), with a new column indicating that these genes are "escape" genes. This DataFrame is then merged with the adata.var DataFrame to add the escape gene information. The merge is performed using a left join to ensure that all genes in adata.var are retained.

In [None]:
# List of genes

# Create a DataFrame
par_df = pd.DataFrame(cons_eg, columns=['gene_short_name'])
par_df['Is_escape'] = True

par_full = pd.merge(adata.var, par_df, on="gene_short_name", how = "left")

Group and Analyze Cluster Information

In this cell, the leiden column is converted to an integer type and used for further analysis. The cao_obs subset is created by filtering the data based on the "origin" column. Then, the data is grouped by leiden and Main_cluster_name, and the count of rows in each group is calculated. Percentages of each cluster are computed relative to the total number of rows for each leiden value.

Additionally, the most common Main_cluster_name for each leiden is identified. The resulting data is sorted, and the leiden column is re-categorized based on this sorted order to ensure the correct cluster sequence. Finally, the results are merged with the counts data, and unnecessary columns are dropped for clarity.

In [None]:
adata.obs["leiden"] = adata.obs["leiden"].astype(int) 
cao_obs = adata.obs[adata.obs["origin"] == "Cao"]
# Step 1: Group by 'leiden' and 'Main_cluster_name', and count rows
counts = cao_obs.groupby(['leiden', 'Main_cluster_name']).size().reset_index(name='count')
counts["leiden"] = counts["leiden"].astype(int)
# Step 2: Calculate total counts per leiden value
total_counts = cao_obs.groupby('leiden').size()

# Step 3: Compute percentages
counts['total_rows'] = counts['leiden'].map(total_counts)  # Map total counts to each row
counts['percentage'] = (counts['count'] / counts['total_rows']) * 100
# Step 1: Find the most common Main_cluster_name for each leiden
most_common = cao_obs.groupby('leiden')['Main_cluster_name'].agg(lambda x: x.mode()[0]).reset_index(name='most_common')

# Step 2: Calculate total counts per leiden value
total_counts = cao_obs.groupby('leiden').size()

# Step 3: Compute percentages correctly
most_common['total_rows'] = most_common['leiden'].map(total_counts)  # Map total counts to each row

# Display the result
most_common = most_common.sort_values(by='most_common')




# Get a list of the values in the "leiden" column in the new order
cluster_order = most_common['leiden'].tolist()
most_common_list = most_common["most_common"].tolist()
cluster_len = len(cluster_order)

adata.obs["leiden"] = adata.obs["leiden"].astype(int).astype("category")
adata.obs['leiden'] = adata.obs['leiden'].cat.set_categories(cluster_order)

# Merge `most_common` with `counts` on ('leiden', 'most_common') using a left join
result = pd.merge(most_common, counts, left_on=['leiden', 'most_common'], right_on=['leiden', 'Main_cluster_name'], how='left')
result.drop("Main_cluster_name", axis=1, inplace=True)
result.drop("total_rows_y", axis=1, inplace=True)

# Rename column 'A' to 'new_column_name'
result.rename(columns={'total_rows_x': 'total_cells'}, inplace=True)

  counts = cao_obs.groupby(['leiden', 'Main_cluster_name']).size().reset_index(name='count')


Assign Predicted Cell Types and Prepare Data Subsets

This cell creates a DataFrame cluster_celltype that maps each leiden cluster to a predicted cell type, and merges it with the adata.obs DataFrame to assign the predicted cell types to the cells in adata. The predicted_cell_type column is then added to adata.obs.

Additionally, subsets of the adata object are created for different origins ("Disteche", "Ian", and "Cao") to facilitate further analysis.

Finally, the gene_short_name columns in the var_test and par_df DataFrames are renamed to Gene for consistency and to align with the gene naming conventions in the other DataFrames.

In [None]:
cluster_celltype = pd.DataFrame({'leiden': cluster_order, 'predicted_cell_type': most_common_list})

merged_df = pd.merge(adata.obs, cluster_celltype, on='leiden', how='left')

adata.obs['predicted_cell_type'] = merged_df["predicted_cell_type"].values

adata_dis = adata[adata.obs["origin"] == "Disteche"]
adata_ian = adata[adata.obs["origin"] == "Ian"]
adata_cao = adata[adata.obs["origin"] == "Cao"]

var_test.rename(columns={'gene_short_name': 'Gene'}, inplace=True)
par_df.rename(columns={'gene_short_name': 'Gene'}, inplace=True)


Prepare and Modify Cell Type List

In this step, a list of unique predicted cell types is extracted from the adata.obs["predicted_cell_type"] column and stored in cell_type_list. The list is then sorted, and a predefined list of cell types (cell_type_list1) is added for further analysis. The cell type "SLC24A4_PEX5L positive cells" is removed from cell_type_list to ensure it is excluded from subsequent analysis.

In [None]:
cell_type_list = adata.obs["predicted_cell_type"].unique().tolist()
cell_type_list1 = ["Inhibitory neurons", "Limbic system neurons","Microglia", "Oligodendrocytes", "Purkinje neurons", "Unipolar brush cells", "Vascular endothelial cells"]
cell_type_list.sort()
cell_type_list.remove("SLC24A4_PEX5L positive cells")

Filter the var DataFrame and Update adata_dis

In this step, we access the var DataFrame of the adata_dis object and filter out genes located on the mitochondrial chromosome (chromosome == 'MT'). The filtered DataFrame is used to update the adata_dis object, keeping only the genes that pass this condition.

Next, a second filter is applied to remove rows where the chromosome information is missing (NaN). The adata_dis_fil object is updated once again, retaining only the genes that have valid chromosome information.

In [None]:
# Access the var dataframe
var_df = adata_dis.var

# Apply the filtering condition
filtered_var_df = var_df[~(var_df['chromosome'] == 'MT')]

# Update the dis_adata object with the filtered var dataframe
adata_dis_fil = adata_dis[:, filtered_var_df.index].copy()

# Access the var dataframe
var_df = adata_dis_fil.var

# Apply the filtering condition
filtered_var_df = var_df.dropna(subset=['chromosome'])

# Update the dis_adata object with the filtered var dataframe
adata_dis_fil = adata_dis_fil[:, filtered_var_df.index].copy()

Merge Gene Location and Filter by Age Group

In this cell, the just_loc DataFrame (which contains gene location data) is merged with the var DataFrame of the adata_dis_fil object to add the gene location information based on the gene_short_name. The index of the var DataFrame is then updated to use the gene_short_name.

Next, the data is split into two subsets based on the age column: one for individuals aged 105 or younger (adata_dis_fh) and another for those aged 110 or older (adata_dis_sh).

The normalize_total function is applied to normalize each subset to a total count of 1 million, and the log1p transformation is performed to log-transform the data.

In [None]:
adata_dis_fil.var = pd.merge(adata_dis_fil.var, just_loc, on='gene_short_name', how='left')
adata_dis_fil.var.index = adata_dis_fil.var["gene_short_name"].values

adata_dis_fh = adata_dis_fil[adata_dis_fil.obs["age"] <= 105].copy()
adata_dis_sh = adata_dis_fil[adata_dis_fil.obs["age"] > 110].copy()

sc.pp.normalize_total(adata_dis_fh, target_sum=1e6)
sc.pp.log1p(adata_dis_fh)

sc.pp.normalize_total(adata_dis_sh, target_sum=1e6)
sc.pp.log1p(adata_dis_sh)

Differential Gene Expression Analysis for Cell Types and Genotypes

In this section, we perform differential gene expression analysis for different cell types and genotypes (XXY vs XX and XY). The analysis is carried out for each cell type in the cell_type_list. For each cell type, the expression data is filtered based on the genotype, and the Wilcoxon rank-sum test is used to identify differentially expressed genes between the specified genotypes.

The following steps are performed:

    Gene Expression Calculation: The mean gene expression for each genotype is calculated and added to the var DataFrame.
    Wilcoxon Test: The Wilcoxon rank-sum test is applied to detect differentially expressed genes between the selected genotypes.
    Merging Results: The test results are merged with additional gene information such as chromosome, start, and end positions, along with gene names and expression values.
    Filtering and Sorting: Genes are filtered based on adjusted p-value < 0.05, and the data is sorted by log fold change. Positive and negative results are handled separately.
    Saving Results: The results for significant and non-significant genes are saved as CSV files for both positive and negative log fold changes.

This analysis is repeated for each combination of cell type and genotype, with the results saved for further inspection.

In [None]:
# all DEG code
# For comparing X and XXY to other genotypes
# most updated code


for cell_type in cell_type_list:
    print(cell_type)
    for geno in ["XXY"]:
        if geno == "X":
            adata_dis_fil_cont = adata_dis_fh.copy()
        else:
            adata_dis_fil_cont = adata_dis_sh.copy()
        
        for cont in ["XX", "XY"]:
            if cont == "XX":
                adata_dis_fil_fin = adata_dis_fil_cont[adata_dis_fil_cont.obs["genotype"] != "XY"]
            else:
                adata_dis_fil_fin = adata_dis_fil_cont[adata_dis_fil_cont.obs["genotype"] != "XX"]
            
            adata_ct = adata_dis_fil_fin[adata_dis_fil_fin.obs["predicted_cell_type"] == cell_type]
            if len(adata_ct.obs["genotype"].value_counts()) < 2:
                continue
            sc.pp.filter_genes(adata_ct, min_cells=10)
            print(adata_ct.obs["genotype"].value_counts())
            # Step 1: Extract the expression matrix and the observation DataFrame
            X = adata_ct.X.todense()
            obs = adata_ct.obs
            var = adata_ct.var
            
            # Step 2: Convert the expression matrix to a DataFrame
            expression_df = pd.DataFrame(X, index=obs.index, columns=var.index)
            
            # Step 3: Group by genotype and calculate the mean expression for each gene
            mean_expression = expression_df.groupby(obs['genotype']).mean()
            
            # Step 4: Append mean values to the var DataFrame
            # Adding columns for each genotype
            for genotype in mean_expression.index:
                var[f'mean_expression_{genotype}'] = mean_expression.loc[genotype].values
            
            sc.tl.rank_genes_groups(adata_ct, 'genotype', groups=[geno], method='wilcoxon', key_added="wilcoxon")
            
            wilcoxon_results = adata_ct.uns['wilcoxon']
            names = list(itertools.chain.from_iterable(wilcoxon_results['names'].tolist()))
            pvals = list(itertools.chain.from_iterable(wilcoxon_results['pvals'].tolist()))  
            pvals_adj = list(itertools.chain.from_iterable(wilcoxon_results['pvals_adj'].tolist())) 
            logfoldchanges = list(itertools.chain.from_iterable(wilcoxon_results['logfoldchanges'].tolist())) 

            results_df = pd.DataFrame({
                'Gene': names,
                'Log Fold Change': logfoldchanges,
                'P-value': pvals,
                'Adjusted P-value': pvals_adj,
            })
            add_info = adata_ct.var[["gene_short_name", "start", "end", f"mean_expression_{geno}", f"mean_expression_{cont}"]]
            # Merging with additional dataframes
            sc_wilcoxon = results_df
            sc_wilcoxon = pd.merge(sc_wilcoxon, var_test, on="Gene", how="left")
            sc_wilcoxon = pd.merge(sc_wilcoxon, par_df, on="Gene", how="left")

            sc_wilcoxon = sc_wilcoxon.merge(add_info, left_on='Gene', right_on='gene_short_name', how='left')
            sc_wilcoxon.drop(columns=["gene_short_name"], inplace=True)

            # Create a mask to filter out rows where any of the required columns are NaN
            mask = sc_wilcoxon[['chromosome', 'start', 'end']].notna().all(axis=1)
            
            # Create loc_tag only for rows that meet the criteria
            sc_wilcoxon.loc[mask, "location_tag"] = (
                "chr" + sc_wilcoxon.loc[mask, "chromosome"].astype(str) + ":" +
                sc_wilcoxon.loc[mask, "start"].astype(float).astype(int).astype(str) + "-" +
                sc_wilcoxon.loc[mask, "end"].astype(float).astype(int).astype(str)
            )
            
            # Fill NaN for loc_tag where the mask is False
            sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


            adj = sc_wilcoxon[sc_wilcoxon["Adjusted P-value"] < 0.05]
            print(f"Number of statistically significant genes between {geno} and {cont} for cell type: {cell_type}", len(adj)) # number of differentially expressed genes that are statistically significant    

            # # # Filter for Log Fold Change > 0
            positive_df = sc_wilcoxon[sc_wilcoxon["Log Fold Change"] > 0]
            negative_df = sc_wilcoxon[sc_wilcoxon["Log Fold Change"] < 0]

            # # Combine positive results into one DataFrame
            pos_significant = positive_df[positive_df["Adjusted P-value"] < 0.05]
            pos_significant = pos_significant.sort_values("Log Fold Change", ascending = False)
            pos_non_significant = positive_df[positive_df["Adjusted P-value"] >= 0.05]
            pos_non_significant = pos_non_significant.sort_values("Log Fold Change", ascending = False)
            combined_pos_df = pd.concat([pos_significant, pos_non_significant])

            

            
            # # Combine negative results into one DataFrame
            neg_significant = negative_df[negative_df["Adjusted P-value"] < 0.05]
            neg_significant = neg_significant.sort_values("Log Fold Change", ascending = True)
            neg_non_significant = negative_df[negative_df["Adjusted P-value"] >= 0.05]
            neg_non_significant = neg_non_significant.sort_values("Log Fold Change", ascending = True)
            combined_neg_df = pd.concat([neg_significant, neg_non_significant])

            # create loop that does this for all files in the wilcoxon_all folder



            # # Save combined positive results
            combined_pos_df.to_csv(f'D:/newgenes/results/wilcoxon_xxy/{cell_type}_{geno}_{cont}_combined_positive.csv', index=False)

            # # Save combined negative results
            combined_neg_df.to_csv(f'D:/newgenes/results/wilcoxon_xxy/{cell_type}_{geno}_{cont}_combined_negative.csv', index=False)

Astrocytes


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


genotype
XX     1575
XXY     143
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


Number of statistically significant genes between XXY and XX for cell type: Astrocytes 11
genotype
XY     175
XXY    143
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XY for cell type: Astrocytes 1
Excitatory neurons


  adata.var["n_cells"] = number


genotype
XX     53437
XXY    11745
Name: count, dtype: int64


  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XX for cell type: Excitatory neurons 3010


  adata.var["n_cells"] = number


genotype
XY     49486
XXY    11745
Name: count, dtype: int64


  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XY for cell type: Excitatory neurons 3680
Granule neurons


  adata.var["n_cells"] = number


genotype
XX     104
XXY     70
Name: count, dtype: int64


  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon[

Number of statistically significant genes between XXY and XX for cell type: Granule neurons 0
genotype
XXY    70
XY     20
Name: count, dtype: int64
Number of statistically significant genes between XXY and XY for cell type: Granule neurons 3
Inhibitory interneurons


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


genotype
XX     120
XXY     56
Name: count, dtype: int64
Number of statistically significant genes between XXY and XX for cell type: Inhibitory interneurons 2
genotype
XXY    56
XY      9
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XY for cell type: Inhibitory interneurons 0
Inhibitory neurons


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


genotype
XX     6842
XXY    2826
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XX for cell type: Inhibitory neurons 1471


  adata.var["n_cells"] = number


genotype
XY     7437
XXY    2826
Name: count, dtype: int64


  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XY for cell type: Inhibitory neurons 1335
Limbic system neurons
Microglia


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


genotype
XX     95
XXY    24
Name: count, dtype: int64
Number of statistically significant genes between XXY and XX for cell type: Microglia 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


genotype
XY     42
XXY    24
Name: count, dtype: int64
Number of statistically significant genes between XXY and XY for cell type: Microglia 0
Oligodendrocytes


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


genotype
XX     1456
XXY     236
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


Number of statistically significant genes between XXY and XX for cell type: Oligodendrocytes 49
genotype
XY     494
XXY    236
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Number of statistically significant genes between XXY and XY for cell type: Oligodendrocytes 8
Purkinje neurons


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


genotype
XX     35
XXY    22
Name: count, dtype: int64
Number of statistically significant genes between XXY and XX for cell type: Purkinje neurons 2
Unipolar brush cells


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the o

genotype
XX     92
XXY    14
Name: count, dtype: int64
Number of statistically significant genes between XXY and XX for cell type: Unipolar brush cells 0
genotype
XXY    14
XY     13
Name: count, dtype: int64
Number of statistically significant genes between XXY and XY for cell type: Unipolar brush cells 0
Vascular endothelial cells


  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


genotype
XX     1241
XXY     153
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)
  adata.var["n_cells"] = number
  mean_expression = expression_df.groupby(obs['genotype']).mean()


Number of statistically significant genes between XXY and XX for cell type: Vascular endothelial cells 44
genotype
XY     546
XXY    153
Name: count, dtype: int64
Number of statistically significant genes between XXY and XY for cell type: Vascular endothelial cells 188


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sc_wilcoxon["location_tag"].fillna("NaN", inplace=True)


Cumulative Differential Gene Expression Results for Genotype Comparisons

This section performs differential gene expression analysis between different genotypes (e.g., "X" vs. "XX") for each cell type in the cell_type_list. The key steps are as follows:

    Data Filtering: The data is filtered by genotype and age group. The analysis is performed for the "X" genotype against "XX", and further comparisons are possible based on other genotypes.
    Wilcoxon Rank-Sum Test: The Wilcoxon rank-sum test is used to identify differentially expressed genes between the selected genotypes. The results are stored in a DataFrame containing gene names, log fold change, p-value, and adjusted p-value.
    Merging Results: The results for each cell type are merged into a cumulative DataFrame. This includes log fold changes, p-values, and other relevant information, organized by cell type and genotype.
    Result Compilation: The cumulative results are compiled for each genotype and cell type, storing them in the dictionary sig_gene_dict_sc.
    Printing Results: The number of statistically significant genes for each genotype comparison and cell type is printed.

By iterating over the genotypes and cell types, this script creates a comprehensive list of genes differentially expressed across various conditions, which can be saved for further analysis.

In [None]:
## to make cumulative page X XX

sig_gene_dict_sc = {
    "X_XX": [],
    "X_XY": [],
    "XXY_XX": [],
    "XXY_XY": []
}

def add_dataframes(compiled_df, new_df, cell_type):
    # Rename columns to include cell_type as prefix
    #print("print og_new_df")
    #print(new_df)
    new_df = new_df.rename(columns={
        "Log Fold Change": f"{cell_type} Log Fold Change",
        "P-value": f"{cell_type} P-value",
        "Adjusted P-Value": f"{cell_type} Adjusted P-value"
    })
    #print("First new_df")
    #print(new_df)
    #print(new_df.columns)
    # Ensure only relevant columns are kept
    new_df = new_df[["Gene", f"{cell_type} Log Fold Change", f"{cell_type} P-value", f"{cell_type} Adjusted P-value"]]
    #print("second new_df")
    #print(new_df)
    # Merge with the previously compiled dataframe
    if compiled_df is None:
        compiled_df = new_df
    else:
        compiled_df = pd.merge(compiled_df, new_df, on="Gene", how="outer")
    
    return compiled_df

x_xx = None

for cell_type in cell_type_list:
    for geno in ["X"]: #, "XXY"]:
        if geno == "X":
            adata_dis_fil = adata_dis[adata_dis.obs["age"] <= 105]
        else:
            adata_dis_fil = adata_dis[adata_dis.obs["age"] > 110]
        for cont in ["XX"]:#, "XY"]:
            if cont == "XX":
                adata_dis_fil_cont = adata_dis_fil[adata_dis_fil.obs["genotype"] != "XY"]
            else:
                adata_dis_fil_cont = adata_dis_fil[adata_dis_fil.obs["genotype"] != "XX"]
            #print(adata_dis_fil_cont.obs["genotype"].unique())
            adata_ct = adata_dis_fil_cont[adata_dis_fil_cont.obs["predicted_cell_type"] == cell_type] # iterate through each cell type
            if len(adata_ct.obs["genotype"].value_counts()) < 2:
                continue
            #sc.pp.filter_genes(adata_ct, min_counts=1)
            #print(adata_ct)
            sc.pp.normalize_total(adata_ct)
            sc.pp.log1p(adata_ct)
            print(cell_type, geno, cont)
            print(adata_ct.obs["genotype"].value_counts())

            sc.tl.rank_genes_groups(adata_ct, 'genotype', groups=[cont], method='wilcoxon', key_added = "wilcoxon")
            #print(adata_ct)

            # Accessing dictionaries from adata_ct.uns['wilcoxon']
            wilcoxon_results = adata_ct.uns['wilcoxon']

            # Extracting names, pvals_adj, and logfoldchanges
            names = list(itertools.chain.from_iterable(wilcoxon_results['names'].tolist()))
            pvals = list(itertools.chain.from_iterable(wilcoxon_results['pvals'].tolist()))  
            pvals_adj = list(itertools.chain.from_iterable(wilcoxon_results['pvals_adj'].tolist())) 
            logfoldchanges = list(itertools.chain.from_iterable(wilcoxon_results['logfoldchanges'].tolist())) 
            
            # Creating a DataFrame
            results_df = pd.DataFrame({
                'Gene': names,
                'Log Fold Change': logfoldchanges,
                'P-value': pvals,
                'Adjusted P-Value': pvals_adj,
            })
            
            sc_wilcoxon = results_df
            sc_wilcoxon = pd.merge(sc_wilcoxon, var_test, on="Gene", how="left")
            sc_wilcoxon = pd.merge(sc_wilcoxon, par_df, on="Gene", how="left")
            sc_wilcoxon = sc_wilcoxon.sort_values('Gene')

            
            #print(sc_wilcoxon)
            x_xx = add_dataframes(x_xx, sc_wilcoxon, cell_type)

            # sc_wilcoxon = sc_wilcoxon[sc_wilcoxon["Adjusted P-Value"] < 0.01]
            # sc_wilcoxon = sc_wilcoxon.sort_values('Log Fold Change', ascending = False)
            # sc_head = sc_wilcoxon.head(20)
            # sc_head = sc_head[sc_head["Log Fold Change"] > 0]
            # sc_tail = sc_wilcoxon.tail(20)
            # sc_tail = sc_tail[sc_tail["Log Fold Change"] < 0]

            # sc_wilcoxon_non_sig = results_df
            # sc_wilcoxon_non_sig = pd.merge(sc_wilcoxon_non_sig, var_test, on="Gene", how="left")
            # sc_wilcoxon_non_sig = pd.merge(sc_wilcoxon_non_sig, par_df, on="Gene", how="left")
            # sc_wilcoxon_non_sig = sc_wilcoxon_non_sig.sort_values('Log Fold Change', ascending = False)

            # if len(sc_head) < 20:
            #     remainder_h = sc_wilcoxon_non_sig.head(20 - len(sc_head))
            #     # Combine DataFrames row-wise
            #     result_head = pd.concat([sc_head, remainder_h], axis=0, ignore_index=True)
            #     result_head.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_head20.csv", index=False)

            # else:
            #     sc_head.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_head20.csv", index=False)
            
            # if len(sc_tail) < 20:
            #     remainder_t = sc_wilcoxon_non_sig.tail(20 - len(sc_tail))
            #     # Combine DataFrames row-wise
            #     result_tail = pd.concat([sc_tail, remainder_t], axis=0, ignore_index=True)
            #     result_tail.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_tail20.csv", index=False)

            # else:
            #     sc_tail.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_tail20.csv", index=False)

            print(f"Number of statistically significant genes between {geno} and {cont} for cell type: {cell_type}", len(sc_wilcoxon)) # number of differentially expressed genes that are statistically significant             
            # sc_par = sc_wilcoxon_non_sig[sc_wilcoxon_non_sig["Is_escape"] == True]
            # sc_par.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_escape.csv", index=False)




  view_to_actual(adata)


Astrocytes X XX
genotype
X     6408
XX    3036
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Astrocytes 28683


  view_to_actual(adata)


Excitatory neurons X XX
genotype
XX    31605
X     21507
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Excitatory neurons 28683


  view_to_actual(adata)


Granule neurons X XX
genotype
XX    1045
X       73
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Granule neurons 28683


  view_to_actual(adata)


Inhibitory interneurons X XX
genotype
XX    515
X      65
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Inhibitory interneurons 28683


  view_to_actual(adata)


Inhibitory neurons X XX
genotype
X     41806
XX    21537
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Inhibitory neurons 28683


  view_to_actual(adata)


Limbic system neurons X XX
genotype
X     6151
XX    4132
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Limbic system neurons 28683


  view_to_actual(adata)


Microglia X XX
genotype
X     192
XX    114
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Microglia 28683


  view_to_actual(adata)


Oligodendrocytes X XX
genotype
X     3396
XX    1472
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Oligodendrocytes 28683


  view_to_actual(adata)


Purkinje neurons X XX
genotype
XX    3475
X        5
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Purkinje neurons 28683


  view_to_actual(adata)


SLC24A4_PEX5L positive cells X XX
genotype
X     38
XX    28
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: SLC24A4_PEX5L positive cells 28683


  view_to_actual(adata)


Unipolar brush cells X XX
genotype
XX    56
X     35
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Unipolar brush cells 28683


  view_to_actual(adata)


Vascular endothelial cells X XX
genotype
X     1691
XX    1042
Name: count, dtype: int64
Number of statistically significant genes between X and XX for cell type: Vascular endothelial cells 28683


In [None]:
x_xx = pd.merge(x_xx, var_test, on="Gene", how="left")
x_xx = pd.merge(x_xx, par_df, on="Gene", how="left")

In [None]:
x_xx.to_csv("D:/newgenes/results/wilcoxon/x_xx.csv", index=False)

Cumulative Differential Gene Expression Analysis for XXY vs. XY

This section performs differential gene expression analysis comparing the "XXY" genotype against "XY" for each cell type in the cell_type_list. The key steps and actions performed in this process are outlined below:

    Filtering Data by Genotype and Age: The adata_dis dataset is filtered based on genotype and age group. Specifically, this analysis compares the "XXY" genotype to the "XY" genotype.

    Gene Expression Normalization: The expression data for each cell type is normalized using sc.pp.normalize_total() and log-transformed with sc.pp.log1p() to adjust for any potential bias in sequencing depth.

    Wilcoxon Rank-Sum Test: For each cell type, a Wilcoxon rank-sum test is applied to determine differentially expressed genes between "XXY" and "XY". The test outputs include gene names, log fold changes, p-values, and adjusted p-values.

    Data Merging: The results from the Wilcoxon test for each cell type are merged into a cumulative DataFrame. The merged results include:
        Gene names
        Log Fold Change
        P-value
        Adjusted P-value
        Information from var_test and par_df dataframes, including additional gene annotations (e.g., start, end, chromosome).

    Result Compilation: After processing all cell types and genotypes, the results are compiled into a single DataFrame (xxy_xy) which contains the merged statistics across all cell types.

    Saving Results: The final results for the comparison between "XXY" and "XY" are saved to a CSV file (xxy_xy.csv), located in the specified directory.

In [None]:
## to make cumulative page XXY XY

sig_gene_dict_sc = {
    "X_XX": [],
    "X_XY": [],
    "XXY_XX": [],
    "XXY_XY": []
}

def add_dataframes(compiled_df, new_df, cell_type):
    # Rename columns to include cell_type as prefix
    #print("print og_new_df")
    #print(new_df)
    new_df = new_df.rename(columns={
        "Log Fold Change": f"{cell_type} Log Fold Change",
        "P-value": f"{cell_type} P-value",
        "Adjusted P-Value": f"{cell_type} Adjusted P-value"
    })
    #print("First new_df")
    #print(new_df)
    #print(new_df.columns)
    # Ensure only relevant columns are kept
    new_df = new_df[["Gene", f"{cell_type} Log Fold Change", f"{cell_type} P-value", f"{cell_type} Adjusted P-value"]]
    #print("second new_df")
    #print(new_df)
    # Merge with the previously compiled dataframe
    if compiled_df is None:
        compiled_df = new_df
    else:
        compiled_df = pd.merge(compiled_df, new_df, on="Gene", how="outer")
    
    return compiled_df

xxy_xy = None

for cell_type in cell_type_list:
    for geno in ["XXY"]: #, "XXY"]:
        if geno == "X":
            adata_dis_fil = adata_dis[adata_dis.obs["age"] <= 105]
        else:
            adata_dis_fil = adata_dis[adata_dis.obs["age"] > 110]
        for cont in ["XY"]:#, "XY"]:
            if cont == "XX":
                adata_dis_fil_cont = adata_dis_fil[adata_dis_fil.obs["genotype"] != "XY"]
            else:
                adata_dis_fil_cont = adata_dis_fil[adata_dis_fil.obs["genotype"] != "XX"]
            #print(adata_dis_fil_cont.obs["genotype"].unique())
            adata_ct = adata_dis_fil_cont[adata_dis_fil_cont.obs["predicted_cell_type"] == cell_type] # iterate through each cell type
            if len(adata_ct.obs["genotype"].value_counts()) < 2:
                continue
            #sc.pp.filter_genes(adata_ct, min_counts=1)
            #print(adata_ct)
            sc.pp.normalize_total(adata_ct)
            sc.pp.log1p(adata_ct)
            print(cell_type, geno, cont)
            print(adata_ct.obs["genotype"].value_counts())

            sc.tl.rank_genes_groups(adata_ct, 'genotype', groups=[cont], method='wilcoxon', key_added = "wilcoxon")
            #print(adata_ct)

            # Accessing dictionaries from adata_ct.uns['wilcoxon']
            wilcoxon_results = adata_ct.uns['wilcoxon']

            # Extracting names, pvals_adj, and logfoldchanges
            names = list(itertools.chain.from_iterable(wilcoxon_results['names'].tolist()))
            pvals = list(itertools.chain.from_iterable(wilcoxon_results['pvals'].tolist()))  
            pvals_adj = list(itertools.chain.from_iterable(wilcoxon_results['pvals_adj'].tolist())) 
            logfoldchanges = list(itertools.chain.from_iterable(wilcoxon_results['logfoldchanges'].tolist())) 
            
            # Creating a DataFrame
            results_df = pd.DataFrame({
                'Gene': names,
                'Log Fold Change': logfoldchanges,
                'P-value': pvals,
                'Adjusted P-Value': pvals_adj,
            })
            print(len(results_df))
            sc_wilcoxon = results_df
            print(len(sc_wilcoxon))
            sc_wilcoxon = pd.merge(sc_wilcoxon, var_test, on="Gene", how="left")
            print(len(sc_wilcoxon))
            sc_wilcoxon = pd.merge(sc_wilcoxon, par_df, on="Gene", how="left")
            print(len(sc_wilcoxon))
            sc_wilcoxon = sc_wilcoxon.sort_values('Gene')
            print(len(sc_wilcoxon))

            
            #print(sc_wilcoxon)
            xxy_xy = add_dataframes(xxy_xy, sc_wilcoxon, cell_type)

            # sc_wilcoxon = sc_wilcoxon[sc_wilcoxon["Adjusted P-Value"] < 0.01]
            # sc_wilcoxon = sc_wilcoxon.sort_values('Log Fold Change', ascending = False)
            # sc_head = sc_wilcoxon.head(20)
            # sc_head = sc_head[sc_head["Log Fold Change"] > 0]
            # sc_tail = sc_wilcoxon.tail(20)
            # sc_tail = sc_tail[sc_tail["Log Fold Change"] < 0]

            # sc_wilcoxon_non_sig = results_df
            # sc_wilcoxon_non_sig = pd.merge(sc_wilcoxon_non_sig, var_test, on="Gene", how="left")
            # sc_wilcoxon_non_sig = pd.merge(sc_wilcoxon_non_sig, par_df, on="Gene", how="left")
            # sc_wilcoxon_non_sig = sc_wilcoxon_non_sig.sort_values('Log Fold Change', ascending = False)

            # if len(sc_head) < 20:
            #     remainder_h = sc_wilcoxon_non_sig.head(20 - len(sc_head))
            #     # Combine DataFrames row-wise
            #     result_head = pd.concat([sc_head, remainder_h], axis=0, ignore_index=True)
            #     result_head.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_head20.csv", index=False)

            # else:
            #     sc_head.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_head20.csv", index=False)
            
            # if len(sc_tail) < 20:
            #     remainder_t = sc_wilcoxon_non_sig.tail(20 - len(sc_tail))
            #     # Combine DataFrames row-wise
            #     result_tail = pd.concat([sc_tail, remainder_t], axis=0, ignore_index=True)
            #     result_tail.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_tail20.csv", index=False)

            # else:
            #     sc_tail.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_tail20.csv", index=False)

            print(f"Number of statistically significant genes between {geno} and {cont} for cell type: {cell_type}", len(sc_wilcoxon)) # number of differentially expressed genes that are statistically significant             
            # sc_par = sc_wilcoxon_non_sig[sc_wilcoxon_non_sig["Is_escape"] == True]
            # sc_par.to_csv(f"D:/newgenes/results/wilcoxon/{cell_type}_{geno}_{cont}_escape.csv", index=False)


xxy_xy = pd.merge(xxy_xy, var_test, on="Gene", how="left")
xxy_xy = pd.merge(xxy_xy, par_df, on="Gene", how="left")
xxy_xy.to_csv("D:/newgenes/results/wilcoxon/xxy_xy.csv", index=False)

  view_to_actual(adata)


Astrocytes XXY XY
genotype
XY     175
XXY    143
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Astrocytes 28683


  view_to_actual(adata)


Excitatory neurons XXY XY
genotype
XY     49486
XXY    11745
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Excitatory neurons 28683
Granule neurons XXY XY
genotype
XXY    70
XY     20
Name: count, dtype: int64


  view_to_actual(adata)


28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Granule neurons 28683
Inhibitory interneurons XXY XY
genotype
XXY    56
XY      9
Name: count, dtype: int64


  view_to_actual(adata)


28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Inhibitory interneurons 28683


  view_to_actual(adata)


Inhibitory neurons XXY XY
genotype
XY     7437
XXY    2826
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Inhibitory neurons 28683


  view_to_actual(adata)


Microglia XXY XY
genotype
XY     42
XXY    24
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Microglia 28683


  view_to_actual(adata)


Oligodendrocytes XXY XY
genotype
XY     494
XXY    236
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Oligodendrocytes 28683


  view_to_actual(adata)


SLC24A4_PEX5L positive cells XXY XY
genotype
XXY    4
XY     2
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: SLC24A4_PEX5L positive cells 28683


  view_to_actual(adata)


Unipolar brush cells XXY XY
genotype
XXY    14
XY     13
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Unipolar brush cells 28683


  view_to_actual(adata)


Vascular endothelial cells XXY XY
genotype
XY     546
XXY    153
Name: count, dtype: int64
28683
28683
28683
28683
28683
Number of statistically significant genes between XXY and XY for cell type: Vascular endothelial cells 28683
