In [None]:
##### rsID to var ID conversion using genopyc tool 

In [None]:
import genopyc as gp 
import pandas as pd

In [None]:
# Load the rsID CSV into a DataFrame.
rsid_csv_path = "/nfs/amishra/Arpit/rsID18k_onecol.csv"
df = pd.read_csv(rsid_csv_path)

In [None]:
# Keep only the "rsid" column (result stays a one-column DataFrame).
df = df.loc[:, ["rsid"]]

In [None]:
# Convert every rsID to a variant ID with genopyc.
rsid_values = df["rsid"]
variants_to_study_ot = gp.variantId_mapping(
    rsid_values,
    source="rsid",
    target="variantid",
)

In [None]:
# Put the mapped variant IDs into a one-column table and write it to CSV.
converted_csv_path = "/nfs/amishra/Arpit/18k_rsid_converted_to15kvariant.csv"
variantdf = pd.DataFrame(data=variants_to_study_ot)
variantdf.columns = ["variant"]
variantdf.to_csv(converted_csv_path)

In [None]:
##### Variant ID to target gene using the otargen V2G pipeline (this cell is an R code snippet)

# Load necessary libraries
library(otargen)
library(purrr)
library(dplyr)
library(readr)



# Load the table of variant IDs before using it.
# The original cell read `df$variant` without ever creating `df` in the R
# session (in base R the name `df` resolves to the F-distribution density
# function), so it could not run from a fresh session.
# NOTE(review): the path is assumed to be the CSV written by the Python
# rsID -> variant ID conversion cell (it has a "variant" column) — confirm.
df <- read_csv("/nfs/amishra/Arpit/18k_rsid_converted_to15kvariant.csv")

# Extract the variant IDs into a vector, skipping missing entries so that no
# lookup is attempted for an NA variant ID
variant_ids <- df$variant
variant_ids <- variant_ids[!is.na(variant_ids)]

# Define a function to retrieve gene information for a variant ID
get_gene_info <- function(variant_id) {
  genesForVariant(variant_id = variant_id)
}

# Wrap the function with purrr::safely so a failing lookup returns
# list(result = NULL, error = <condition>) instead of aborting the whole run
safe_get_gene_info <- safely(get_gene_info)

# Apply the safe function to the list of variant IDs
results <- map(variant_ids, safe_get_gene_info)

# Extract the results (successful calls); failed calls contribute NULL
successful_results <- map(results, "result")

# Drop the NULL entries (where the lookup failed)
successful_results_filtered <- compact(successful_results)

# Combine results into a data frame if needed
final_results <- bind_rows(successful_results_filtered)

# Print the combined results and save the per-variant results as RDS
print(final_results)
saveRDS(successful_results_filtered, file = "/nfs/amishra/Arpit/15kv2g.rds")

# Flatten one nesting level of the per-variant results, then row-bind the pieces
flat_results <- unlist(successful_results_filtered, recursive = FALSE)
results_df <- bind_rows(flat_results)

# Write results to a CSV file
write.csv(results_df, file = "/nfs/amishra/Arpit/MS_atac_cutandrun_diffpeak/15kmprabackground_multivariantcall4.csv", row.names = FALSE)

In [None]:
#### List of genes expressed in any T cell type from DICE, used to filter for T cell-expressed genes

In [None]:
import pandas as pd
import os
from urllib.parse import urlparse

# List of file URLs (one DICE TPM table per T cell population)
file_urls = [
    "https://dice-database.org/download/CD4_NAIVE_TPM.csv",
    "https://dice-database.org/download/CD8_NAIVE_TPM.csv",
    "https://dice-database.org/download/CD8_STIM_TPM.csv",
    "https://dice-database.org/download/CD4_STIM_TPM.csv",
    "https://dice-database.org/download/TH2_TPM.csv",
    "https://dice-database.org/download/TH17_TPM.csv",
    "https://dice-database.org/download/THSTAR_TPM.csv",
    "https://dice-database.org/download/TREG_MEM_TPM.csv",
    "https://dice-database.org/download/TREG_NAIVE_TPM.csv",
    "https://dice-database.org/download/TFH_TPM.csv"]

# A gene is kept for a cell type when its median TPM is at least this value.
MIN_MEDIAN_TPM = 1


def load_expressed_genes(source, label, min_median_tpm=MIN_MEDIAN_TPM):
    """Return the genes of one TPM table whose median TPM passes the threshold.

    Parameters
    ----------
    source : str
        Local path or URL of the TPM CSV (anything ``pd.read_csv`` accepts).
    label : str
        Value stored in the ``Source`` column of the result.
    min_median_tpm : float
        Minimum row-wise median TPM (inclusive) for a gene to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``geneSymbol``, ``MedianTPM`` and ``Source``.
    """
    tpm_raw = pd.read_csv(source, index_col=None)

    # Split "Additional_annotations" on ";" into two new trailing columns.
    tpm_raw[["Genename", "Type"]] = tpm_raw.Additional_annotations.str.split(";", expand=True)

    # Drop the first three columns and the last one ("Type"); "Genename" stays.
    # NOTE(review): assumes the first three columns are non-sample metadata — confirm.
    # .copy() gives an independent frame, so adding a column below does not
    # trigger pandas' chained-assignment warning.
    sample_tpm = tpm_raw.iloc[:, 3:-1].copy()
    sample_tpm["MedianTPM"] = sample_tpm.median(numeric_only=True, axis=1)

    passes_threshold = sample_tpm["MedianTPM"] >= min_median_tpm
    return (
        sample_tpm.loc[passes_threshold, ["Genename", "MedianTPM"]]
        .rename(columns={"Genename": "geneSymbol"})
        .assign(Source=label)
    )


# One filtered DataFrame per TPM table
data_frames = []

for url in file_urls:
    # Extract file name from URL
    file_name = os.path.basename(urlparse(url).path)

    # Prefer an already-downloaded copy in the working directory; otherwise
    # let pandas fetch the table from the URL (needs network access).
    source = file_name if os.path.exists(file_name) else url

    data_frames.append(load_expressed_genes(source, label=file_name))

# Merge all the data frames into a single DataFrame
merged_df = pd.concat(data_frames, ignore_index=True)

# Sort by geneSymbol
merged_df = merged_df.sort_values(by='geneSymbol')

# Print the merged DataFrame
print(merged_df)

# Write the merged DataFrame to a CSV file
merged_df.to_csv("/nfs/amishra/Arpit/Tcell_DICEpopulation_merged_medianTPM_data.csv", index=False)


In [None]:
#### polars to filter v2g gene list based on relevant cell type expression 

import polars as pl


# Load the V2G results CSV with Polars
df = pl.read_csv("/nfs/amishra/Arpit/MS_atac_cutandrun_diffpeak/15kmprabackground_multivariantcall4.csv")

# Keep only the rows where 'typeId' is present (drop rows with a null typeId)
dfx = df.filter(pl.col('typeId').is_not_null())

# Select relevant columns
dfy = dfx.select(['gene.symbol', 'variant', 'tissues_distance', 'sourceId', 'tissues_name', 'typeId'])

# Create the 'Combined' column as "<variant>_<gene.symbol>".
# `with_columns` replaces `DataFrame.with_column`, which was deprecated and
# later removed from polars.
dfy = dfy.with_columns(
    (pl.col('variant').cast(pl.Utf8) + "_" + pl.col('gene.symbol').cast(pl.Utf8)).alias('Combined')
)


# List of tissues names to match
tissue_names = [
    "Blood (eQTLGen)", "Blood (GTEx v8)", "Blood (Lepik 2017)", "Blood (TwinsUK)", "Blood plasma (Sun2018)",
    "CD4+ T cell (Blueprint)", "CD4+ T cell (CEDAR)", "CD4+ T cell (Kasela 2017)", "CD4+ T cell (Schmiedel 2018)",
    "CD4+ T cell anti-CD3-CD28 4h (Schmiedel 2018)", "CD8+ T cell (CEDAR)", "CD8+ T cell (Kasela 2017)",
    "CD8+ T cell (Schmiedel 2018)", "CD8+ T cell anti-CD3-CD28 4h (Schmiedel 2018)", "Foetal thymus",
    "Gtex-sqtl-whole blood", "Naive CD4", "Naive CD8", "T cell (GENCORD)", "Tfh cell (Schmiedel 2018)",
    "Th1 cell (Schmiedel 2018)", "Th17 cell (Schmiedel 2018)", "Th2 cell (Schmiedel 2018)", "Thymus",
    "Total CD4 Activated", "Total CD4 MF", "Total CD4 NonActivated", "Total CD8", "Treg (Schmiedel 2018)",
    "Treg memory (Schmiedel 2018)"
]

# Type IDs to which the tissue filter applies
type_ids = ["eqtl", "pqtl", "sqtl", "pchic"]

# Filter rows: a row whose typeId is in `type_ids` is kept only when its tissue
# is in `tissue_names`; rows with any other typeId are kept unconditionally.
# Author's note: filtering every row on tissue name would throw away the
# distance-based v2g rows, because they have no tissue name. Remaining NAs are
# removed later in pandas.
is_listed_type = pl.col("typeId").is_in(type_ids)
is_listed_tissue = pl.col("tissues_name").is_in(tissue_names)
filtered_df = dfy.filter((is_listed_type & is_listed_tissue) | (~is_listed_type))

# Display the filtered DataFrame and write it to CSV
print(filtered_df)
filtered_df.write_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/15kvariant_v2goutput.csv")

# Write the distinct gene symbols that survived the filter
unique_genes = filtered_df.select(pl.col("gene.symbol").unique())
unique_genes.write_csv("/nfs/amishra/Arpit/raylab15kvariant_uniquegenesbasedontcellbloodfilter.csv")

### by overlapping v2g output with T cell expressed genes we generate master v2g list. 


In [None]:
#### Create the different background and foreground gene lists from the master v2g output

In [None]:
##### run this chunk with genopyc kernel
import genopyc as gp 
import pandas as pd
### Prepare the v2g table from the v2g pipeline output, and the per-screen rsID
### lists (emvars, DHS, etc.) from the Ray lab file
### "240626_TGWAS_paper_rsids_per_screen_results.xlsx"
v2g15k = pd.read_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/15kvariant_v2goutput.csv")
v2g15k = v2g15k[v2g15k['typeId'].notna()].copy()  # remove rows without a typeId
dhs_emvars = pd.read_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/sanity_checkrun07282024/240626_TGWAS_paper_rsids_per_screen_results_JR (1).csv")

##### Prepare the expression filter using the DICE database
dice_exp = pd.read_csv("/nfs/amishra/Arpit/Tcell_DICEpopulation_merged_medianTPM_data.csv")
dice_exp.columns = ['gene.symbol', 'MedianTPM', 'Source']


def rsids_to_variant_df(rsids):
    """Map a Series of rsIDs to a one-column ("variant") DataFrame.

    Missing rsIDs are dropped before the lookup, and falsy mapping results
    (e.g. ``None``) are discarded afterwards. The original cell did this for
    only some of the rsID lists; it is now applied to all of them.
    Passing ``columns=`` to the constructor also keeps an empty mapping result
    from raising a length-mismatch error.
    """
    mapped = gp.variantId_mapping(rsids.dropna(), source="rsid", target="variantid")
    return pd.DataFrame([variant for variant in mapped if variant], columns=["variant"])


def expressed_genes_for_variants(variant_df, v2g, expression):
    """Link variants to genes via ``v2g`` and keep genes present in ``expression``.

    Returns a tuple ``(v2g_hits, expression_hits, unique_genes_df)``:
    the v2g rows for the given variants, the expression rows for the genes of
    those v2g rows, and a one-column ("gene.symbol") frame of the distinct
    genes found in ``expression_hits``.
    """
    v2g_hits = v2g.merge(variant_df, how="inner", on="variant")
    v2g_genes_df = pd.DataFrame(v2g_hits["gene.symbol"].unique(), columns=["gene.symbol"])
    expression_hits = expression.merge(v2g_genes_df, how="inner", on="gene.symbol")
    unique_genes_df = pd.DataFrame(expression_hits["gene.symbol"].unique(), columns=["gene.symbol"])
    return v2g_hits, expression_hits, unique_genes_df


##### Proliferation network background (proliferationbg): emvars_dhs_in_sc using all 56 scCRISPR v2g hits
dhs_emvars_56_varid_df = rsids_to_variant_df(dhs_emvars["emvars_dhs_in_sc_screens"])
(
    dhs_emvars_56_varid_v2g,
    proliferation_bg_dice,
    proliferation_bg_dice_uniquegenes_df,
) = expressed_genes_for_variants(dhs_emvars_56_varid_df, v2g15k, dice_exp)
proliferation_bg_dice_uniquegenes_df.to_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/sanity_checkrun07282024/sanitycheck_newlistJR_1/proliferation_hit_dice_uniquegenesbg.csv")

##### Proliferation network foreground (proliferationfg): "emVars+DHS+Prolif (FDR<0.1)", proliferation hits only
dhs_emvars_prol17_varid_df = rsids_to_variant_df(dhs_emvars["emVars+DHS+Prolif (FDR<0.1)"])
(
    dhs_emvars_prol17_varid_v2g,
    proliferation_fg_dice,
    proliferation_fg_dice_uniquegenes_df,
) = expressed_genes_for_variants(dhs_emvars_prol17_varid_df, v2g15k, dice_exp)
proliferation_fg_dice_uniquegenes_df.to_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/sanity_checkrun07282024/sanitycheck_newlistJR_1/proliferation_hit_dice_uniquegenesfg.csv")

##### Big network background (bignetworkbg): all the variants in DHS v2g output
dhs_var_varid_df = rsids_to_variant_df(dhs_emvars["all_t_cell_DHS_var"])
(
    dhs_var_varid_df_v2g,
    var_inDHS_andDICE,
    var_inDHS_andDICEv2g_uniquegenes_df,
) = expressed_genes_for_variants(dhs_var_varid_df, v2g15k, dice_exp)
var_inDHS_andDICEv2g_uniquegenes_df.to_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/sanity_checkrun07282024/sanitycheck_newlistJR_1/var_inDHSandDICE_uniquegenes_bignetworkbg.csv")

##### Big network foreground (bignetworkfg): emvars in DHS v2g output
emvar_in_DHSvarid_df = rsids_to_variant_df(dhs_emvars["emVars+DHS"])
(
    emvar_in_DHSvarid_df_v2g,
    emvar_inDHS_andDICE,
    emvar_inDHS_andDICE_uniquegenes_df,
) = expressed_genes_for_variants(emvar_in_DHSvarid_df, v2g15k, dice_exp)
emvar_inDHS_andDICE_uniquegenes_df.to_csv("/nfs/amishra/Arpit/Raylab_Network_analysis/sanity_checkrun07282024/sanitycheck_newlistJR_1/emvar_inDHSandDICE_uniquegenes_bignetworkfg.csv")


