# Aim of the script

In this script, all the reads comming from the cutadapt steps are processed in order to generate tables linking the different cells to their enhancer. <br>
- First, it converts fastq reads to pandas dataframe.
- Second, it removes all rows with a different enhancer barcode than the true one. Plus it removes the rows with a different cell barcode to the ones found in the selected clustering version.
- Third, it computes for each cell barcode the different enhancer barcodes found and their respective frequency. It also select the first and second most common enhancer barcodes found.
- Last, it generates a cell-enhancer pairs table used by Seurat and novoSpaRc. <br>

Because it is quite heavy to run, two shortcuts to reload the processed dataframes are present in the script after step 1 and 3

The input files are stored in the **data/cutadapt_output** folder. <br> 
The output files are generated and stored in the **data/cell_enhancer_pairs** folder.

## Loading packages

In [1]:
from collections import Counter
import collections
from Bio import SeqIO
from Bio import AlignIO
from Bio import Align
import pandas
import numpy
import matplotlib.pyplot as plt
import csv

## Loading cells and enhancers dictionaries

**dico_enhancers** is a table containing the 25 true enhancer barcodes. <br>
**dico_cells** are tables containing the cell barcodes present after the seurat analysis. Either with the full or the reduced clustering.

In [2]:
dico_enhancers=pandas.read_csv("dico_enhancers_python.tsv",sep="\t",header=None, names=["noms","seq"])
dico_cells_full=pandas.read_csv("../../R_analyses/id_cells_6_int_full_clusters.csv")
dico_cells_reduced=pandas.read_csv("../../R_analyses/id_cells_6_int_reduced_clusters.csv")

## Step1 - Converting fastq reads to pandas dataframe

In [4]:
id=[]
seq=[]
for record1 in SeqIO.parse("../../data/cutadapt_output/trimming_libS6.1/E_BC.fastq", "fastq"):
    id.append(record1.id)
    seq.append(str(record1.seq))
df_id_enhancer=pandas.DataFrame({'id':id,'enhancer_BC':seq})
df_id_enhancer.to_csv('preprocessed_tables/DF_reads_ID_enhancer_6_1.tsv',sep='\t',index=False)

In [5]:
id=[]
seq=[]
for record1 in SeqIO.parse("../../data/cutadapt_output/trimming_libS6.2/E_BC.fastq", "fastq"):
    id.append(record1.id)
    seq.append(str(record1.seq))
df_id_enhancer=pandas.DataFrame({'id':id,'enhancer_BC':seq})
df_id_enhancer.to_csv('preprocessed_tables/DF_reads_ID_enhancer_6_2.tsv',sep='\t',index=False)

In [6]:
id=[]
seq=[]
for record1 in SeqIO.parse("../../data/cutadapt_output/trimming_libS6.3/E_BC.fastq", "fastq"):
    id.append(record1.id)
    seq.append(str(record1.seq))
df_id_enhancer=pandas.DataFrame({'id':id,'enhancer_BC':seq})
df_id_enhancer.to_csv('preprocessed_tables/DF_reads_ID_enhancer_6_3.tsv',sep='\t',index=False)

In [7]:
id=[]
cell=[]
UMI=[]
for record2 in SeqIO.parse("../../data/cutadapt_output/trimming_libS6.1/C_BC.fastq", "fastq"):
    id.append(record2.id)
    seq=str(record2.seq)
    cell.append("rep1_"+seq[:16])
    UMI.append(seq[16:])
df_id_cell_umi=pandas.DataFrame({'id':id,'cell_BC':cell,'UMI':UMI})
df_id_cell_umi.to_csv('preprocessed_tables/DF_reads_ID_cell_UMI_6_1.tsv',sep="\t", index=False)

In [8]:
id=[]
cell=[]
UMI=[]
for record2 in SeqIO.parse("../../data/cutadapt_output/trimming_libS6.2/C_BC.fastq", "fastq"):
    id.append(record2.id)
    seq=str(record2.seq)
    cell.append("rep2_"+seq[:16])
    UMI.append(seq[16:])
df_id_cell_umi=pandas.DataFrame({'id':id,'cell_BC':cell,'UMI':UMI})
df_id_cell_umi.to_csv('preprocessed_tables/DF_reads_ID_cell_UMI_6_2.tsv',sep="\t", index=False)

In [9]:
id=[]
cell=[]
UMI=[]
for record2 in SeqIO.parse("../../data/cutadapt_output/trimming_libS6.3/C_BC.fastq", "fastq"):
    id.append(record2.id)
    seq=str(record2.seq)
    cell.append("rep3_"+seq[:16])
    UMI.append(seq[16:])
df_id_cell_umi=pandas.DataFrame({'id':id,'cell_BC':cell,'UMI':UMI})
df_id_cell_umi.to_csv('preprocessed_tables/DF_reads_ID_cell_UMI_6_3.tsv',sep="\t", index=False)

### Shortcut 1

In [10]:
df_id_enhancer_6_1=pandas.read_csv('preprocessed_tables/DF_reads_ID_enhancer_6_1.tsv',sep='\t')
df_id_cell_umi_6_1=pandas.read_csv('preprocessed_tables/DF_reads_ID_cell_UMI_6_1.tsv',sep='\t')
df_id_enhancer_6_2=pandas.read_csv('preprocessed_tables/DF_reads_ID_enhancer_6_2.tsv',sep='\t')
df_id_cell_umi_6_2=pandas.read_csv('preprocessed_tables/DF_reads_ID_cell_UMI_6_2.tsv',sep='\t')
df_id_enhancer_6_3=pandas.read_csv('preprocessed_tables/DF_reads_ID_enhancer_6_3.tsv',sep='\t')
df_id_cell_umi_6_3=pandas.read_csv('preprocessed_tables/DF_reads_ID_cell_UMI_6_3.tsv',sep='\t')

## Step 2 - removing rows based on known cells and enhancer barcodes

In [11]:
df_id_enhancer_cell_umi_6_1=pandas.merge(df_id_enhancer_6_1,df_id_cell_umi_6_1,on='id')
df_id_enhancer_cell_umi_6_2=pandas.merge(df_id_enhancer_6_2,df_id_cell_umi_6_2,on='id')
df_id_enhancer_cell_umi_6_3=pandas.merge(df_id_enhancer_6_3,df_id_cell_umi_6_3,on='id')

Removing the rows with a wrong enhancer barcode

In [12]:
df_id_enhancer_cell_umi_6_1=df_id_enhancer_cell_umi_6_1[df_id_enhancer_cell_umi_6_1['enhancer_BC'].isin(dico_enhancers['seq'])]
df_id_enhancer_cell_umi_6_2=df_id_enhancer_cell_umi_6_2[df_id_enhancer_cell_umi_6_2['enhancer_BC'].isin(dico_enhancers['seq'])]
df_id_enhancer_cell_umi_6_3=df_id_enhancer_cell_umi_6_3[df_id_enhancer_cell_umi_6_3['enhancer_BC'].isin(dico_enhancers['seq'])]

Removing the rows with a cell barcode not found in the seurat analysis. Here you can select if you are working on the full or reduced clustering

In [13]:
df_id_enhancer_cell_umi_sc_cells_6_1_full=df_id_enhancer_cell_umi_6_1[df_id_enhancer_cell_umi_6_1['cell_BC'].isin(dico_cells_full["x"].tolist())]
df_id_enhancer_cell_umi_sc_cells_6_2_full=df_id_enhancer_cell_umi_6_2[df_id_enhancer_cell_umi_6_2['cell_BC'].isin(dico_cells_full["x"].tolist())]
df_id_enhancer_cell_umi_sc_cells_6_3_full=df_id_enhancer_cell_umi_6_3[df_id_enhancer_cell_umi_6_3['cell_BC'].isin(dico_cells_full["x"].tolist())]

In [14]:
df_id_enhancer_cell_umi_sc_cells_6_1_reduced=df_id_enhancer_cell_umi_6_1[df_id_enhancer_cell_umi_6_1['cell_BC'].isin(dico_cells_reduced["x"].tolist())]
df_id_enhancer_cell_umi_sc_cells_6_2_reduced=df_id_enhancer_cell_umi_6_2[df_id_enhancer_cell_umi_6_2['cell_BC'].isin(dico_cells_reduced["x"].tolist())]
df_id_enhancer_cell_umi_sc_cells_6_3_reduced=df_id_enhancer_cell_umi_6_3[df_id_enhancer_cell_umi_6_3['cell_BC'].isin(dico_cells_reduced["x"].tolist())]

## Step 3 - Listing the enhancer barcodes found for each cells

For this function you have to select both the selected seurat clustering version (full/reduced) and how to process the samples (individual/merged). <br>
The function will then generates a table recapitalting enhancer information for each cell barcode.

In [33]:
def step3(library_processing="individual",seurat_clustering_version="full"):
    if (library_processing=="individual") & (seurat_clustering_version=="full"):
        list_obj=[df_id_enhancer_cell_umi_sc_cells_6_1_full,df_id_enhancer_cell_umi_sc_cells_6_2_full,df_id_enhancer_cell_umi_sc_cells_6_3_full]
    elif (library_processing=="merged") & (seurat_clustering_version=="full"):
        list_obj=[pandas.concat([df_id_enhancer_cell_umi_sc_cells_6_1_full,df_id_enhancer_cell_umi_sc_cells_6_2_full,df_id_enhancer_cell_umi_sc_cells_6_3_full])]
    elif (library_processing=="individual") & (seurat_clustering_version=="reduced"):
        list_obj=[df_id_enhancer_cell_umi_sc_cells_6_1_reduced,df_id_enhancer_cell_umi_sc_cells_6_2_reduced,df_id_enhancer_cell_umi_sc_cells_6_3_reduced]
    elif (library_processing=="merged") & (seurat_clustering_version=="reduced"):
        list_obj=[pandas.concat([df_id_enhancer_cell_umi_sc_cells_6_1_reduced,df_id_enhancer_cell_umi_sc_cells_6_2_reduced,df_id_enhancer_cell_umi_sc_cells_6_3_reduced])]
    j=1
    for object in list_obj:
        df_cell_enhancer_freq=pandas.DataFrame(columns=['cell_BC','list_enhancer_BC','nb_enhancer_BC','1st_enhancer_BC','freq_1st_enhancer_BC','2nd_enhancer_BC','freq_2nd_enhancer_BC'])
        i=0
        for cell_BC in object['cell_BC'].unique():
            i=i+1
            freq_seq_enhancers=Counter(object['enhancer_BC'][object['cell_BC']==cell_BC])
            nb=len(freq_seq_enhancers)
            sorted_freq_seq = sorted(freq_seq_enhancers.items(), key=lambda x:x[1])
            if nb==0:
                continue
            elif nb == 1 & sorted_freq_seq[0][1]>5:
                list_enhancer_bc=freq_seq_enhancers.keys()
                first_elem=sorted_freq_seq[0]
                df_cell_enhancer_freq.loc[len(df_cell_enhancer_freq)]=[cell_BC,list_enhancer_bc,nb,first_elem[0],first_elem[1],"null",0]
            elif nb>=2:
                list_enhancer_bc=freq_seq_enhancers.keys()
                first_elem=sorted_freq_seq[-1]
                second_elem=sorted_freq_seq[-2]
                if first_elem[1]>=second_elem[1]*10:
                    df_cell_enhancer_freq.loc[len(df_cell_enhancer_freq)]=[cell_BC,list_enhancer_bc,nb,first_elem[0],first_elem[1],second_elem[0],second_elem[1]]
        l1=[]
        l2=[]
        for i in range(len(df_cell_enhancer_freq)):
            l1.append(dico_enhancers['noms'][df_cell_enhancer_freq.loc[i,"1st_enhancer_BC"]==dico_enhancers['seq']].item())
            l2.append(dico_enhancers['noms'][df_cell_enhancer_freq.loc[i,"2nd_enhancer_BC"]==dico_enhancers['seq']].item())
        df_cell_enhancer_freq['nom_1st_enhancer']=l1
        df_cell_enhancer_freq['nom_2nd_enhancer']=l2
        df_cell_enhancer_freq.to_csv(path_or_buf=library_processing+"_"+seurat_clustering_version+"/PCR_table_rep"+str(j)+".tsv",sep='\t',index=False)
        j+=1

In [34]:
step3("full","merged")

je commence
object
j'y suis presque


OSError: Cannot save file into a non-existent directory: 'full_merged'

### Shortcut 2

In [None]:
#big_table_6_1_full=pandas.read_csv('../../data/trimming_PCR/PCR_table_full_individual_rep1.tsv',sep='\t')
#big_table_6_2_full=pandas.read_csv('../../data/trimming_PCR/PCR_table_full_individual_rep2.tsv',sep='\t')
#big_table_6_3_full=pandas.read_csv('../../data/trimming_PCR/PCR_table_full_individual_rep3.tsv',sep='\t')
big_table_merged_full=pandas.read_csv('merged_full/PCR_table.tsv',sep='\t')

#big_table_6_1_reduced=pandas.read_csv('../../data/trimming_PCR/PCR_table_reduced_individual_rep1.tsv',sep='\t')
#big_table_6_2_reduced=pandas.read_csv('../../data/trimming_PCR/PCR_table_reduced_individual_rep2.tsv',sep='\t')
#big_table_6_3_reduced=pandas.read_csv('../../data/trimming_PCR/PCRtable_reduced_individual_rep3.tsv',sep='\t')
big_table_merged_reduced=pandas.read_csv('merged_reduced/PCR_table.tsv',sep='\t')

### Generates the frequency tables
Again just run the lines you need depending on the files generated previously

In [None]:
freq_enhancers_6_1_full=Counter(big_table_6_1_full['nom_1st_enhancer'])
freq_enhancers_6_2_full=Counter(big_table_6_2_full['nom_1st_enhancer'])
freq_enhancers_6_3_full=Counter(big_table_6_3_full['nom_1st_enhancer'])
freq_enhancers_merged_full=Counter(big_table_merged_full['nom_1st_enhancer'])

freq_enhancers_6_1_reduced=Counter(big_table_6_1_reduced['nom_1st_enhancer'])
freq_enhancers_6_2_reduced=Counter(big_table_6_2_reduced['nom_1st_enhancer'])
freq_enhancers_6_3_reduced=Counter(big_table_6_3_reduced['nom_1st_enhancer'])
freq_enhancers_merged_reduced=Counter(big_table_merged_reduced['nom_1st_enhancer'])

## Step 4 - output formating

This last step is necessary to generate the table used by Seurat and novoSpaRc

In [None]:
def step4(seurat_clustering_version="full",library_processing="individual"):
    if library_processing=="individual" & seurat_clustering_version=="full":
        list_big=[big_table_6_1_full,big_table_6_2_full,big_table_6_3_full]
        list_freq=[freq_enhancers_6_1_full,freq_enhancers_6_2_full,freq_enhancers_6_3_full]
    elif library_processing=="merged" & seurat_clustering_version=="full":
        list_big=[big_table_merged_full]
        list_freq=[freq_enhancers_merged_full]
    elif library_processing=="individual" & seurat_clustering_version=="reduced":
        list_big=[big_table_6_1_reduced,big_table_6_2_reduced,big_table_6_3_reduced]
        list_freq=[freq_enhancers_6_1_reduced,freq_enhancers_6_2_reduced,freq_enhancers_6_3_reduced]
    elif library_processing=="merged" & seurat_clustering_version=="reduced":
        list_big=[big_table_merged_reduced]
        list_freq=[freq_enhancers_merged_reduced]

    i=0
    for object in list_freq:
        df_list_cells_with_enhancer=pandas.DataFrame(columns=["enhancer","liste"])
        for cle in object.keys():
            list_cell = list_big[i]['cell_BC'][list_big[i]['nom_1st_enhancer']==cle].to_list()
            new_list=[]
            new_list=str(list_cell).replace("[","").replace("'","").replace(" ","").replace("]","").replace("\"","")
            df_list_cells_with_enhancer.loc[len(df_list_cells_with_enhancer)]=[cle,new_list]
        df_list_cells_with_enhancer.to_csv('../../data/trimming_PCR/cell_enhancer_pairs_'+seurat_clustering_version+'_'+library_processing+'_rep'+str(i+1)+'.tsv',sep='\t',index=False)
        i+=1