In [1]:
import pandas as pd
import pysam
import csv
import sys
import re
import collections
import scanpy as sc
import os
import glob

import pandas as pd
from pandas import Series
import glob
import plotly
import plotly.express as px  # import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.io as pio
#pio.templates.default = 'plotly_white'

In [2]:
import numpy as np 

In [3]:
PATTERN = re.compile(r'(\S+?)\s*"(.*?)"')

In [4]:
    def get_properties_dict(properties_str):
        """
        allow no space after semicolon
        """
        
        if isinstance(properties_str, dict):
            return properties_str

        properties = collections.OrderedDict()
        attrs = properties_str.split(';')
        for attr in attrs:
            if attr:
                m = re.search(PATTERN, attr)
                if m:
                    key = m.group(1)
                    key = key.strip()
                    value = m.group(2)
                    value = value.strip()
                    properties[key] = value

        return properties

In [5]:
def gtf_reader_iter():
    with open("/SGRNJ06/randd/USER/cjj/filtered_ref/hs_ensembl_99/Homo_sapiens.GRCh38.99.gtf", mode='rt') as f:
        reader = csv.reader(f, delimiter='\t')
        for i, row in enumerate(reader, start=1):
            if len(row) == 0:
                continue
            if row[0].startswith('#'):
                yield row, True, None, None
                continue
            if len(row) != 9:
                sys.exit(f"Invalid number of columns in GTF line {i}: {row}\n")
            if row[6] not in ['+', '-']:
                sys.exit(f"Invalid strand in GTF line {i}: {row}\n")
            properties = get_properties_dict(row[8])
            annotation = row[2]
            if annotation == 'exon':
                if 'gene_id' not in properties:
                    raise GeneIdNotFound(f"Property 'gene_id' not found in GTF line {i}: {row}\n")
            yield row, False, annotation, properties

In [6]:
lncRNA_gene_set = set()
for _row, _is_comment, annotation, properties in gtf_reader_iter():
    if _is_comment:
        continue
    properties = dict(properties)
    if properties["gene_biotype"] == "lncRNA":
        gene_name = properties["gene_name"]
        lncRNA_gene_set.add(gene_name)
#     for key, value in properties.items():
#         if key == "gene_name":
#             gene_name = value
#         if value == "lncRNA":
#             lncRNA_gene_set.add(gene_name)

In [14]:
lncRNA_gene_set

True

In [8]:
dict(properties)

{'gene_id': 'ENSG00000268674',
 'gene_version': '2',
 'transcript_id': 'ENST00000601199',
 'transcript_version': '2',
 'exon_number': '1',
 'gene_name': 'AC213203.1',
 'gene_source': 'ensembl',
 'gene_biotype': 'protein_coding',
 'transcript_name': 'AC213203.1-201',
 'transcript_source': 'ensembl',
 'transcript_biotype': 'protein_coding',
 'tag': 'basic',
 'transcript_support_level': 'NA'}

In [9]:
len(lncRNA_gene_set)

16877

In [9]:
out = open("/SGRNJ06/randd/USER/cjj/celedev/rna/lncRNA/20231212/lncRNA.txt", 'w')
for i in lncRNA_gene_set:
    out.write(f"{i}\n")
out.close()

In [36]:
matrix_file = "/SGRNJ06/randd/USER/cjj/celedev/rna/lncRNA/20230830/G61459"

In [37]:
adata = sc.read_10x_mtx(matrix_file,var_names='gene_symbols',)

In [38]:
sample = matrix_file.split('/')[-1]

In [39]:
adata.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.5', 'AL627309.4', 'AP006222.2', 'AL732372.1',
       ...
       'AC007325.2', 'BX072566.1', 'AL354822.1', 'AC023491.2', 'AC004556.3',
       'AC233755.2', 'AC233755.1', 'AC240274.1', 'AC213203.4', 'AC213203.1'],
      dtype='object', length=37493)

In [40]:
adata.var["lncRNA"] = adata.var_names.str.upper().isin(lncRNA_gene_set)

In [41]:
adata.var

Unnamed: 0,gene_ids,lncRNA
MIR1302-2HG,ENSG00000243485,True
FAM138A,ENSG00000237613,True
OR4F5,ENSG00000186092,False
AL627309.1,ENSG00000238009,True
AL627309.3,ENSG00000239945,True
...,...,...
AC233755.2,ENSG00000277856,False
AC233755.1,ENSG00000275063,False
AC240274.1,ENSG00000271254,False
AC213203.4,ENSG00000277475,False


In [42]:
target = adata.var[adata.var["lncRNA"]==True]

In [43]:
target = set(target.index)

In [44]:
len(target)

16815

In [45]:
df = adata.to_df()

In [46]:
df

Unnamed: 0,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.5,AL627309.4,AP006222.2,AL732372.1,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.3,AC233755.2,AC233755.1,AC240274.1,AC213203.4,AC213203.1
AACACACAGAACGTCCAAGACCATCAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACACACAGACCGTACTCGTCCTGGTA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACACACAGACGTTACGACTTAGCGAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACACACAGAGCCACATATAGTAGAGC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACACACAGAGCCACATATGCTGCAGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGTGGACACTGACTTCCGTTCAAGCTG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TGTGGACACTGAGGCCTTCGTCAACTG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TGTGGACACTGAGGCCTTGACCATCAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TGTGGACACTGGTAACCGGACGTAGAG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0


In [47]:
df = df.loc[:, lambda d: d.columns.isin(target)]

In [48]:
df = df.replace(0, np.nan)
df = df.dropna(how='all', axis=0)
df = df.dropna(how='all', axis=1)
df = df.replace(np.nan, 0)

In [49]:
df.shape[1]

7606

In [53]:
df.T

Unnamed: 0,AACACACAGAACGTCCAAGACCATCAA,AACACACAGACCGTACTCGTCCTGGTA,AACACACAGACGTTACGACTTAGCGAC,AACACACAGAGCCACATATAGTAGAGC,AACACACAGAGCCACATATGCTGCAGT,AACACACAGAGGCTGTTGGCACTATTC,AACACACAGATACCAGTCAACAGGAAC,AACACACAGATACCAGTCACTGAAGCA,AACACACAGATGCCTAAGTAGAGTCGC,AACACACAGATGTATCGGACGACGTAT,...,TGTGGACACTACTCACCATGCTGCAGT,TGTGGACACTATCGTGCACTAGGTTGC,TGTGGACACTCCTGGCTTTATGCTCTC,TGTGGACACTCTTCTCAGCAAGGTGTT,TGTGGACACTCTTCTCAGTAGAGTCGC,TGTGGACACTGACTTCCGTTCAAGCTG,TGTGGACACTGAGGCCTTCGTCAACTG,TGTGGACACTGAGGCCTTGACCATCAA,TGTGGACACTGGTAACCGGACGTAGAG,TGTGGACACTGTCGGTGTCGACCTTAT
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AP006222.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC01409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LINC01694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL133493.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AJ011931.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FTCD-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
    df1 = df.T
    bc_set  = (set(df1.columns))
    result = []
    for bc in bc_set:
        result.append(total_lncRNA_gene - df1[bc].value_counts()[0])
    gene_per_cell.append(np.median(result))    

In [28]:
matrix_files = os.listdir("/SGRNJ06/randd/USER/cjj/celedev/rna/lncRNA/20231206/count")

In [29]:
matrix_files = [f"/SGRNJ06/randd/USER/cjj/celedev/rna/lncRNA/20231206/count/{i}" for i in matrix_files]

In [30]:
matrix_files = [i for i in matrix_files if os.path.isdir(i)]

In [31]:
matrix_files

['/SGRNJ06/randd/USER/cjj/celedev/rna/lncRNA/20231206/count/CR961431',
 '/SGRNJ06/randd/USER/cjj/celedev/rna/lncRNA/20231206/count/CL961431']

In [32]:
sample_list = []
total_lncRNA = []
gene_per_cell = []
for matrix_file in matrix_files:
    adata = sc.read_10x_mtx(matrix_file,var_names='gene_symbols',)
    sample = matrix_file.split('/')[-1]
    sample_list.append(sample)
    
    adata.var["lncRNA"] = adata.var_names.str.upper().isin(lncRNA_gene_set)
    target = adata.var[adata.var["lncRNA"]==True]
    target = set(target.index)
    df = adata.to_df()
    df = df.loc[:, lambda d: d.columns.isin(target)]
    df = df.replace(0, np.nan)
    df = df.dropna(how='all', axis=0)
    df = df.dropna(how='all', axis=1)
    df = df.replace(np.nan, 0)
    
    total_lncRNA_gene = df.shape[1]
    total_lncRNA.append(total_lncRNA_gene)
    
    
    df1 = df.T
    bc_set  = (set(df1.columns))
    result = []
    for bc in bc_set:
        result.append(total_lncRNA_gene - df1[bc].value_counts()[0])
    gene_per_cell.append(np.median(result))    

In [33]:
sample_list

['CR961431', 'CL961431']

In [34]:
total_lncRNA

[11447, 10674]

In [35]:
gene_per_cell

[55.0, 55.0]

In [None]:
adata = sc.read_h5ad("/SGRNJ06/randd/USER/cjj/celedev/rna/20231108/LPMFTBC231027012/04.analysis/LPMFTBC231027012.h5ad")

In [None]:
adata.obs.value_counts('cluster')