In [14]:
import os
import sys
import shutil
import random
import glob
from collections import Counter
import pandas as pd
import numpy as np
from scipy import stats
import subprocess
from multiprocessing import Pool

In [3]:
# Map m6a-related eQTL SNPs to the splicing sites
##############################################################################################
eqtl_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
sp_bed = "/home/galaxy/data/splicing_sites/splice_sites_flank.bed"
result_dir = "/home/galaxy/project/QTL_analysis/related_with_splicing/eQTL"
##############################################################################################
eqtl_list = glob.glob("%s/*.bed" % eqtl_dir)
for eqtl in eqtl_list:
    result_file = os.path.join(result_dir, os.path.basename(eqtl).replace(".bed", ".bed"))
    os.system("bedtools intersect -a %s -b %s -wa -wb > %s" % (eqtl, sp_bed, result_file))

In [10]:
# Stat the number of gene isoforms
#~2min
#################################################################################
result_dir = "/home/galaxy/project/QTL_analysis/related_with_splicing"
human_gff = "/data/database/GRCh38/GENCODE/gencode.v27.annotation.gff3"
res_count = os.path.join(result_dir, "isoform_count.txt")
gene_bed = os.path.join(result_dir, "GRCh38_gene.bed")
# human_gff = "/home/galaxy/data/splicing_sites/test.gff3"
####################################################################################
names = ["Chr", "DB", "Feature", "Start", "End", "Pot_1", "Strand", "Pot_2", "Info"]
df = pd.read_table(human_gff, sep="\t", header=None, names=names, comment="#")
df["Gene_id"] = df["Info"].str.split("gene_id=").str[1].str.split(";").str[0]
df_sub = df[(df["Feature"] == "gene") | (df["Feature"] == "transcript")]
df_sub["dot"] = "."
df_bed = df_sub[["Chr", "Start", "End", "Gene_id", "dot", "Strand", "Feature"]]
# print(df_bed.head())
df_gene = df_bed[df_bed["Feature"] == "gene"][["Chr", "Start", "End", "Gene_id", "dot", "Strand"]]
df_gene.to_csv(gene_bed, sep="\t", header=False, index=False)
#
df_res = df_bed.groupby(["Gene_id"]).apply(count_isoform)
df_res.to_csv(res_count, sep="\t", header=False) # , index=False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
def count_isoform(df):
    count = 0
    f_list = df["Feature"].tolist()
    for x in f_list:
        if x == "transcript":
            count += 1
    return count

In [32]:
# eQTL SNPs overlap with Gene
result_dir = "/home/galaxy/project/QTL_analysis/related_with_splicing/eQTL/snp_and_isoform"
########
data_dir = "/home/galaxy/project/QTL_analysis/related_with_splicing/"
res_count = os.path.join(data_dir, "isoform_count.txt")
df_count = pd.read_table(res_count, sep="\t", header=None, names=["Gene ID", "Number"])
iso_dict = dict(zip(df_count["Gene ID"], df_count["Number"]))
gene_bed = os.path.join(data_dir, "GRCh38_gene.bed")
#######
eqtl_dir = "/home/galaxy/data/QTL/m6a_related_SNPs/eQTL/m6a_qtl_bed/"
##############################################################################################
eqtl_list = glob.glob("%s/*.bed" % eqtl_dir)
for eqtl in eqtl_list:
    result_file = os.path.join(result_dir, os.path.basename(eqtl).replace(".bed", ".txt"))
    command = "bedtools intersect -a %s -b %s -wa -wb | cut -f 8" % (eqtl, gene_bed)
    sub_p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    genes_str = str(sub_p.communicate()[0].decode('utf-8').strip()) #
    gene_list = [x.strip() for x in genes_str.split("\n")] #
    num_dict = Counter(gene_list)
    snps, isos = [], []
    for gene, snp_num in dict(num_dict).items():
        if gene in iso_dict:
            iso_num = iso_dict[gene]
            isos.append(str(iso_num))
            snps.append(str(snp_num))
#                 fw.write("%s\t%s\n" % (str(iso_num), str(snp_num)))
        else:
            print("%s not in isoform_count.txt" % gene)
    res_dict = {}
    for i in range(len(isos)):
        res_dict[isos[i]] = res_dict.get(isos[i], []) + [int(snps[i])]
    with open(result_file, 'w') as fw:
        for iso, snp in res_dict.items():
            fw.write("%s\t%s\n" % (iso, str(sum(snp))))