## Load in Nagy transcriptome

In [2]:
import os

import pandas as pd
from Bio import SeqIO

# Parse every record of the Nagy transcriptome FASTA file into an in-memory list.
nagy_fasta_path = "../../../data-reference/transcriptomes/Trifolium-repens/Nagy/Nagy_Transcriptome.fasta"
Nagy_transcriptome = list(SeqIO.parse(nagy_fasta_path, "fasta"))

In [3]:
# Collect the identifier of each parsed record, keeping the order of the FASTA file
Nagy_transcriptome_ids = list(map(lambda seq_record: seq_record.id, Nagy_transcriptome))

## Load in IDs of transcripts that successfully blasted to the protein databases of closely related species

I will compare the IDs in the transcriptome to those that successfully blasted, to get a sense of how many of the transcript sequences are contained in the protein databases of the closely related species.

In [122]:
# Query IDs (qseqid column) from each species' filtered blastx table.
# pd.read_csv is the comma-delimited form of pd.read_table(..., sep=",").
Metr_Qseqids = list(
    pd.read_csv(
        "../../data-clean/020_blastx/Medicago-truncatula/Nagy_transcriptome_ProteinBlast_Filtered.csv",
        usecols=["qseqid"],
    )["qseqid"]
)
Trpr_Qseqids = list(
    pd.read_csv(
        "../../data-clean/020_blastx/Trifolium-pratense/Nagy_transcriptome_ProteinBlast_Filtered.csv",
        usecols=["qseqid"],
    )["qseqid"]
)
Trsu_Qseqids = list(
    pd.read_csv(
        "../../data-clean/020_blastx/Trifolium-subterraneum/Nagy_transcriptome_ProteinBlast_Filtered.csv",
        usecols=["qseqid"],
    )["qseqid"]
)

## Which transcripts mapped to all 3 closely related species?

In [123]:
# Transcript IDs present in the hit lists of all three related species.
# sorted() replaces list(): iterating a set of strings can give a different order in
# every kernel session, which made this list (and the file later written from it)
# differ between otherwise identical runs. Assumes every ID is a string.
common_among_all_species = sorted(set(Metr_Qseqids).intersection(Trpr_Qseqids, Trsu_Qseqids))

## Which transcripts mapped uniquely to each closely related species?

In [124]:
# Transcript IDs found in exactly one species' hit list (absent from the other two).
# sorted() replaces list() so the order is identical on every run; the iteration order
# of a set of strings is not stable across kernel sessions. Assumes every ID is a string.
Metr_unique_transcripts = sorted(set(Metr_Qseqids).difference(Trpr_Qseqids, Trsu_Qseqids))
Trpr_unique_transcripts = sorted(set(Trpr_Qseqids).difference(Metr_Qseqids, Trsu_Qseqids))
Trsu_unique_transcripts = sorted(set(Trsu_Qseqids).difference(Trpr_Qseqids, Metr_Qseqids))

## Which transcripts never mapped to any closely related species?

In [125]:
# Transcriptome IDs that appear in none of the three species' hit lists.
# sorted() replaces list() so the order is reproducible; a set of strings can iterate
# in a different order in each kernel session. Assumes every ID is a string.
transcripts_never_hit = sorted(set(Nagy_transcriptome_ids).difference(Metr_Qseqids, Trpr_Qseqids, Trsu_Qseqids))

## Useful statistics

In [126]:
# Report the size of each transcript category and its share of the whole transcriptome.
# Every percentage uses the number of transcriptome IDs as the denominator.
total_transcripts = len(Nagy_transcriptome_ids)
summary_rows = [
    (common_among_all_species, "transcripts mapped to all 3 closely related species. This represents"),
    (transcripts_never_hit, "transcripts never mapped to any closely related species. This represents"),
    (Metr_unique_transcripts, "transcripts mapped only to M. truncatula. This represents"),
    (Trpr_unique_transcripts, "transcripts mapped only to T. pratense. This represents"),
    (Trsu_unique_transcripts, "transcripts mapped only to T. subterraneum. This represents"),
]
for id_group, message in summary_rows:
    percent_of_total = round((len(id_group) / total_transcripts * 100), 2)
    print(len(id_group), message, percent_of_total, "% of all transcripts")

12633 transcripts mapped to all 3 closely related species. This represents 17.66 % of all transcripts
39588 transcripts never mapped to any closely related species. This represents 55.33 % of all transcripts
5960 transcripts mapped only to M. truncatula. This represents 8.33 % of all transcripts
3922 transcripts mapped only to T. pratense. This represents 5.48 % of all transcripts
2016 transcripts mapped only to T. subterraneum. This represents 2.82 % of all transcripts


## Write IDs of mapped transcripts to disk

In [127]:
def write_qseqids(name, qseqid_list):
    """Write IDs to a text file, one per line, under a ``Qseqid`` header line.

    Parameters
    ----------
    name : str or path-like
        Destination file. Missing parent directories are created first; an
        existing file at this path is overwritten.
    qseqid_list : iterable
        IDs to write, in iteration order. Each item is formatted with ``str()``,
        so non-string IDs (e.g. integers) are accepted.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    """
    # open(..., "w") does not create directories, so make sure the target folder exists.
    parent_dir = os.path.dirname(name)
    if parent_dir:  # empty when `name` has no directory component
        os.makedirs(parent_dir, exist_ok=True)
    with open(name, "w") as f:
        f.write("Qseqid\n")
        f.writelines(f"{qid}\n" for qid in qseqid_list)
            
# Persist each ID list to its own file under transcript-hits/, in the order listed.
qseqid_outputs = [
    ("../../data-clean/020_blastx/transcript-hits/common_among_all_species.txt", common_among_all_species),
    ("../../data-clean/020_blastx/transcript-hits/Metr_unique_transcripts.txt", Metr_unique_transcripts),
    ("../../data-clean/020_blastx/transcript-hits/Trpr_unique_transcripts.txt", Trpr_unique_transcripts),
    ("../../data-clean/020_blastx/transcript-hits/Trsu_unique_transcripts.txt", Trsu_unique_transcripts),
    ("../../data-clean/020_blastx/transcript-hits/transcripts_never_hit.txt", transcripts_never_hit),
]
for out_path, id_list in qseqid_outputs:
    write_qseqids(out_path, id_list)

## What about transcripts that mapped to the _T. repens_ protein database?

In [112]:
# Load the T. repens query IDs into a column named "Qseqid". Because names= is given,
# pandas does not treat any line of the file as a header.
# NOTE(review): if this file starts with a "Qseqid" header line, that line would be
# loaded as an ID — confirm against the file.
Trre_hits = pd.read_table(
    "../../data-clean/020_blastx/Trifolium-repens/Nagy_transcriptome_ProteinBlast_Qseqids.txt",
    names=["Qseqid"],
)
Trre_Qseqids = list(Trre_hits["Qseqid"])

In [115]:
# T. repens hit IDs that are absent from all three related-species hit lists.
# sorted() replaces list() so the order is the same on every run; a set of strings
# can iterate in a different order in each kernel session. Assumes every ID is a string.
Trre_unique_transcripts = sorted(set(Trre_Qseqids).difference(Metr_Qseqids, Trpr_Qseqids, Trsu_Qseqids))
len(Trre_unique_transcripts)

2148

## Load in the IDs of the sequences that the Nagy transcripts hit

In [94]:
# Subject IDs (sseqid column) from each species' filtered blastx table, one entry per
# table row, so an ID can appear more than once.
# pd.read_csv is the comma-delimited form of pd.read_table(..., sep=",").
Metr_hits = pd.read_csv(
    "../../data-clean/020_blastx/Medicago-truncatula/Nagy_transcriptome_ProteinBlast_Filtered.csv",
    usecols=["sseqid"],
)
Metr_Seqids = list(Metr_hits["sseqid"])

Trpr_hits = pd.read_csv(
    "../../data-clean/020_blastx/Trifolium-pratense/Nagy_transcriptome_ProteinBlast_Filtered.csv",
    usecols=["sseqid"],
)
Trpr_Seqids = list(Trpr_hits["sseqid"])

Trsu_hits = pd.read_csv(
    "../../data-clean/020_blastx/Trifolium-subterraneum/Nagy_transcriptome_ProteinBlast_Filtered.csv",
    usecols=["sseqid"],
)
Trsu_Seqids = list(Trsu_hits["sseqid"])

In [97]:
# Total number of M. truncatula subject IDs versus the number of distinct ones.
n_total_sseqids = len(Metr_Seqids)
n_distinct_sseqids = len(set(Metr_Seqids))
print(n_total_sseqids, n_distinct_sseqids)

47385 24356


In [98]:
# Parse every record of the retrieved M. truncatula protein FASTA into a list.
metr_protein_fasta = "../../data-raw/021_retrieve-proteins/Medicago-truncatula/M-truncatula_ProteinSeqs.fasta"
Metr_proteins = list(SeqIO.parse(metr_protein_fasta, "fasta"))

In [99]:
# Number of records parsed from the M. truncatula protein FASTA.
len(Metr_proteins)

47385

In [100]:
# Load the complete filtered blastx table (all columns) for M. truncatula.
# pd.read_csv is the comma-delimited form of pd.read_table(..., sep=",").
Metr_blastx = pd.read_csv(
    "../../data-clean/020_blastx/Medicago-truncatula/Nagy_transcriptome_ProteinBlast_Filtered.csv"
)

In [103]:
# Keep a single row per subject ID: rows are ordered by the `length` column, largest
# first (presumably the BLAST alignment length — confirm), and drop_duplicates then
# retains the first row encountered for each sseqid.
rows_longest_first = Metr_blastx.sort_values(by='length', ascending=False)
filtered = rows_longest_first.drop_duplicates(subset=['sseqid'])

In [133]:
# Load only the query/subject ID columns of the filtered blastx table for one species.
species = "Medicago-truncatula"
blast_csv_path = "../../data-clean/020_blastx/{0}/Nagy_transcriptome_ProteinBlast_Filtered.csv".format(species)
ProteinBlast_filtered = pd.read_csv(blast_csv_path, usecols=["qseqid", "sseqid"])

In [140]:
# Convert the frame into a plain list of per-row lists.
id_pair_array = ProteinBlast_filtered.values
lst = id_pair_array.tolist()

In [139]:
# Display the qseqid/sseqid table for inspection.
ProteinBlast_filtered

Unnamed: 0,qseqid,sseqid
0,wcd-682,CBD19748.1
1,wcd-1034,XP_024632994.1
2,wcd-2083,AES73670.1
3,wcd-1502,AES69447.1
4,wcd-842,AES59251.1
5,wcd-1056,XP_024633092.1
6,wcd-4291,AET04209.1
7,wcd-2862,AES73003.1
8,wcd-2606,AES67239.1
9,wcd-2573,AES68070.1


In [146]:
# Peek at the first row only, printing the subject ID before the query ID.
# Nothing is printed when lst is empty.
for qseqid, sseqid in lst[:1]:
    print(sseqid, qseqid)

CBD19748.1 wcd-682
