In [1]:
import datetime
import nltk
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import random
import warnings
from collections import defaultdict
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from statsmodels.sandbox.stats.multicomp import multipletests
from itertools import product

sys.path.append("../../oats")
sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder, term_enrichment
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes, token_enrichment

warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

Warming up PyWSD (takes ~10 secs)... took 4.464608669281006 secs.


True

In [2]:
# Paths to the files that are used for this notebook.
dataset_path = "../../quoats/data/genes_texts_annots.csv"
dataset = Dataset(dataset_path, keep_ids=True)
dataset.describe()

Unnamed: 0,species,unique_gene_identifiers,unique_descriptions
0,ath,5850,3493
1,gmx,30,23
2,mtr,37,36
3,osa,92,85
4,sly,69,69
5,zma,1405,810
6,total,7483,4516


In [3]:
#os.chdir('/Users/irbraun/phenologs-with-oats/quoats')
mapping = dataset.get_species_to_name_to_ids_dictionary(include_synonyms=False, lowercase=True)
genes = pd.read_csv("autophagy_core_genes.csv")
genes["id"] = genes.apply(lambda x: mapping[x["species_code"]].get(x["identifier"].strip().lower(),-1), axis=1)
genes[genes["id"]!=-1]["id"] = genes[genes["id"]!=-1]["id"].map(lambda x: x[0])
genes["in_current_dataset"] = genes["id"].map(lambda x: x!=-1)
genes

# Looking just at the ones that are in the current dataset.
genes = genes[genes["in_current_dataset"]]
genes["id"] = genes["id"].map(lambda x: x[0])
genes

Unnamed: 0,species,species_code,name,identifier,id,in_current_dataset
1,Arabidopsis,ath,ATG2,AT3G19190,23,True
3,Arabidopsis,ath,ATG4,AT2G44140,4688,True
4,Arabidopsis,ath,ATG5,AT5G17290,462,True
5,Arabidopsis,ath,ATG6,AT3G61710,297,True
6,Arabidopsis,ath,ATG7,AT5G45900,5585,True
8,Arabidopsis,ath,ATG9,AT2G31260,533,True
9,Arabidopsis,ath,ATG10,AT3G07525,2377,True
10,Arabidopsis,ath,ATG11,AT4G30790,5272,True
11,Arabidopsis,ath,ATG12,AT1G54210,4274,True
12,Arabidopsis,ath,ATG13,AT3G49590,4968,True


In [4]:
# Grabbing the texts dictionary from the dataset that we can use to grab the descriptions to query.
texts = dataset.get_description_dictionary()
texts[4688]

'Small plants and premature senescence under normal soil-grown conditions. Hypersensitivity to N and fixed-C deprivation.'

In [5]:
# Prepare dictionaries to hold the resulting arrays.
resulting_bin_arrays = defaultdict(dict)
resulting_bin_arrays["ath"]["ath"] = []
resulting_bin_arrays

defaultdict(dict, {'ath': {'ath': []}})

In [6]:
# The searches within the same species.
rank_for_not_found = 100
bins =[0,11,21,31,41,51,rank_for_not_found]
bin_names = [10,20,30,40,50,rank_for_not_found]
assert len(bin_names) == len(bins)-1

ctr = 0
for gene in genes.itertuples():
    
    ctr = ctr+1
    limit = 50
    species = gene[1]
    species_code = gene[2]
    identifier = gene[3]
    gene_id = gene[5]
    text = texts[gene_id]
    
    # Because these are being passed as strings to the command line, quotes need to be removed now,
    # instead of waiting for them to be removed as a preprocessing step of the search strings in the streamlit script.
    text = text.replace("'","")
    text = text.replace('"','')
    print("Here!")
   
    path = "../plant-phenotypes-nlp/quoats/outputs_within_autophagy/output_{}.tsv".format(ctr)
    os.chdir('../../quoats')
    
    os.system("python main.py -s {} -t identifiers -q '{}:{}' -l {} -o {} -r 0.000 -a TFIDF".format(species,species,identifier,limit,path))
    time.sleep(4)
    
    if os.path.exists(path):
        print("Yeah!")
 
        df = pd.read_csv(path, sep='\t')
        df = df[["Rank","Internal ID"]]
        df = df.drop_duplicates()
        id_to_rank = dict(zip(df["Internal ID"].values,df["Rank"].values))
        assert rank_for_not_found > limit


        # For within the same species, get rid of the identical gene (always rank 1).
        ids_of_interest = [i for i in genes[genes["species"]==species]["id"].values if i != gene_id]
        ranks = [id_to_rank.get(i, rank_for_not_found) for i in ids_of_interest]
        resulting_bin_arrays[species_code][species_code].append(np.histogram(ranks, bins=bins)[0])

        print(ranks)
        print("done with {} queries".format(ctr))
        os.chdir("../plant-phenotypes-nlp/quoats/")
        
print('done with all queries')

Here!
Yeah!
[100, 100, 100, 100, 100, 100, 100, 100, 100]
done with 1 queries
Here!
Yeah!
[100, 100, 100, 100, 100, 100, 100, 100, 100]
done with 2 queries
Here!
Yeah!
[100, 100, 100, 100, 100, 13, 100, 100, 100]
done with 3 queries
Here!
Here!


FileNotFoundError: [Errno 2] No such file or directory: '../../quoats'

In [None]:
# Create the output dataframe with the means and standard deviation for each bin and direction.
output_rows = []
s1 = "ath"
s2 = "ath"
means = np.mean(np.array(resulting_bin_arrays[s1][s2]),axis=0)
std_devs = np.std(np.array(resulting_bin_arrays[s1][s2]),axis=0)
for i in range(len(bin_names)):
    output_rows.append([s1, s2, bin_names[i], means[i], std_devs[i]])    
names = ["from","to","bin","mean","sd"]    
output_df = pd.DataFrame(output_rows,columns=names)
output_df

In [None]:
os.chdir('../plant-phenotypes-nlp/quoats')
output_path = "plots/autophagy_plot_data.csv"
output_df.to_csv(output_path, index=False)