In [1]:
import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import torch
import itertools
import argparse
import shlex
import random
import multiprocessing as mp
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp, hypergeom, pearsonr, spearmanr
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial, stats
from statsmodels.sandbox.stats.multicomp import multipletests
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
from gensim.utils import simple_preprocess
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.cluster import AgglomerativeClustering
from nltk.corpus import brown, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes

from _utils import Method
from _utils import IndexedGraph


# Notebook-wide display configuration: high-resolution figures, silenced warnings, and wide pandas output.
mpl.rcParams.update({"figure.dpi": 400})
warnings.simplefilter('ignore')
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

ImportError: cannot import name 'IndexedGraph' from '_utils' (/Users/cfy/Desktop/reorganizing-irb-scripts/plant-phenotypes-nlp/notebooks/_utils.py)

In [None]:
# Create and name an output directory according to when the notebook or script was run.
# The timestamp plus a random four-digit suffix keeps runs that start close together in separate directories.
name = "topic_modeling"
OUTPUT_DIR = os.path.join("../outputs","{}_{}_{}".format(name,datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'),random.randrange(1000,9999)))
# makedirs (rather than mkdir) also creates the parent "../outputs" directory when it is missing, e.g. on a
# fresh checkout. It still raises if this exact run directory already exists, so runs never share a directory.
os.makedirs(OUTPUT_DIR)

In [None]:
# Paths to different datasets containing gene names, text descriptions, and/or ontology term annotations.
# All paths in this cell are relative, so they are resolved against the working directory of the kernel.
plant_dataset_path = "../../plant-data/genes_texts_annots.csv"
clinvar_dataset_path = "../data/clinvar/clinvar_diseases.csv"
snpedia_snippets_dataset_path = "../data/snpedia/snpedia_snippets.csv"
snpedia_contexts_dataset_path = "../data/snpedia/snpedia_contexts.csv"

# Paths to datasets of sentence or description pairs.
# NOTE(review): `biosses_datset_path` is spelled without the second "a"; the name is left as-is here
# because other cells may refer to it under this spelling.
paired_phenotypes_path = "../data/paired_sentences/plants/scored.csv"
biosses_datset_path = "../data/paired_sentences/biosses/cleaned_by_me.csv"

# Paths to files for data about how genes can be grouped into biochemical pathways, etc.
kegg_pathways_path = "../../plant-data/reshaped_data/kegg_pathways.csv" 
plantcyc_pathways_path = "../../plant-data/reshaped_data/plantcyc_pathways.csv" 
lloyd_meinke_subsets_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets.csv" 
lloyd_meinke_classes_path = "../../plant-data/reshaped_data/lloyd_meinke_classes.csv" 

# Paths to files that contain mappings from the identifiers used by those groups to full name strings.
kegg_pathways_names_path = "../../plant-data/reshaped_data/kegg_pathways_name_map.csv"
plantcyc_pathways_names_path = "../../plant-data/reshaped_data/plantcyc_pathways_name_map.csv"
lloyd_meinke_subsets_names_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets_name_map.csv"
lloyd_meinke_classes_names_path = "../../plant-data/reshaped_data/lloyd_meinke_classes_name_map.csv"

# Paths to other files including the ortholog edgelist from Panther, and cleaned files from the two papers.
pppn_edgelist_path = "../../plant-data/papers/oellrich_walls_et_al_2015/supplemental_files/13007_2015_53_MOESM9_ESM.txt"
ortholog_file_path = "../../plant-data/databases/panther/PlantGenomeOrthologs_IRB_Modified.txt"
lloyd_function_hierarchy_path = "../../plant-data/papers/lloyd_meinke_2012/versions_cleaned_by_me/192393Table_S1_Final.csv"

In [None]:
# Paths to the plain-text corpus files that are used in this analysis.
background_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/background.txt"
phenotypes_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/phenotypes_all.txt"

In [None]:
# Paths to pretrained or saved models used for embeddings with Word2Vec or Doc2Vec.
doc2vec_plants_path = "../models/plants_dbow/doc2vec.model"
doc2vec_wikipedia_path = "../models/enwiki_dbow/doc2vec.bin"
word2vec_plants_path = "../models/plants_sg/word2vec.model"
word2vec_wikipedia_path = "../models/wiki_sg/word2vec.bin"

# Paths to BioBERT models.
biobert_pmc_path = "../models/biobert_v1.0_pmc/pytorch_model"                                  
biobert_pubmed_path = "../models/biobert_v1.0_pubmed/pytorch_model"                                 
biobert_pubmed_pmc_path = "../models/biobert_v1.0_pubmed_pmc/pytorch_model"      

# Word2Vec models available pretrained from Pyysalo et al.
# http://bio.nlplab.org/#doc-tools
# http://evexdb.org/pmresources/vec-space-models/
word2vec_bio_pmc_path = "../models/bio_nlp_lab/PMC-w2v.bin"
word2vec_bio_pubmed_path = "../models/bio_nlp_lab/PubMed-w2v.bin"
word2vec_bio_pubmed_and_pmc_path = "../models/bio_nlp_lab/PubMed-and-PMC-w2v.bin"
word2vec_bio_wikipedia_pubmed_and_pmc_path = "../models/bio_nlp_lab/wikipedia-pubmed-and-PMC-w2v.bin"

<a id="part_7"></a>
# Part 7. Topic Modeling
The purpose of this section is to look at different ways that the embeddings obtained for the dataset of phenotype descriptions can be used to cluster or organize the genes to which those phenotypes are mapped into subgroups or representations. These approaches include generating topic models from the data, and doing agglomerative clustering to find clusters to which each gene belongs.

In [None]:
# Reload everything this section needs so that it can be run independently of other notebook sections.
dataset = Dataset(plant_dataset_path)
# NOTE(review): presumably restricts the dataset to entries that have a text description; confirm in oats.
dataset.filter_has_description()

# Map each subset identifier to its full name, read as (group_id, group_name) pairs from the name-map file.
lloyd_meinke_subsets_name_mapping = dict(
    pd.read_csv(lloyd_meinke_subsets_names_path)[["group_id", "group_name"]].itertuples(index=False, name=None)
)

# Build the groupings object and the two lookups between dataset IDs and group IDs.
groups = Groupings(lloyd_meinke_subsets_path, lloyd_meinke_subsets_name_mapping)
id_to_group_ids, group_id_to_ids = groups.get_groupings_for_dataset(dataset)

<a id="compare_to_subsets"></a>
### Comparing topics learned by topic modeling to existing categorizations
Topic modelling learns a set of word probability distributions from the dataset of text descriptions, each of which represents a distinct topic present in the dataset. Each text description can then be represented as a discrete probability distribution over the learned topics, based on the probability that a given piece of text belongs to each particular topic. This is a form of data reduction because a high-dimensional bag-of-words can be represented as a vector of *k* probabilities, where *k* is the number of topics. The main advantage of topic modelling over clustering is that topic modelling provides soft classifications that can be additionally interpreted, rather than hard classifications into a single cluster. Topic models are also explainable, because the word probability distribution for a topic can be used to determine which words are most representative of that topic. One problem with topic modelling is that it uses the n-grams embeddings, so semantic similarity between different words is not accounted for. To help alleviate this, this section uses implementations of some existing algorithms to compress the vocabulary as a preprocessing step based on word distance matrices generated using word embeddings.

Topic models define topics present in a dataset of texts as word or n-gram probability distributions. These models then represent each instance of text as being composed of, or generated as, a mixture of these topics. The vector for each text that indicates which fraction of that text is generated by each topic is of length *n*, where *n* is the number of topics, and can be used as a reduced-dimensionality representation of the text, with a much smaller vector length than the n-grams embedding itself. Therefore we can build a topic model of the data with 100 topics, for example, in order to then represent each description in the dataset as a vector of length 100. This section constructs topic models from the n-gram representations of the dataset and selects different values for the number of topics in order to find a value that works well during the grid search over the training dataset.

In [None]:
# Gene IDs are the keys throughout this section, so keep a mapping from gene ID to the fully
# preprocessed description (tokens produced by preprocess_string, rejoined with single spaces).
descriptions = dataset.get_description_dictionary()
preprocessed_descriptions = {
    gene_id: " ".join(preprocess_string(raw_text))
    for gene_id, raw_text in descriptions.items()
}
texts = [*preprocessed_descriptions.values()]

# Basic parameters for this problem that are currently used.
number_of_topics, seed = 42, 0

In [None]:
# Build TF-IDF features from the preprocessed descriptions, then create and fit the topic model on them.
# NMF is used here; another decomposition such as LDA could be substituted.
vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words="english",
    max_df=0.95,
    min_df=2,
    lowercase=True,
)
features = vectorizer.fit_transform(texts)
cls = NMF(n_components=number_of_topics, random_state=seed).fit(features)

# Function for retrieving the topic vectors for a list of text descriptions.
def get_topic_embeddings(texts, model, vectorizer):
    """Return the topic vectors that `model` produces for `texts`.

    The texts are first turned into n-gram vectors with `vectorizer.transform`, converted to a dense
    array with `.toarray()`, and then passed through `model.transform`.

    Args:
        texts: The text descriptions to embed, passed to `vectorizer.transform` unchanged.
        model: An object with a `transform` method (e.g. the fitted topic model).
        vectorizer: An object whose `transform` result has a `toarray` method.

    Returns:
        Whatever `model.transform` returns for the dense n-gram vectors.
    """
    dense_ngrams = vectorizer.transform(texts).toarray()
    return model.transform(dense_ngrams)

# Create the dataframe containing the average score assigned to each topic for the genes from each subset.
# The per-subset descriptions are stored under their own name (`group_texts`) so that the corpus-wide
# `texts` list used to fit the vectorizer and model is not overwritten; reusing `texts` here would make
# a re-run of the fitting code train on the descriptions of the last subset only.
group_to_topic_vector = {}
for group_id,ids in group_id_to_ids.items():
    group_texts = [preprocessed_descriptions[i] for i in ids]
    topic_vectors = get_topic_embeddings(group_texts, cls, vectorizer)
    mean_topic_vector = np.mean(topic_vectors, axis=0)
    group_to_topic_vector[group_id] = mean_topic_vector

# Turning that matrix of weights into a dataframe so it can be worked with.
# Each dictionary key (subset) becomes a column, and each row corresponds to one topic.
tm_df = pd.DataFrame(group_to_topic_vector)

# Changing the order of the Lloyd, Meinke phenotype subsets to match other figures and tables for consistency.
lmtm_df = pd.read_csv(lloyd_function_hierarchy_path)
columns_in_order = [col for col in lmtm_df["Subset Symbol"].values if col in tm_df.columns]
columns_in_order.reverse()
# The diagonal comparison below relies on there being exactly one topic per subset.
assert len(columns_in_order) == number_of_topics, "expected {} subsets but found {}".format(number_of_topics, len(columns_in_order))
tm_df = tm_df[columns_in_order]

# Reordering so consistency with the curated subsets can be checked by looking at the diagonal.
# Each topic (row) is keyed by the column position of the subset where it scores highest, and rows are
# sorted by that position. The temporary column is added with assign() so the sliced frame above is
# never written to in place.
tm_df = (
    tm_df
    .assign(idxmax=tm_df.idxmax(axis=1).map(tm_df.columns.get_loc))
    .sort_values(by="idxmax")
    .drop(columns=["idxmax"])
)

# Saving a version of this dataframe that is indexed by topic integers and subset strings, before making topics a column instead.
# reset_index() below returns a new frame, so this saved reference keeps the topic index.
topic_subset_similarity_df = tm_df
tm_df = tm_df.reset_index(drop=False).rename({"index":"topic"},axis=1).reset_index(drop=False).rename({"index":"order"},axis=1)

In [None]:
# Preview the first ten rows of the matrix that was saved before the "topic" and "order" columns were added.
topic_subset_similarity_df.head(10)

In [None]:
# Preview the first ten rows of the version where "order" and "topic" are regular columns.
tm_df.head(10)

In [None]:
# Describing what the most representative tokens for each topic in the model are.
num_top_words = 5
map_top_words = {}

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so use whichever method the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for i,topic_vec in enumerate(cls.components_):
    # Indices of the num_top_words largest weights in this topic, heaviest first.
    top_feature_ids = topic_vec.argsort()[-1:-num_top_words-1:-1]
    # If words in the topic model are actually a function of the words in the texts (a reduced vocabulary),
    # each word would have to be mapped back here, e.g. word = " ".join(unreduce[word]).
    top_words = [feature_names[fid] for fid in top_feature_ids]
    map_top_words[i] = top_words
    # One line per topic: the topic number, then its top words each followed by a space.
    print(i, end=": ")
    print("".join("{} ".format(word) for word in top_words))

In [None]:
# Add a column holding the top tokens of each topic as a single "|"-separated string.
tm_df["tokens"] = ["|".join(map_top_words[topic]) for topic in tm_df["topic"]]

# Pull that column out and put it back at position 2 so it sits on the left for readability in the file.
tokens_col = tm_df.pop("tokens")
tm_df.insert(2, "tokens", tokens_col)

# Renumber the topics from the reversed row order (starting at 1), which is more intuitive in figures,
# and move the new column to position 2 as well.
tm_df["topic_renumbered"] = tm_df["order"].values[::-1] + 1
topic_renumbered_col = tm_df.pop("topic_renumbered")
tm_df.insert(2, "topic_renumbered", topic_renumbered_col)

# Lookups from the original topic number to its row order and to its renumbered name.
topic_order_map = dict(zip(tm_df["topic"].values, tm_df["order"].values))
topic_renumbered_map = dict(zip(tm_df["topic"].values, tm_df["topic_renumbered"].values))

# Write this version of the subset and topic similarity data to a file, then preview it.
topic_subset_matrix_path = os.path.join(OUTPUT_DIR, "topic_subset_matrix.csv")
tm_df.to_csv(topic_subset_matrix_path, index=False)
tm_df.head(10)

In [None]:
# Producing a version of the previous table that is useful for producing line drawings representing these results.
# Every (subset, topic) pair becomes one line made of two rows: the start point at x=0 and the end point at x=10.
tm_lines_dict = defaultdict(list)

# Remembering the order of the Lloyd, Meinke phenotype subsets to match other figures for consistency.
lmtm_df = pd.read_csv(lloyd_function_hierarchy_path)
subset_to_class_map = dict(zip(lmtm_df["Subset Symbol"].values, lmtm_df["Class Name"].values))
subset_to_desc_map = dict(zip(lmtm_df["Subset Symbol"].values, lmtm_df["Subset Name and Description "].values))
# The subsets are taken from the similarity matrix itself, whose columns are exactly the subsets, rather than
# from tm_df, which by now also carries bookkeeping columns ("order", "topic", "tokens", ...).
subset_abbrevs_in_order = [col for col in lmtm_df["Subset Symbol"].values if col in topic_subset_similarity_df.columns]
subset_abbrevs_in_order.reverse()
subset_order_map = {subset_abbrev:i for i,subset_abbrev in enumerate(subset_abbrevs_in_order)}

# Producing the line entries that represent connections between the subsets and topics.
# In the similarity matrix the columns are the subset strings and the index holds the topic integers, so the
# two lists are named accordingly; iterating subsets in the outer position keeps the original line numbering.
line_number = 0
subset_str_list = list(topic_subset_similarity_df.columns)
topic_int_list = list(topic_subset_similarity_df.index)
for subset_abbrev, topic_int in itertools.product(subset_str_list, topic_int_list):

    # The weight of the line, extracted from the similarity matrix between subsets and topics built previously.
    weight = topic_subset_similarity_df.loc[topic_int,subset_abbrev]

    # The strings that should be used to represent classes, subsets, and topics in a figure or plot.
    subset_str = "{} ({})".format(subset_abbrev, subset_to_desc_map[subset_abbrev].lower())
    class_str = subset_to_class_map[subset_abbrev]
    topic_str = "Topic {}: ({})".format(topic_renumbered_map[topic_int], "|".join(map_top_words[topic_int]))
    tm_lines_dict["subset_str"].extend([subset_str,subset_str])
    tm_lines_dict["class_str"].extend([class_str,class_str])
    tm_lines_dict["topic_str"].extend([topic_str,topic_str])

    # Which line is this, they all have individual numbers so that each line can be its own group in a ggplot object.
    tm_lines_dict["line_number"].extend([line_number,line_number])
    tm_lines_dict["weight"].extend([weight,weight])

    # Where should the line start and stop? The horizontal values are arbitrary and just have to match.
    # The vertical values are determined by which subset and topic are being connected to each other.
    tm_lines_dict["x"].extend([0,10])
    tm_lines_dict["y"].extend([subset_order_map[subset_abbrev],topic_order_map[topic_int]])

    line_number = line_number+1

tm_lines_df = pd.DataFrame(tm_lines_dict)
tm_lines_df.to_csv(os.path.join(OUTPUT_DIR, "topic_subset_lines.csv"), index=False)
tm_lines_df.head(50)