In [None]:
import requests
import json
import nferx_py.fn as nf
import matplotlib.pyplot as plt
import pandas as pd
from multiprocessing import pool
import seaborn as sns
from tqdm import tqdm
from nltk.stem import SnowballStemmer
from gensim.utils import simple_preprocess
import numpy as np
from tqdm.notebook import tqdm
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
from scipy.spatial import distance
from sklearn.metrics import auc

In [None]:
# English Snowball stemmer instance; no later cell in the visible notebook references it.
stemmer = SnowballStemmer('english')

In [None]:
# Credentials are read from the environment so the API key is never stored in the notebook.
# NFERX_API_KEY must be set (a KeyError here means it is missing); NFERX_USER defaults to 'nfer'.
# NOTE(review): any key that was ever committed in this cell should be treated as leaked and rotated.
nf.authenticate(os.environ.get('NFERX_USER', 'nfer'), os.environ['NFERX_API_KEY'])
# Point both the web server and the API server defaults at the 'preview' environment.
nf.modify_defaults('server', 'preview')
nf.modify_defaults('api_server', 'preview')
# Auth object reused by the raw `requests` call to the single-cell API.
AUTH = nf.AUTH

### notes
1. Get local scores for all gene/cell combinations from single cell api for Tabula muris (study 2) and mouse cell atlas (study 15)
2. Do "intra-study" testing: for each Tabula Muris cluster, (1) identify the subset of genes to consider (e.g. top 0.05% by Cohen's D vs. all other cells), (2) calculate cosine similarity between mean CP10K vector of this gene subset and the literature score vectors for all Tabula Muris cell types with this same gene subset. This would give us a ranked list of literature-derived labels for each cluster. Then repeat this with Mouse Cell Atlas. ---- Cosine similarity did not work out ---- Switched to calculating vector norms of the literature scores of the top genes selected from cluster A, across all clusters.

- For each cluster to label C:
    1. Select the top n genes by Cohen’s D (e.g. top 5, 10, 25)
    2. From API, get the fraction of cells in cluster C which express each gene
    3. From literature reference table, get the local scores between each reference cell type and the top n genes
    4. Multiply “Fraction cells expressing in cluster C” column by each “Ref_CellTypeX_LocalScores” column to get a set of “literature encoded expression vectors” for each reference cell type
    5. For each “Lit Encoded Vector” column, calculate L0, L1, L2, and L-inf norms
        L0: number of non-zero elements
        L1: sum of all values
        L2: square root of sum of squares
        L-inf: max value
    Output: four tables of Clusters To Label (rows) * Reference Cell Types 
    Table 1 = L0 norm, Table 2 = L1 norm, Table 3 = L2 norm, Table 4 = L-inf norm
    
-----------
Update: 05/15

use global scores

Few steps to increase AUC: remove non-annotated clusters; try combining global /local
-------------step 4 yet to be done------------------------
4. Do "inter-study" testing: for each Tabula Muris cluster, (1) identify the subset of genes to consider (e.g. top 0.05% by cohen's D vs. all other cells), (2) calculate cosine similarity between mean CP10K vector of this gene subset and the literature score vectors for all Mouse Cell Atlas cell types with this same gene subset. Then repeat this to label all Mouse Cell Atlas clusters using the literature vectors from the Tabula Muris dataset. This will require some handling of synonyms/cell "families" which I provided to Katie for this analysis ... but we could go over that later since step 2 alone will take some time to set up and optimize I think

### POST to single cell api to get the table

In [None]:
URL = 'https://pre-staging.nferx.com/singlecellapi/study2/summary?'
# NOTE(review): `headers` is defined but is not passed to the request below.
headers = {'content-type' : 'application/json'}
# A single page sized to hold the whole table is requested.
res = requests.post(url = URL, params = {'page': 1, 'rows': 3412880}, auth = AUTH)
# Fail on an HTTP error here rather than with an opaque JSON/KeyError on the next line.
res.raise_for_status()
data_all = pd.DataFrame(res.json()['result']['data'])
# Untouched copy of the full download; `data_all` itself is filtered and extended by later cells.
data_all_backup = data_all.copy(deep = True)


### Literature vectors

In [None]:
# Drop rows whose cellType is 'Not Annotated' and renumber the index from 0.
# This rebinds `data_all`; the unfiltered download is still available as `data_all_backup`.
data_all = data_all.loc[~(data_all.cellType == 'Not Annotated'),].reset_index(drop = True)

In [6]:
class Literature_model():
    """Score and rank literature-derived reference cell types for expression clusters.

    For every cluster in the expression table the top genes by ``cohenD`` are
    selected, their literature scores against every reference cluster are weighted
    by ``greaterThanZero`` (rescaled by 1/100), and the weighted vectors are reduced
    to one score per reference cluster (L0/L1/L2/L-inf norms or cosine similarity).

    Attributes:
        gene_cluster_score_norm: gene x cluster matrix of normalized literature scores.
        gene_cluster_score_true: gene x cluster matrix of raw literature scores.
        data: expression table with at least ``cluster``, ``geneName``, ``cohenD``,
            ``mean`` and ``greaterThanZero`` columns, or None (see ``use_measure``).
        use_score: free-form label supplied by the caller; not used in any computation.
        savepath: path prefix used when score tables are written to CSV.
        dfs_all: dict mapping '<measure>_<n genes>' to the corresponding score table.
    """

    def __init__(self, score_mat_normalized, score_mat_true,
                 data = None, savepath = './data/results/global_score/', use_score = ''):
        """Store the score matrices and the expression table.

        Args:
            score_mat_normalized: normalized gene x cluster literature score matrix.
            score_mat_true: raw gene x cluster literature score matrix.
            data: expression table. When None, the module-level ``data_all`` is
                looked up at the time ``use_measure`` runs.
            savepath: path prefix for CSV output.
            use_score: label of the score type the matrices were built from.
        """
        self.gene_cluster_score_norm = score_mat_normalized
        self.gene_cluster_score_true = score_mat_true
        self.data = data
        self.use_score = use_score
        self.savepath = savepath
        self.dfs_all = dict()

    def get_top_genes_vector(self, df, top_n = None, quantile = 0.995,
                             cols = ['cohenD', 'mean', 'greaterThanZero']):
        """Select the strongest marker genes of one cluster.

        Args:
            df: rows of a single cluster; needs ``geneName``, ``cohenD`` and ``cols``.
            top_n: when truthy, return the ``top_n`` genes with the largest ``cohenD``.
            quantile: used when ``top_n`` is falsy; genes whose ``cohenD`` is at or
                above this quantile are kept.
            cols: columns to return.

        Returns:
            DataFrame indexed by ``geneName`` restricted to ``cols``.
        """
        df = df.set_index('geneName')
        if top_n:
            return df.sort_values(by = 'cohenD', ascending = False)[cols].iloc[0:top_n]

        cutoff = df.cohenD.quantile(quantile)
        return df.loc[df.cohenD >= cutoff, cols]

    def get_measure_matrix(self, top_genes_subset, metric = 'cosine'):
        """Score every reference cluster against the top genes of one cluster.

        Args:
            top_genes_subset: rows for one cluster with ``geneName``, ``mean`` and
                ``greaterThanZero`` columns (any index labels are accepted).
            metric: 'norm-all', 'cosine', 'l0', 'l1', 'l2' or 'inf'.

        Returns:
            For 'norm-all' a list ``[L0, L1, L2, Linf]`` of Series indexed by
            reference cluster; otherwise a single such Series.

        Raises:
            ValueError: if ``metric`` is not one of the supported names.
        """
        # matrix 1: top-gene means (shape: 1*N)
        top_genes_mean = top_genes_subset.loc[:, ['geneName', 'mean']].set_index('geneName').transpose()
        # matrix 2: top genes (N) vs all clusters (M) literature scores (shape: N*M)
        gene_local_score_subset = self.gene_cluster_score_norm.loc[top_genes_mean.columns,:]
        lit_encoded_subset_true = self.gene_cluster_score_true.loc[top_genes_mean.columns,:]
        # literature-encoded expression: each gene row is scaled by that gene's weight
        lit_encoded_subset = gene_local_score_subset.multiply(
            top_genes_subset.loc[:, 'greaterThanZero'].values, axis = 0)

        if metric == 'norm-all':
            # L0 counts non-zero raw scores (unweighted); the other norms use the weighted matrix.
            score0 = (lit_encoded_subset_true != 0).sum()
            score1 = lit_encoded_subset.sum()
            score2 = lit_encoded_subset.pow(2).sum().pow(1/2)
            scoreINF = lit_encoded_subset.max()
            return [score0, score1, score2, scoreINF]

        if metric == 'cosine':
            assert top_genes_mean.shape[1] == lit_encoded_subset.shape[0]
            score = cosine_similarity(top_genes_mean.to_numpy(), lit_encoded_subset.transpose())
            score = pd.Series(score.reshape(-1), index = lit_encoded_subset.columns)
        elif metric == 'l0':
            score = (lit_encoded_subset != 0).sum()
        elif metric == 'l1':
            score = lit_encoded_subset.sum()
        elif metric == 'l2':
            score = lit_encoded_subset.pow(2).sum().pow(1/2)
        elif metric == 'inf':
            score = lit_encoded_subset.max()
        else:
            raise ValueError('Unknown metric: %r' % (metric,))
        return score

    # top n genes; all norms
    def use_measure(self, n_genes = [5, 10, 25], metric = 'norm-all',
                    save = False):
        """Build the cluster x reference score table(s) for each top-gene count.

        Results are stored in ``self.dfs_all`` under '<measure>_<n>' keys.

        Args:
            n_genes: top-gene counts to evaluate.
            metric: passed to ``get_measure_matrix``.
            save: when True, also write each table to ``self.savepath``; the row
                labels get a '_TRUE' suffix, which is applied to the stored table too.

        Returns:
            The raw groupby result of the last gene count processed.
        """
        # Use the table given to the constructor; fall back to the notebook-level frame.
        data = self.data if self.data is not None else data_all
        for n in n_genes:
            print('Processing : %d genes'%n)
            cluster_mean_vectors = data.groupby('cluster').apply(lambda x:
                                        self.get_top_genes_vector(x, top_n = n))
            # rescale greaterThanZero by 1/100 (presumably percentage -> fraction; confirm against the API)
            cluster_mean_vectors.loc[:, 'greaterThanZero'] = cluster_mean_vectors.loc[:, 'greaterThanZero']/100
            self.cluster_mean_vectors = cluster_mean_vectors.reset_index()
            res = self.cluster_mean_vectors.groupby('cluster').apply(lambda x: self.get_measure_matrix(x, metric))
            if metric == 'norm-all':
                norms = ['L0', 'L1', 'L2', 'Linf']
                for i, norm in enumerate(norms):
                    name = '{}_{:02}'.format(norm, n)
                    # each element of `res` is the [L0, L1, L2, Linf] list; pick norm i
                    df = res.apply(lambda x: x[i])
                    self.dfs_all[name] = df
                    if save:
                        df.index = df.index + '_TRUE'
                        df.to_csv(self.savepath+'{}_{:02}_genes.csv'.format(norm, n))
            else:
                name = '{}_{:02}'.format(metric, n)
                self.dfs_all[name] = res
                if save:
                    res.index = res.index + '_TRUE'
                    res.to_csv(self.savepath+'{}_{:02}_genes.csv'.format(metric, n))
        return res

    def run(self, metric = ['norm-all']):
        """Run ``use_measure`` for every metric name, then compute ``plot_df``."""
        for name in metric:
            self.use_measure(metric = name)
        self.get_ranks()

    def get_ranks(self, n_ranks = 96, n_clusters = 142):
        """Turn the score tables into rank-threshold curves with AUC in ``self.plot_df``.

        Every table in ``self.dfs_all`` is relabelled IN PLACE: rows and columns are
        replaced by the 'map' label from './data/TM_clusters_map.csv'. Later cells
        that index by 'map' depend on this, and calling the method a second time on
        the already relabelled tables is not expected to work.

        Args:
            n_ranks: number of rank thresholds evaluated (0 .. n_ranks - 1).
            n_clusters: denominator for the fraction of clusters below a threshold.
                NOTE(review): 142 is presumably the number of labelled clusters — confirm.
        """
        metric_rank_df = pd.DataFrame(range(0, n_ranks), columns=['rank'])
        # mappings from cluster name to reference label
        cell_maps = pd.read_csv('./data/TM_clusters_map.csv')
        cell_maps.set_index('cluster', inplace=True)
        # original (pre-relabelling) row labels of the first table
        self.idx_true = self.dfs_all[list(self.dfs_all.keys())[0]].index
        for key, df in self.dfs_all.items():
            # regex=True is explicit because pandas >= 2.0 treats the pattern literally by default
            df.columns = df.index = df.columns.str.replace(';|-', '.', regex = True)
            df.columns = df.index = cell_maps.loc[df.columns, 'map']
            # per cluster: reference labels ordered by score, then positions of the cluster's own label
            ranks_all = df.T.reset_index().drop_duplicates().set_index('map').apply(lambda x:
                                                        x.sort_values(ascending = False).index,
                                                        axis = 0).reset_index(drop = True).apply(lambda x:
                                                        x[x.name == x].index.to_list())
            cluster_ranks = ranks_all.apply(lambda x:x[0])
            metric_rank_df[key] = metric_rank_df.loc[:,'rank'].apply(lambda x:
                                                                     (cluster_ranks < x).sum()/n_clusters)

        # rescale the thresholds to [0, 1] so the AUC is comparable across measures
        metric_rank_df['rank'] = metric_rank_df['rank']/(n_ranks - 1)
        var_name_ = 'Measure_Top N genes'
        self.plot_df = metric_rank_df.melt(id_vars = 'rank',
                                           var_name = var_name_,
                                           value_name='n_top_rank_clusters')

        # keys look like '<norm>_<n genes>'
        self.plot_df['norm'] = list(map(lambda x: x[0], self.plot_df[var_name_].str.split('_')))
        self.plot_df['n_top_genes'] = list(map(lambda x: str(x[1]), self.plot_df[var_name_].str.split('_')))
        self.plot_df['n_top_genes'] = self.plot_df.n_top_genes.astype('category')

        auc_ = self.plot_df.groupby(['norm',
                                'n_top_genes']).apply(lambda x:
                                    auc(x['rank'], x['n_top_rank_clusters'])).to_frame(name = 'auc').reset_index()

        self.plot_df = pd.merge(self.plot_df, auc_, how = 'left', on = ['norm', 'n_top_genes'])
        self.plot_df[var_name_ + ' | AUC'] = self.plot_df[var_name_] +  ' | ' + self.plot_df['auc'].round(3).astype(str)

    # investigating low performing clusters
    def get_ranks_for_df(self, df):
        """Return the best rank of each cluster's own label for one relabelled table.

        Args:
            df: score table whose column axis is named 'map' (as left by ``get_ranks``).

        Returns:
            ``(ranks, df)`` where ``ranks`` has one 'rank' column and is indexed by
            ``self.idx_true`` (whose name is set to 'cluster' as a side effect).
        """
        idx_true = self.idx_true
        idx_true.name = 'cluster'

        ranks_all = df.T.reset_index().drop_duplicates().set_index('map').apply(lambda x:
                                                                              x.sort_values(ascending = False).index,
                                     axis = 0).reset_index(drop = True).apply(lambda x:
                                                                              x[x.name == x].index.to_list())

        ranks_all = ranks_all.apply(lambda x: x[0]).to_frame(name = 'rank')
        ranks_all.index = idx_true

        return ranks_all, df

In [None]:
def get_score_normalized(data, use_score = 'localScore'):
    """Pivot one literature score into a gene x cluster matrix and z-score each column.

    Side effects: adds/overwrites the 'Score' and 'cluster' columns on `data` itself.

    Args:
        data: frame with 'geneName', 'cellType', 'tissueType' and 'localScoreCellType'
            columns; each 'localScoreCellType' entry is indexed with `use_score`.
        use_score: key to read from every 'localScoreCellType' entry.

    Returns:
        (normalized, raw): both indexed by geneName with one column per cluster.
    """
    # Pull the requested score out of every per-row entry.
    data['Score'] = list(map(lambda entry: entry[use_score], data.localScoreCellType))

    # Cluster label is '<cellType>;<tissueType>' with spaces turned into underscores.
    cell_part = data.cellType.str.replace(' ', '_')
    tissue_part = data.tissueType.str.replace(' ', '_')
    data['cluster'] = cell_part + ';'+tissue_part

    raw_scores = data.pivot(index = 'geneName', columns = 'cluster', values = 'Score')

    # Column-wise standardization: subtract each cluster's mean, divide by its std.
    centered = raw_scores.sub(raw_scores.mean(), axis = 'columns')
    standardized = centered.div(raw_scores.std(), axis = 'columns')
    normalized_scores = pd.DataFrame(standardized,
                                     columns = raw_scores.columns,
                                     index = raw_scores.index)
    return normalized_scores, raw_scores

In [None]:
# Local-score matrices (normalized, raw). The call also writes 'Score' and 'cluster'
# columns onto data_all in place; the model's groupby('cluster') relies on that column.
gene_cluster_local_score_norm, gene_cluster_local_score = get_score_normalized(data_all)

In [None]:
%%time
# Build and run the model on the local-score matrices; run() fills c_local.dfs_all
# and c_local.plot_df.
c_local = Literature_model(data = data_all,score_mat_true = gene_cluster_local_score,
                           score_mat_normalized=gene_cluster_local_score_norm, use_score='localScore')
c_local.run()

In [None]:
# Global-score matrices (normalized, raw). This overwrites data_all['Score'] in place
# with the 'globalScore' values.
gene_cluster_global_score_norm, gene_cluster_global_score = get_score_normalized(data_all, use_score = 'globalScore')

In [None]:
%%time
# Build and run the model on the global-score matrices. `use_score` is labelled
# 'globalScore' to match the matrices actually passed in.
c_global = Literature_model(data = data_all,score_mat_true = gene_cluster_global_score,
                           score_mat_normalized=gene_cluster_global_score_norm, use_score='globalScore')
c_global.run()

### try to fit a model to see how different norms are performing? idk..

In [None]:
def get_ranks_for_df(df, true_idx):
    """Find, for every cluster, the best rank at which its own label appears.

    Args:
        df: score table whose column axis is named 'map'; each row is a cluster to
            label, each column a reference label.
        true_idx: index to put on the result; its name is set to 'cluster' in place.

    Returns:
        (ranks, df): `ranks` has a single 'rank' column indexed by `true_idx`;
        `df` is handed back unchanged.
    """
    def _labels_by_score(scores):
        # reference labels ordered from highest to lowest score
        return scores.sort_values(ascending = False).index

    def _own_label_positions(ordered):
        # positions at which the ordered labels equal the column's own label
        return ordered[ordered.name == ordered].index.to_list()

    true_idx.name = 'cluster'

    by_reference = df.T.reset_index().drop_duplicates().set_index('map')
    ordered_labels = by_reference.apply(_labels_by_score, axis = 0).reset_index(drop = True)
    hit_positions = ordered_labels.apply(_own_label_positions)

    # keep only the first (best) position per cluster
    best_rank = hit_positions.apply(lambda hits: hits[0]).to_frame(name = 'rank')
    best_rank.index = true_idx

    return best_rank, df

In [None]:
def get_all_ranks(dfs, true_idx):
    """Collect the per-cluster best rank of every score table into one frame.

    Args:
        dfs: dict mapping a measure name to its relabelled score table.
        true_idx: cluster index shared by all tables (row order is assumed to match).

    Returns:
        DataFrame indexed by `true_idx` with one integer rank column per key of `dfs`.
    """
    ranks_all = pd.DataFrame(0, columns = list(dfs.keys()), index = true_idx)
    for key, df in dfs.items():
        ranks, _ = get_ranks_for_df(df, true_idx)
        # Assign the 'rank' column positionally: putting the one-column frame into
        # `.loc[:, key]` depends on pandas aligning a column named 'rank' with `key`.
        ranks_all[key] = ranks['rank'].to_numpy()

    return ranks_all

In [None]:
# Best rank per cluster for every global-score table; requires c_global.run() to have
# set c_global.idx_true.
ranks_all_global = get_all_ranks(c_global.dfs_all, c_global.idx_true)

In [None]:
# Persist the global-score rank table.
ranks_all_global.to_csv('./data/rank_all_norms_by_globalScore.csv')

In [None]:
# Cluster-name lookup table, keyed by its 'cluster' column.
cell_maps = pd.read_csv('./data/TM_clusters_map.csv').set_index('cluster')

In [None]:
def get_ranks(dfs_all, score_type, ranks = False):
    """Rank each cluster's own label per score table, or build rank-threshold curves.

    Args:
        dfs_all: dict mapping '<norm>_<n genes>' to a score table whose column axis
            is named 'map' (rows: clusters to label, columns: reference labels).
        score_type: label written to the 'score' column when `ranks` is True.
        ranks: when True, return the per-cluster ranks of EVERY table stacked in one
            frame (columns: the cluster label, 'rank', 'Measure_top N genes', 'score').
            When False, return the curve table used for plotting, with one AUC per
            '<norm>_<n genes>' key.

    Returns:
        DataFrame as described above.
    """
    def _best_rank_per_cluster(df):
        # For each cluster: order reference labels by descending score and take the
        # first position at which the cluster's own label occurs.
        ordered = df.T.reset_index().drop_duplicates().set_index('map').apply(lambda x:
                                                    x.sort_values(ascending = False).index,
                                                    axis = 0).reset_index(drop = True)
        hits = ordered.apply(lambda x: x[x.name == x].index.to_list())
        return hits.apply(lambda x: x[0]).to_frame(name = 'rank')

    if ranks:
        per_key = []
        for key, df in dfs_all.items():
            cluster_ranks = _best_rank_per_cluster(df)
            cluster_ranks['Measure_top N genes'] = key
            cluster_ranks['score'] = score_type
            per_key.append(cluster_ranks.reset_index())
        if not per_key:
            return pd.DataFrame(columns = ['rank', 'Measure_top N genes', 'score'])
        return pd.concat(per_key, ignore_index = True)

    # NOTE(review): 96 thresholds and the 142 denominator are fixed here; 142 is
    # presumably the number of labelled clusters — confirm.
    metric_rank_df = pd.DataFrame(range(0,96), columns=['rank'])
    for key, df in dfs_all.items():
        cluster_ranks = _best_rank_per_cluster(df)['rank']
        metric_rank_df[key] = metric_rank_df.loc[:,'rank'].apply(lambda x:
                                        (cluster_ranks < x).sum()/142)

    # Everything below runs once, after all keys were added; rescaling 'rank' inside
    # the loop would compound the division for every additional key.
    metric_rank_df['rank'] = metric_rank_df['rank']/95
    var_name_ = 'Measure_Top N genes'
    plot_df = metric_rank_df.melt(id_vars = 'rank',
                                  var_name = var_name_,
                                  value_name='n_top_rank_clusters')

    # keys look like '<norm>_<n genes>'
    plot_df['norm'] = list(map(lambda x: x[0], plot_df[var_name_].str.split('_')))
    plot_df['n_top_genes'] = list(map(lambda x: str(x[1]), plot_df[var_name_].str.split('_')))
    plot_df['n_top_genes'] = plot_df.n_top_genes.astype('category')

    auc_ = plot_df.groupby(['norm',
                            'n_top_genes']).apply(lambda x:
                                auc(x['rank'], x['n_top_rank_clusters'])).to_frame(name = 'auc').reset_index()

    plot_df = pd.merge(plot_df, auc_, how = 'left', on = ['norm', 'n_top_genes'])
    plot_df[var_name_ + ' | AUC'] = plot_df[var_name_] +  ' | ' + plot_df['auc'].round(3).astype(str)

    return plot_df

In [None]:
# Row-wise (axis = 1) normalization of the L1_25 score tables. `normalize` is reached
# through the sklearn `preprocessing` module imported at the top of the notebook;
# the bare name `normalize` is not imported anywhere.
norm_local = pd.DataFrame(preprocessing.normalize(c_local.dfs_all['L1_25'], axis = 1),
                          columns= c_local.dfs_all['L1_25'].columns,
                          index = c_local.dfs_all['L1_25'].index)

norm_global = pd.DataFrame(preprocessing.normalize(c_global.dfs_all['L1_25'], axis = 1),
                          columns= c_global.dfs_all['L1_25'].columns,
                          index = c_global.dfs_all['L1_25'].index)

In [None]:
# Weighted blend of the local and global L1_25 tables. With the weights as written
# (0.0 local, 1.0 global) the result equals the global table wherever the local
# values are finite.
norm_all = (0.0*c_local.dfs_all['L1_25'] + 1.0*c_global.dfs_all['L1_25']).copy(deep = True)

In [None]:
# Curve table (ranks defaults to False) for the blended scores; the second argument
# is only used when ranks=True.
ranks_norm_sum = get_ranks({'L1_25': norm_all}, 'norm')

In [None]:
c_global.plot_df[c_global.plot_df['Measure_Top N genes'] == 'L1_25']

In [None]:
ranks_norm_sum

In [None]:
c_local.dfs_all['L1_05'].T.reset_index().drop_duplicates()

In [None]:
ranks_norm_sum

In [None]:
# NOTE(review): Literature_model.__init__ only sets gene_cluster_score_norm and
# gene_cluster_score_true; no `gene_cluster_score` attribute is assigned in the visible
# class, so this line looks like it raises AttributeError — confirm the intended matrix.
ranks_norm_sum['map'] = c_local.gene_cluster_score.columns

In [None]:
# Write the table sorted by descending 'rank'.
ranks_norm_sum.sort_values(by = 'rank', ascending = False).to_csv('./data/ranks_normalized.csv')

In [None]:
# NOTE(review): get_ranks iterates `dfs_all.items()` expecting a dict of tables, but
# norm_global is a single DataFrame, and the result is named/labelled 'local' although
# it is built from the global table. Later cells also expect the ranks=True output
# ('map', 'Measure_top N genes', 'score' columns). Confirm the intended call.
ranks_local = get_ranks(norm_global, 'local')

In [None]:
# NOTE(review): called without ranks=True, so this returns the curve table; the merge
# further down expects 'map' and 'Measure_top N genes' columns, which only the
# ranks=True output carries.
ranks_global = get_ranks(c_global.dfs_all, 'global')

In [None]:
# Long-format view of the local ranks; requires 'map', 'Measure_top N genes' and
# 'score' columns on ranks_local. `r` is not referenced by any later visible cell.
r = ranks_local.reset_index().melt(id_vars = ['map', 'Measure_top N genes', 'score'],
                                   value_vars = 'rank', value_name = 'rank')

In [None]:
# Side-by-side local vs global ranks per (label, measure); overlapping columns such as
# 'rank' get the '_local' / '_global' suffixes.
ranks_all = pd.merge(ranks_local, ranks_global, on = ['map', 
                                                'Measure_top N genes'], suffixes=['_local', '_global'])

In [None]:
ranks_all['Measure_top N genes'].value_counts()

In [None]:
# NOTE(review): these imports live mid-notebook rather than in the import cell at the top;
# `go` is not used in this cell.
import plotly.express as px
import plotly.graph_objects as go

# Scatter of local vs global rank for the L1_25 measure; the label is shown on hover.
fig = px.scatter(ranks_all.loc[ranks_all['Measure_top N genes'] == 'L1_25'],
                 x='rank_local', y='rank_global', hover_data=['map'])
fig.update_traces(textposition='top center')
fig.update_layout(height = 800,
    title_text='Literature ranks loval vs global'
)
fig.show()

In [None]:
# Save the interactive scatter as a standalone HTML file.
fig.write_html('./plots/lit_local_vs_global_ranks.html')

In [None]:
ranks_norm_sum

In [None]:
sns.set()
# NOTE(review): var_name_ is assigned but only used by the commented-out hue arguments below.
var_name_ =  'Measure_Top N genes | AUC'
# This rebinds `fig`, which previously held the plotly figure.
fig = plt.figure(figsize=(14,10))

# Single curve: fraction of clusters below each rank threshold for the blended scores.
ax = sns.lineplot(data = ranks_norm_sum, 
             x = 'rank', 
             y = 'n_top_rank_clusters')
#              hue=var_name_, hue_order=sorted(ranks_norm_sum[var_name_].unique()),
#              palette=sns.color_palette("Paired", 15))
# plt.setp(ax.get_legend().get_texts(), fontsize='15') # for legend text
# plt.setp(ax.get_legend().get_title(), fontsize='20')
ax.axes.set_title("Literature Based Cluster Predictions (Tabula Muris; Global Score)",fontsize=25)
ax.set_xlabel("Rank Threshold",fontsize=20)
ax.set_ylabel("Fractions of Clusters Labeled Correctly",fontsize=20)

In [None]:
sns.set()
def plot_auc(df, score):
    """Draw one rank-threshold curve per '<measure> | AUC' label.

    Args:
        df: curve table with 'rank', 'n_top_rank_clusters' and
            'Measure_Top N genes | AUC' columns (e.g. a model's ``plot_df``).
        score: text inserted into the figure title (e.g. 'Local', 'Global').
    """
    var_name_ =  'Measure_Top N genes | AUC'
    hue_levels = sorted(df[var_name_].unique())
    fig, ax = plt.subplots(figsize=(14,10))

    # Size the palette to the number of curves instead of assuming exactly 12.
    sns.lineplot(data = df,
                 x = 'rank',
                 y = 'n_top_rank_clusters',
                 hue=var_name_, hue_order=hue_levels,
                 palette=sns.color_palette("Paired", len(hue_levels)),
                 ax = ax)
    legend = ax.get_legend()
    plt.setp(legend.get_texts(), fontsize='15') # for legend text
    plt.setp(legend.get_title(), fontsize='20')
    ax.set_title("Literature Based Cluster Predictions (Tabula Muris; %s Score)"%score,fontsize=25)
    ax.set_xlabel("Rank Threshold",fontsize=20)
    ax.set_ylabel("Fractions of Clusters Labeled Correctly",fontsize=20)

In [None]:
# Curves for the local-score model.
plot_auc(c_local.plot_df, score = 'Local')

In [None]:
# NOTE(review): this saves pyplot's *current* figure from a separate cell; depending on
# the notebook backend the figure drawn by the previous cell may already be closed,
# which would write a blank image — confirm, or save inside the plotting cell.
plt.savefig('./plots/lit_prredictions_auc.png')

In [None]:
# Curves for the global-score model.
plot_auc(c_global.plot_df, score = 'Global')

### some exploration

In [None]:
# Top-5 genes by cohenD for every cluster. get_top_genes_vector is only defined as a
# method of Literature_model, so it is called through the existing c_local instance
# (the method does not read any instance state).
cluster_mean_vectors = data_all.groupby('cluster').apply(lambda x: c_local.get_top_genes_vector(x, top_n = 5))

In [None]:
cluster_mean_vectors

In [None]:
# lc.columns = lc.columns.str.replace(';|-', '.') 

# rank_df.to_csv('./data/ranks_l1_25_global_score.csv')

# l = df.T.reset_index().drop_duplicates().set_index('map').apply(lambda x: 
#                                                                           x.sort_values(ascending = False).index, 
#                                  axis = 0).reset_index(drop = True).iloc[0:9, -1].to_list()

# df.T.reset_index().drop_duplicates().set_index('map').apply(lambda x: 
#                                                                           x.sort_values(ascending = False).index, 
#                                  axis = 0).reset_index(drop = True)

# cell_maps.loc[cell_maps['map'].isin(l)]
# lc = pd.read_csv('./data/local_score_top_25_type_II_pneumocytes.csv', index_col = 0)

### Previous

In [None]:
# NOTE(review): slice(x) is slice(None, x), so this is a label slice from the first row
# through 'Alveolar_macrophages;lung', not a lookup of that cluster alone — confirm intent.
cluster_mean_vectors.loc[slice('Alveolar_macrophages;lung')]

In [None]:
collection =  'patrick-cells-pl-200304'
n = 10
def query_lab(gene_list, n=n, collection=collection, window = 101):
    """Query the signals API for the first `n` genes and pivot scores to gene x token.

    Args:
        gene_list: sliceable sequence of gene names; only ``gene_list[0:n]`` is queried.
        n: number of genes to query (default bound to the module-level `n` at definition).
        collection: control collection passed to ``nf.get_signals_lab``.
        window: window passed to ``nf.get_signals_lab``.

    Returns:
        float32 DataFrame indexed by 'gene' with one column per 'token'; an empty
        DataFrame when no gene produced a result.
    """
    frames = []
    for gene in tqdm(gene_list[0:n]):
        res = nf.get_signals_lab(query = gene, control_collection = collection,
                                 window = window)
        # A string result is skipped (presumably an error/"no result" message from the
        # API — confirm); anything else is treated as tabular records.
        if isinstance(res, str):
            continue
        gene_frame = pd.DataFrame(res)
        gene_frame['gene'] = gene
        frames.append(gene_frame)

    if not frames:
        # Nothing to pivot; avoid a KeyError on the missing 'gene'/'token'/'score' columns.
        return pd.DataFrame(dtype = 'float32')

    # One concat after the loop instead of re-copying the accumulated frame per gene.
    result_df = pd.concat(frames)
    return result_df.pivot(index = 'gene', columns = 'token', values = 'score').astype('float32')

In [None]:
# NOTE(review): `top_genes` is not defined in any visible cell; this only works with
# state left over from an earlier session.
df_test = top_genes.iloc[0:1].apply(query_lab)

In [None]:
# One query_lab result per entry of `top_genes` (which is not defined in any visible cell).
df_all = top_genes.apply(query_lab)

In [None]:
# Flat set of every token that appears in any synonym group.
# NOTE(review): `syns` is only built in a cell further down the notebook, so this cell
# fails on a fresh top-to-bottom run.
syns_all  =  set([tok for ls in syns for tok in ls])
def sort_df(df, q = [0.3, 0.5, 0.75], by = [0.75, 0.5, 0.3]):
    """Collapse synonym tokens into one entry each and sort tokens by score quantiles.

    Reads the module-level `syns`, `syns_all` and `n`.

    Args:
        df: score table that is transposed on entry, so its columns become the rows
            handled here (presumably gene x token as returned by query_lab — confirm).
        q: quantiles computed for every collapsed token.
        by: quantile columns used as sort keys, in priority order.

    Returns:
        One row per collapsed token: the quantile columns followed by the collapsed
        values, sorted descending by `by`, with duplicate rows dropped.
    """
    df_collapsed = pd.DataFrame()
    # The transpose creates a new frame, so the insert below does not touch the caller's df.
    df = df.T
    # Row labels that belong to no synonym group.
    no_match = df.index.difference(syns_all)
#     print(no_match)
    # 'count' = n minus the number of NaNs in the row.
    df.insert(loc = 0, column = 'count', value = n - df.isna().sum(1))
    for i, syn in tqdm(enumerate(syns)):
        df_subset = df[df.index.isin(syn)]
        if not df_subset.empty:
#             print(i)
#             print('\n',df_subset.index)
            
            # Represent the group by the label with the highest count and the
            # column-wise maximum over all members of the group.
            df_idx = df_subset['count'].idxmax()
            df_max = df_subset.max()
            df_max.name = df_idx
            df_collapsed = pd.concat([df_collapsed, df_max], axis=1)
    # Tokens without a synonym group are carried over unchanged as extra columns.
    if not df.loc[no_match,:].empty:
        df_collapsed = pd.concat([df_collapsed, df.loc[no_match,:].T], axis = 1)
    # Quantiles are taken down each column (one column per collapsed token) and
    # stacked on top, so after the transpose they are the leading columns.
    quantiles = df_collapsed.quantile(q, axis = 0)
    df_collapsed = pd.concat([quantiles, df_collapsed]).T
    df_collapsed = df_collapsed.sort_values(by = by, ascending = False)
    
    return df_collapsed.drop_duplicates()

In [None]:
df_all[0].T

In [None]:
# Apply sort_df to every element obtained by iterating df_all.
sorted_dfs = list(map(sort_df, df_all))

In [None]:
# Columns of the first result that belong to no synonym group; `z` is not used by any
# later visible cell.
z = df_all[0].columns.difference(syns_all)

In [None]:
# Spot check of a single column in the first result.
df_all[0].loc[:,'yolk_sac_derived_macrophages']

In [None]:
# Stack the leading tokens of every sorted table into one long frame.
# NOTE(review): the frame is grown with concat on every iteration; collecting the
# pieces in a list and concatenating once would avoid repeated copying.
lit_df = pd.DataFrame()
for i, idx in enumerate(top_genes.index):
    df = sorted_dfs[i].copy(deep = True)
#     df.insert(loc = 0, column = 'count', value = n - df.isna().sum(1))
    # idx is indexed as a pair: element 0 is stored as 'tissue', element 1 as 'true_cell'.
    df.insert(loc = 0, column = 'tissue', value = idx[0])
    df.insert(loc = 0, column = 'true_cell', value = idx[1])
    df = df.reset_index(drop = False, inplace = False)
    # The column created by reset_index is renamed to 'token'.
    df.columns = ['token'] + df.columns[1:].to_list()
#     df = df[df['count'] > 3]
    # .loc[0:10] is label-based and inclusive, so up to 11 rows are kept per table.
    lit_df = pd.concat([lit_df, df.loc[0:10, ['token', 'true_cell', 'tissue', 'count']]])

In [None]:
# Persist the stacked token table.
lit_df.to_csv('./data/t10_genes_per_cluster.csv')

In [None]:
lit_df

In [None]:
# Tab-separated file without a header row; column names are assigned in the next cell.
ees_tokens = pd.read_csv('./data/appendTokens.csv', sep = '\t', header=None)

In [None]:
# Name the two columns: a token and one of its synonyms.
ees_tokens.columns = ['token', 'syn']

In [None]:
# Series indexed by token whose values are the list of that token's 'syn' entries.
synonyms = ees_tokens.groupby('token').apply(lambda x: x.syn.to_list())

In [None]:
# One set per token: the token itself plus its synonyms. Series.items() is used because
# Series.iteritems() no longer exists in pandas >= 2.0.
all_syns = [set([idx]+syn) for idx, syn in synonyms.items()]

In [None]:
tokens = sorted_dfs[0].index
# Identity matrix labelled by the tokens on both axes. The labels must be `tokens`
# (whose length matches the matrix), not the leftover loop variable `idx`.
syn_mat = pd.DataFrame(np.eye(len(tokens)), index=tokens, columns=tokens)

In [None]:
# Indicator matrices of the token/synonym pairs in both orientations: 1 where the pair
# occurs in the file, 0 elsewhere.
ees_ = ees_tokens.copy(deep = True)
ees_['val'] = 1
ees_1 = pd.pivot_table(ees_, index='token', columns='syn', values = 'val').fillna(0)
ees_2 = pd.pivot_table(ees_, index='syn', columns='token', values = 'val').fillna(0)

In [None]:
ees_1.index.intersection(ees_2.index)

In [None]:
len(all_syns)

In [None]:
# For every synonym set, record which other sets overlap it: identical sets go to
# `duplicates`, partially overlapping ones to `subset`. Each overlapping pair is printed.
subset = dict((position, []) for position in range(len(all_syns)))
duplicates = dict((position, []) for position in range(len(all_syns)))
for i, syn_set in enumerate(all_syns):
    for j, syn_set_ in enumerate(all_syns):
        if i == j:
            continue
        if len(syn_set.intersection(syn_set_)) == 0:
            continue
        print('\n',i, j, '\nIntersection: ', syn_set.intersection(syn_set_))
        target = duplicates if syn_set_ == syn_set else subset
        target[i].append(j)
                
                

In [None]:
subset

In [None]:
# Build the merged synonym groups used by sort_df.
syns = []
for k, v in subset.items():
    if v:
        # Token k overlaps other groups: merge its synonyms with those of every
        # overlapping token (positions in `v`), plus the tokens themselves.
        syn_subset = synonyms.iloc[[k]+v]
        print(syn_subset.index)
        s = list(set(syn_subset.explode().to_list() + syn_subset.index.to_list()))
        # NOTE(review): `s` is a list built from a set, so two groups with the same
        # members may differ in element order and both be appended — confirm whether
        # comparing as sets was intended.
        if s not in syns:
            syns.append(s)
    else:
        # No overlap: the group is just the token and its own synonyms.
        syns.append(list(set(synonyms.iloc[k] + [synonyms.index[k]])))

In [None]:
syns

In [None]:
# NOTE(review): `d` is not assigned in any visible cell (it appears only in a
# commented-out line), so this relies on leftover kernel state.
d[d.index.isin(syns[1])]