In [53]:
import pandas as pd
import regex as re
import spacy
from transformers import *
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Names of the pretrained pipelines, defined once so that the tokenizer and
# the model are always loaded from the same SciBERT checkpoint.
SPACY_MODEL_NAME = 'en_core_web_sm'
SCIBERT_MODEL_NAME = 'allenai/scibert_scivocab_uncased'

# spaCy pipeline used for part-of-speech tagging of the article keywords.
spacy_nlp = spacy.load(SPACY_MODEL_NAME)
# SciBERT tokenizer / encoder pair used to embed keywords and search queries.
scibert_tokenizer = AutoTokenizer.from_pretrained(SCIBERT_MODEL_NAME)
scibert_model = AutoModel.from_pretrained(SCIBERT_MODEL_NAME)

In [43]:
def extract_doctors(string):
    """Return the name of the first doctor mentioned in ``string``.

    A mention is the title ``Dr`` or ``Dr.`` followed by a single space and
    two capitalised words separated by a single space
    (e.g. ``"Dr. Anthony Fauci"``).

    Parameters
    ----------
    string : str
        Text to search. Non-string values (``None``, or the ``NaN`` pandas
        uses for missing cells) are treated as containing no mention.

    Returns
    -------
    str or None
        The two name words joined by a space, without the title, or ``None``
        when no mention is found.
    """
    if not isinstance(string, str):
        return None
    # The dot is escaped so that only a literal "." may follow "Dr" (an
    # unescaped "." lets words such as "Dry" pass as a title), and the leading
    # \b stops the title from matching in the middle of a longer word.
    pattern = re.compile(r"\bDr\.? ([A-Z]\w+) ([A-Z]\w+)\b")
    match = pattern.search(string)
    if match is None:
        return None
    return ' '.join(match.groups())

def extract_relevant_keywords(string):
    """Return the distinct nouns of ``string`` in order of first appearance.

    The text is run through the module-level ``spacy_nlp`` pipeline and every
    token whose part-of-speech tag is ``NOUN`` is kept.

    Parameters
    ----------
    string : str
        Text to analyse. Non-string values (``None``, or the ``NaN`` pandas
        uses for missing cells) yield an empty list.

    Returns
    -------
    list of str
        The noun token texts, each appearing once.
    """
    if not isinstance(string, str):
        return []
    nouns = (token.text for token in spacy_nlp(string) if token.pos_ == 'NOUN')
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is identical on every run (the order of list(set(...)) is arbitrary).
    return list(dict.fromkeys(nouns))

def create_dict_expertise(dataframe):
    """Group the keywords of all articles by the doctor they mention.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'doctors_mentioned'`` column (one hashable value per
        row) and a ``'relevant_keywords'`` column (one iterable of keywords
        per row).

    Returns
    -------
    dict
        Maps each value of ``'doctors_mentioned'`` to the set union of the
        ``'relevant_keywords'`` of every row carrying that value.
    """
    dict_expertise = {}
    # Walk the two columns positionally rather than through .loc[idx, ...]:
    # with repeated index labels .loc returns a Series instead of a scalar,
    # which cannot be used as a dictionary key.
    rows = zip(dataframe['doctors_mentioned'], dataframe['relevant_keywords'])
    for doctor, keywords in rows:
        dict_expertise.setdefault(doctor, set()).update(keywords)
    return dict_expertise

def vectorize_keyword_expertise(dict_expertise):
    """Embed every doctor's keywords with SciBERT and average them.

    Each keyword is encoded with the module-level ``scibert_tokenizer`` and
    passed through ``scibert_model``. The keyword vector is the mean of the
    model outputs over all positions except the first and the last, which is
    the same pooling the search query uses.
    NOTE(review): assumes ``out[0]`` is the per-token hidden-state tensor of
    shape (batch, tokens, hidden) and that ``encode`` wraps the input in one
    leading and one trailing special token, as Hugging Face BERT-style
    tokenizers do by default -- confirm for other models.

    Parameters
    ----------
    dict_expertise : dict
        Maps a doctor name to an iterable of keyword strings.

    Returns
    -------
    dict
        Maps each doctor name to a dict with two entries:
        ``'average_expertise_vector'`` (NumPy array, mean of the keyword
        vectors) and ``'keywords_expertise'`` (list of
        ``(keyword, torch.Tensor)`` tuples). Doctors for whom no keyword
        could be embedded are left out, because a mean over zero vectors
        would be NaN and unusable for similarity search.
    """
    new_dict = {}
    # Inference only: without no_grad every stored vector would keep the
    # autograd graph of its forward pass alive.
    with torch.no_grad():
        for key, values in dict_expertise.items():
            lst_tuples = []
            for keyword in values:
                token_keyword = torch.tensor(scibert_tokenizer.encode(keyword)).unsqueeze(0)
                out = scibert_model(token_keyword)
                # Drop the first and last positions and pool the remaining
                # wordpieces, so multi-piece keywords are not represented by
                # their first fragment only.
                wordpiece_vectors = out[0][0][1:-1]
                if wordpiece_vectors.shape[0] == 0:
                    continue
                lst_tuples.append((keyword, wordpiece_vectors.mean(dim=0)))
            if not lst_tuples:
                continue
            all_expertise_vectors = np.array([vector.numpy() for _, vector in lst_tuples])
            mean_expertise_vector = np.mean(all_expertise_vectors, axis=0)
            new_dict[key] = {
                'average_expertise_vector': mean_expertise_vector,
                'keywords_expertise': lst_tuples,
            }
    return new_dict

In [55]:
# Upper bound on the number of doctor-mentioning articles that are processed,
# to keep the embedding step from slowing things down too much.
MAX_ARTICLES = 50

articles_df = pd.read_csv('articles_data.csv', index_col=0)
articles_df['doctors_mentioned'] = articles_df['body'].apply(extract_doctors)
# Keep the first MAX_ARTICLES rows that mention a doctor. The explicit .copy()
# makes the result an independent frame, so adding a column below does not
# write into a slice of the filtered frame (SettingWithCopyWarning).
articles_df = articles_df[articles_df['doctors_mentioned'].notnull()].iloc[:MAX_ARTICLES].copy()
articles_df['relevant_keywords'] = articles_df['keywords'].apply(extract_relevant_keywords)
dict_expertise = create_dict_expertise(articles_df)
dict_expertise_with_average = vectorize_keyword_expertise(dict_expertise)

In [67]:
search_terms = 'climate change caused by chem trails'

# Embed the query: run it through SciBERT and average the outputs of every
# position except the first and the last one.
token_keyword = torch.tensor(scibert_tokenizer.encode(search_terms)).unsqueeze(0)
with torch.no_grad():  # inference only, no autograd graph needed
    out = scibert_model(token_keyword)
embeddings = out[0][0][1:-1]
average_embedding_request = np.mean(embeddings.detach().numpy(), axis=0)

# Find the doctor whose average expertise vector is closest to the query.
# Start from -inf rather than 0 so that a best match is still selected when
# every cosine similarity is zero or negative.
max_cos_similarity = -np.inf
corresponding_key = None
for key, value in dict_expertise_with_average.items():
    cos_similarity = cosine_similarity(
        average_embedding_request.reshape(1, -1),
        value['average_expertise_vector'].reshape(1, -1),
    )[0][0]
    if cos_similarity > max_cos_similarity:
        max_cos_similarity = cos_similarity
        corresponding_key = key

if corresponding_key is None:
    print('No doctor expertise available to match against.')
else:
    # Report the best match, not the loop variable `key`, which merely holds
    # the last doctor that was iterated over.
    print(
        corresponding_key,
        [elem[0] for elem in dict_expertise_with_average[corresponding_key]['keywords_expertise']],
    )
    
    

Anthony Fauci ['animals', 'COVID-19', 'host', 'reservoir', 'need', 'press', 'virus', 'president', 'humans', 'people', 'infection', 'origins', 'vaccine', 'diseases', 'experts', 'report', 'vaccinations', 'outlaw', 'coronavirus', 'secretary', 'transmission']
