In [None]:
'''
Launch cell to collect a database of 1100 articles categorized as 'science' from 
(more or less) reliable sources and save it to a dataframe
'''
import os

import pandas as pd
import requests

def make_call(cursor):
    """Fetch one page of 'science' stories from the Aylien News API.

    Credentials are read from the ``AYLIEN_APP_ID`` and ``AYLIEN_APP_KEY``
    environment variables so that no key (or hint about a key) has to be
    written in the notebook. When a variable is unset the placeholder
    ``'XXX'`` is sent, as before.

    Parameters
    ----------
    cursor : str
        Pagination cursor forwarded as the ``cursor`` query parameter.
        The notebook passes ``'*'`` for the initial call and the
        ``next_page_cursor`` of the previous response afterwards.

    Returns
    -------
    dict
        The decoded JSON body of the response. Callers in this notebook
        read its ``"stories"`` and ``"next_page_cursor"`` entries.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (e.g. bad credentials),
        instead of failing later with a confusing ``KeyError``.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    headers = {'X-Application-Id' : os.environ.get('AYLIEN_APP_ID', 'XXX'),
               'X-Application-Key': os.environ.get('AYLIEN_APP_KEY', 'XXX')
                }

    params = {
            'language':  ['en'],
            'body' : ' Dr.',
            'published_at.start' : '2021-01-15T00:00:00Z',
            'published_at.end' : '2021-04-07T00:00:00Z',
            'categories.taxonomy' : 'iptc-subjectcode',
            'categories.id':'13000000', #corresponds to the 'science' category
            'source_rankings_alexa_rank_min': 300,
            #'source.name' : ['cnn','bbc'],
            'social_shares_count_linkedin_min': 10000,
            'source_links_in_count_min': 10000,
            'cursor' : cursor,
            'per_page' : 100
            }

    url = 'https://api.aylien.com/news/stories'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers=headers, params=params, timeout=30)
    r.raise_for_status()
    return r.json()

# Initial call: '*' is the cursor value used to request the first page.
response = make_call('*')
next_page = response["next_page_cursor"]
all_stories = response["stories"]
lst_sources = []  # not used by the cells shown here; kept so the notebook namespace is unchanged

# Fetch up to 10 further pages of 100 articles each. Stop as soon as a page
# comes back empty: there is nothing more to collect and further calls would
# only spend API quota.
for page in range(10):
    response = make_call(next_page)
    stories = response["stories"]
    if not stories:
        break
    all_stories += stories
    next_page = response["next_page_cursor"]

# Keep only the fields of preliminary interest. Rows are gathered in a list
# and converted to a DataFrame once: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
columns = ['body','title','source','publication_date','words_count','hashtags','keywords']
records = [
    {'body': story['body'],
     'title': story['title'],
     'source': story['source']['name'],
     'publication_date': story['published_at'],
     'words_count': story['words_count'],
     # The lists are flattened to a plain comma-separated string by
     # stripping the list punctuation (and '#' for hashtags) from their repr.
     'hashtags': str(story['hashtags']).replace('#','').replace("\'",'').replace('[','').replace(']',''),
     'keywords': str(story['keywords']).replace("\'",'').replace('[','').replace(']','')
    }
    for story in all_stories
]
df = pd.DataFrame(records, columns=columns)

df.to_csv('articles_data.csv')

In [None]:
#!pip3 install spacy
#!python3 -m spacy download en_core_web_sm
#!pip3 install transformers

In [None]:
import pandas as pd
import regex as re
import spacy
from transformers import *
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# spaCy pipeline used for part-of-speech tagging of the article keywords.
spacy_nlp = spacy.load('en_core_web_sm')

# The tokenizer and the encoder are both loaded from the same SciBERT checkpoint.
scibert_tokenizer, scibert_model = (
    loader.from_pretrained('allenai/scibert_scivocab_uncased')
    for loader in (AutoTokenizer, AutoModel)
)

In [None]:
# "Dr" or "Dr." as a whole word, followed by two capitalised words that are
# captured as first and last name. The dot is escaped so that words such as
# "Dry" or "Drs" are no longer mistaken for the title.
_DOCTOR_PATTERN = re.compile(r"\bDr\.? ([A-Z]\w+) ([A-Z]\w+)\b")


def extract_doctors(string):
    """Return the name of the first doctor mentioned in ``string``.

    A mention is "Dr" or "Dr." followed by two capitalised words, e.g.
    "Dr. Jane Goodall"; the title is dropped from the result.

    Parameters
    ----------
    string : str
        Text to search. Any non-string value (such as the NaN pandas uses
        for a missing article body) is treated as containing no mention.

    Returns
    -------
    str or None
        "First Last" for the first mention found, or None when there is none.
    """
    if not isinstance(string, str):
        return None
    match = _DOCTOR_PATTERN.search(string)
    if match is None:
        return None
    return ' '.join(match.groups())

def extract_relevant_keywords(string):
    """Return the distinct nouns found in ``string``.

    The text is run through the module-level ``spacy_nlp`` pipeline and only
    tokens tagged ``NOUN`` are kept.

    Parameters
    ----------
    string : str
        Text to analyse. Any non-string value (such as the NaN that
        ``read_csv`` produces for an empty keywords cell) yields an empty list
        instead of raising inside spaCy.

    Returns
    -------
    list of str
        The nouns without duplicates, in order of first appearance.
    """
    if not isinstance(string, str):
        return []
    doc = spacy_nlp(string)
    nouns = (token.text for token in doc if token.pos_ == 'NOUN')
    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike list(set(...)).
    return list(dict.fromkeys(nouns))

def create_dict_expertise(dataframe):
    """Group the keywords of all articles by the doctor they mention.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'doctors_mentioned'`` column (one name per row) and a
        ``'relevant_keywords'`` column (an iterable of keywords per row).

    Returns
    -------
    dict
        Maps each doctor name to the set of all keywords found in the rows
        that mention that doctor.
    """
    dict_expertise = {}
    # Iterating over the two columns directly avoids a .loc lookup per row and
    # also works when the index contains duplicate labels.
    pairs = zip(dataframe['doctors_mentioned'], dataframe['relevant_keywords'])
    for doctor, keywords in pairs:
        dict_expertise.setdefault(doctor, set()).update(keywords)
    return dict_expertise

def vectorize_keyword_expertise(dict_expertise):
    """Embed every keyword of every doctor and average them per doctor.

    Each keyword is encoded with the module-level ``scibert_tokenizer`` and
    passed through ``scibert_model``.

    Parameters
    ----------
    dict_expertise : dict
        Maps a doctor name to an iterable of keyword strings.

    Returns
    -------
    dict
        Maps each doctor name to a dict with two entries:
        ``'average_expertise_vector'`` (NumPy array, mean of the keyword
        vectors) and ``'keywords_expertise'`` (list of ``(keyword, vector)``
        tuples, the vector being a torch tensor). Doctors without any keyword
        are left out, because the mean of zero vectors is NaN and cannot be
        compared with a query embedding.
    """
    new_dict = {}
    for key, values in dict_expertise.items():
        if not values:
            continue
        lst_tuples = []
        # Inference only: no_grad avoids building an autograd graph per keyword.
        with torch.no_grad():
            for keyword in values:
                token_keyword = torch.tensor(scibert_tokenizer.encode(keyword)).unsqueeze(0)
                out = scibert_model(token_keyword)
                # First model output, single sequence of the batch, position 1.
                # NOTE(review): presumably position 0 is the special start
                # token, so this keeps only the first wordpiece of the
                # keyword; longer keywords are not averaged. Confirm intent.
                word_vector = out[0][0][1]
                lst_tuples.append((keyword, word_vector))
        all_expertise_vectors = np.array([vector.detach().numpy() for _, vector in lst_tuples])
        mean_expertise_vector = np.mean(all_expertise_vectors, axis=0)
        new_dict[key] = {'average_expertise_vector': mean_expertise_vector,
                         'keywords_expertise': lst_tuples}
    return new_dict

In [None]:
MAX_ARTICLES = 50  # cap on the number of articles processed, to keep the run time reasonable

articles_df = pd.read_csv('articles_data.csv',index_col=0)
articles_df['doctors_mentioned'] = articles_df['body'].apply(extract_doctors)
# Keep only the articles that mention a doctor, limited to MAX_ARTICLES rows.
# .copy() makes the result an independent frame, so adding a column below
# does not raise SettingWithCopyWarning.
articles_df = articles_df[articles_df["doctors_mentioned"].notnull()][:MAX_ARTICLES].copy()
articles_df['relevant_keywords'] = articles_df['keywords'].apply(extract_relevant_keywords)
dict_expertise = create_dict_expertise(articles_df)
dict_expertise_with_average = vectorize_keyword_expertise(dict_expertise)

In [None]:
search_terms = 'climate change caused by chem trails'

# Embed the query with the same model as the keywords and average the token
# vectors, leaving out the first and last positions of the sequence.
token_keyword = torch.tensor(scibert_tokenizer.encode(search_terms)).unsqueeze(0)
with torch.no_grad():  # inference only, no autograd graph needed
    out = scibert_model(token_keyword)
embeddings = out[0][0][1:-1]
average_embedding_request = np.mean(embeddings.detach().numpy(),axis=0)

# Look for the doctor whose average expertise vector is closest to the query.
# Starting from -inf (rather than 0) means the best candidate is always kept,
# even if every similarity is zero or negative.
max_cos_similarity = float('-inf')
corresponding_key = None
for key,value in dict_expertise_with_average.items():
    cos_similarity = cosine_similarity(average_embedding_request.reshape(1,-1),value['average_expertise_vector'].reshape(1,-1))[0][0]
    if cos_similarity > max_cos_similarity:
        max_cos_similarity = cos_similarity
        corresponding_key = key

# Report the best match, not the last doctor visited by the loop.
if corresponding_key is None:
    print('No expert available for comparison')
else:
    print(corresponding_key, [elem[0] for elem in dict_expertise_with_average[corresponding_key]['keywords_expertise']])
