In [85]:
import json
import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import itertools

In [3]:
path = 'articles_society.json'
# JSON text is UTF-8 by specification; pin the encoding so that parsing does
# not depend on the platform's locale default (e.g. cp1252 on Windows).
with open(path, encoding='utf-8') as infile:
    # json.load parses straight from the file object.
    articles = json.load(infile)

In [8]:
# For each location, flatten the article bodies of every source into one list.
articles_by_location = {
    location: [
        article['body']
        for source_payload in sources.values()
        for article in source_payload['articles']['results']
    ]
    for location, sources in articles.items()
}

In [22]:
def preprocess_text(text):
    """Lowercase ``text``, run it through ``unidecode.unidecode`` and split it.

    Returns the list of whitespace-separated tokens of the normalized string.
    """
    normalized = unidecode.unidecode(text.lower())
    return normalized.split()

In [25]:
# Merge every article of a location into a single space-separated document.
articles_combined_by_location = {
    location: ' '.join(bodies)
    for location, bodies in articles_by_location.items()
}

In [49]:
# Each location's combined text is one document, so tfidf_vectors has one row
# per location, in the iteration order of articles_combined_by_location.
vectorizer = TfidfVectorizer(stop_words='english', strip_accents='unicode')
tfidf_vectors = vectorizer.fit_transform([*articles_combined_by_location.values()])

In [35]:
# Show the location order: the documents passed to fit_transform were taken
# from this same dict, so key i here corresponds to row i of tfidf_vectors.
articles_combined_by_location.keys()

dict_keys(['Venezuela', 'France', 'Hong Kong', 'Iran'])

In [63]:
def extract_df_idf(index, vectorizer, tfidf_vectors):
    """Return the TF-IDF scores of one document as a sorted DataFrame.

    Parameters
    ----------
    index : int
        Row of ``tfidf_vectors`` (i.e. which document) to extract.
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (or the legacy
        ``get_feature_names()``) to label the columns of ``tfidf_vectors``.
    tfidf_vectors
        Sparse document-term matrix; row ``index`` is converted to dense form.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term (the index), a single ``"tfidf"`` column,
        sorted by score in descending order.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement but keep working on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray column instead of a numpy.matrix.
    scores = tfidf_vectors[index].T.toarray()
    df = pd.DataFrame(scores, index=feature_names, columns=["tfidf"])
    return df.sort_values(by=["tfidf"], ascending=False)

# Row i of tfidf_vectors is the i-th key of articles_combined_by_location.
# Look the row up by location name instead of hard-coding 0..3, so a change
# in the JSON key order raises/relabels correctly rather than silently
# attributing one location's keywords to another.
_row_of = {location: row for row, location in enumerate(articles_combined_by_location)}

# VENEZUELA
venezuela_kw = extract_df_idf(_row_of['Venezuela'], vectorizer, tfidf_vectors)
# FRANCE
france_kw = extract_df_idf(_row_of['France'], vectorizer, tfidf_vectors)
# HONG KONG
hong_kong_kw = extract_df_idf(_row_of['Hong Kong'], vectorizer, tfidf_vectors)
# IRAN
iran_kw = extract_df_idf(_row_of['Iran'], vectorizer, tfidf_vectors)

In [104]:
def clean_kw(kw_lists):
    """Drop every keyword that occurs in more than one of the given lists.

    Parameters
    ----------
    kw_lists : iterable of iterables
        One keyword collection per group. Any iterable works (lists, pandas
        Index objects, generators); the inputs are never mutated.

    Returns
    -------
    list of list
        One list per input collection, in the same order, containing only the
        keywords that appear in no other collection. The relative order of the
        surviving keywords, and repeats within a single collection, are kept.
    """
    # Materialize once so one-shot iterables can be traversed twice below.
    materialized = [list(kw_list) for kw_list in kw_lists]

    # A keyword is "shared" as soon as it shows up in a second list; a single
    # pass with a running "seen" set replaces the pairwise intersections.
    seen = set()
    shared = set()
    for kw_list in materialized:
        unique = set(kw_list)
        shared |= unique & seen
        seen |= unique

    # Filtering avoids the quadratic cost of repeated list.remove() calls.
    return [[kw for kw in kw_list if kw not in shared] for kw_list in materialized]
    

In [109]:
# Keep the num_kw highest-scoring terms of each location, then discard any
# term that is shared between locations.
num_kw = 200
cleaned_lists = clean_kw([
    kw_frame.index[:num_kw]
    for kw_frame in (venezuela_kw, france_kw, hong_kong_kw, iran_kw)
])

In [110]:
# One line per location (same order as cleaned_lists), keywords comma-separated.
# The encoding is pinned so keywords containing non-ASCII characters are written
# identically on every platform instead of depending on the locale default.
with open('cleaned_kw_lists.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(', '.join(kw_list) + '\n' for kw_list in cleaned_lists)

In [112]:
# Keyword lists, one per location.
# NOTE(review): presumably hand-picked from cleaned_kw_lists.txt — confirm.
reviewed_kw = [
    # Venezuela
    [
        'maduro', 'guaido', 'venezuela', 'caracas', 'venezuelan', 'juan',
        'venezuelans', 'nicolas', 'lopez', 'chavez', 'cabello', 'leopoldo',
        'diosdado', 'hyperinflation', 'latin', 'shortages', 'cucuta', 'hugo',
        'padrino',
    ],
    # France
    [
        'paris', 'macron', 'yellow', 'french', 'france', 'champs', 'elysees',
        'vest', 'vests', 'arc', 'triomphe', 'emmanuel', 'avenue', 'christophe',
        'castaner', 'jackets', 'philippe', 'elysee', 'marseille', 'michel',
        'edouard',
    ],
    # Hong Kong
    [
        'hong', 'kong', 'china', 'lam', 'chinese', 'extradition', 'beijing',
        'mainland', 'carrie', 'wong', 'long', 'yuen', 'cheung', 'chan',
        'communist', 'xi', 'kowloon', 'xinjiang', 'kongers',
    ],
    # Iran
    [
        'iran', 'iranian', 'tehran', 'soleimani', 'khamenei', 'irgc', 'ali',
        'ayatollah', 'fadavi', 'drone', 'iranians', 'irbil', 'erbil',
        'rouhani', 'mousavi', 'khuzestan', 'persian', 'zarif',
    ],
]