In [None]:
import json
import yaml
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the two input tables from the working directory.
# Later cells read the "Classification Keywords", "GSRank", "Unnamed: 2" and
# "Unnamed: 4" columns from df_yes, so "Yes.csv" must provide them.
df_yes = pd.read_csv("Yes.csv")
# NOTE(review): df_discussion is not referenced again in the visible cells.
df_discussion = pd.read_csv("Discussion.csv")

In [None]:
def fix_formatting(s):
    """Turn a comma-separated keyword string into a sorted list of tokens.

    Each token is lower-cased and stripped of surrounding whitespace.

    Args:
        s: Raw cell value. Expected to be a string such as ``"B, a ,C"``.
            Non-string values (e.g. the float NaN that pandas uses for an
            empty CSV cell, or None) are treated as "no keywords".

    Returns:
        list[str]: Sorted, lower-cased, non-empty tokens. Empty list when
        ``s`` is not a string or contains no keywords.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); treat any
    # non-string value as an empty keyword list instead of raising.
    if not isinstance(s, str):
        return []
    tokens = [t.strip() for t in s.lower().split(",")]
    # Trailing or doubled commas ("a,,b,") yield empty strings; drop them so
    # they are not counted as a keyword downstream.
    return sorted(t for t in tokens if t)

# Normalise every row's raw keyword string into a sorted token list.
df_yes["tokens"] = df_yes["Classification Keywords"].apply(fix_formatting)
df_yes.head()

In [None]:
def tokens_to_dict(df, col):
    """Count how often each token occurs in a column of token lists.

    Args:
        df: DataFrame whose column ``col`` holds an iterable of tokens per row.
        col: Name of the column to scan.

    Returns:
        dict: token -> number of occurrences, inserted in descending count
        order. Tokens with equal counts keep their first-seen order because
        the sort is stable.
    """
    counts = {}
    for row_tokens in df[col]:
        for token in row_tokens:
            counts[token] = counts.get(token, 0) + 1

    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [None]:
# Tally the raw (pre-rewrite) keywords and store the counts as JSON.
kw_dict = tokens_to_dict(df_yes, "tokens")
with open("kw_dict.json", "w") as out_file:
    json.dump(kw_dict, out_file, indent=4)

In [None]:
# Read the keyword rewrite rules. fix_keywords expects a mapping whose values
# each carry a "source" and a "dest" entry.
# NOTE(review): yaml.load with FullLoader is only appropriate for trusted
# files; if kw_changes.yml can come from outside, consider yaml.safe_load.
with open("kw_changes.yml") as rules_file:
    kw_rules = yaml.load(rules_file, Loader=yaml.FullLoader)

def fix_keywords(tokens, kw_rules):
    """Rewrite keywords according to ``kw_rules`` and return a clean list.

    Rules are applied in the mapping's iteration order. For every rule, each
    keyword listed under ``"source"`` that is present in the token list is
    removed (all occurrences) and the rule's ``"dest"`` keywords are added.
    Rules whose ``"dest"`` is the string ``"to_delete"`` are skipped; this
    function only performs replacements.

    Args:
        tokens: Iterable of keyword strings for one row. It is not modified.
        kw_rules: Mapping of rule name -> ``{"source": [...], "dest": [...]}``.

    Returns:
        list[str]: Sorted list of unique keywords after all rewrites.

    Raises:
        KeyError: If a rule lacks the ``"source"`` or ``"dest"`` entry.
    """
    # Work on a copy: the input is a cell of the DataFrame, and mutating it
    # would silently rewrite the source column as well.
    result = list(tokens)
    for rule in kw_rules.values():
        dest = rule["dest"]
        source = rule["source"]
        if dest == "to_delete":
            continue
        for source_kw in source:
            if source_kw in result:
                # Build a new list instead of removing while iterating, which
                # skipped elements (adjacent duplicates survived) and never
                # terminated when dest contained source_kw itself.
                result = [t for t in result if t != source_kw]
                result.extend(dest)
    return sorted(set(result))

# Apply the rewrite rules to each row's token list.
df_yes["tokens_agg"] = df_yes["tokens"].apply(fix_keywords, args=(kw_rules,))

# Re-count keyword frequencies after the rewrite and store them as JSON.
kw_dict_fix = tokens_to_dict(df_yes, "tokens_agg")
with open("kw_dict_fix.json", "w") as out_file:
    json.dump(kw_dict_fix, out_file, indent=4)

In [None]:
# Preview the first rows of df_yes, which at this point also carries the
# "tokens" and "tokens_agg" columns added above.
df_yes.head()

In [None]:
# Keywords counted fewer than two times after the rewrite.
kw_low_freq = [kw for kw, count in kw_dict_fix.items() if count < 2]
len(kw_low_freq)

In [None]:
def check_delete(tokens, kw_low_freq):
    """Tell whether a row should be flagged for deletion.

    Args:
        tokens: Iterable of keyword strings for one row.
        kw_low_freq: Container of low-frequency keywords.

    Returns:
        bool: True if any token is in ``kw_low_freq`` or equals the literal
        ``"to_delete"``; False otherwise (including for an empty ``tokens``).
    """
    return any(tok in kw_low_freq or tok == "to_delete" for tok in tokens)

# Flag every row that contains at least one low-frequency or "to_delete" keyword.
df_yes["to_delete"] = df_yes["tokens_agg"].apply(check_delete, args=(kw_low_freq,))

# Count the flagged rows once and derive the remainder from it.
n_ignored = int((df_yes["to_delete"] == True).sum())
n_kept = len(df_yes) - n_ignored
print("This many would be ignored: {}".format(n_ignored))
print("This many would be kept: {}".format(n_kept))

In [None]:
# Keep only the rows that were not flagged for deletion.
keep_mask = df_yes["to_delete"] == False
df_keep = df_yes[keep_mask]
len(df_keep)

In [None]:
# Order the kept rows by the "GSRank" column and drop the helper/artifact columns.
# sort_values and drop both return new frames, so nothing is modified in place
# on df_keep, which is a filtered selection of df_yes; in-place sorting of such
# a selection triggers pandas' SettingWithCopyWarning.
df_keep = (
    df_keep
    .sort_values(by="GSRank")
    .drop(columns=["to_delete", "tokens", "Unnamed: 2", "Unnamed: 4"])
)
df_keep.head()

In [None]:
# Write the kept rows to disk. index=False is not passed, so the DataFrame
# index is written as an extra leading column.
df_keep.to_csv("ordered_articles.csv")

In [None]:
def remove_kw(tokens, kw_low_freq):
    """Return the tokens that are neither low-frequency nor ``"to_delete"``.

    Args:
        tokens: Iterable of keyword strings for one row. It is not modified.
        kw_low_freq: Container of low-frequency keywords to strip.

    Returns:
        list[str]: New list with the remaining tokens in their original order.
    """
    # Build a fresh list: removing from the list being iterated skips the
    # element after each removal (so consecutive low-frequency keywords
    # survived) and also altered the caller's list.
    return [t for t in tokens if t not in kw_low_freq and t != "to_delete"]

def check_delete_2(tokens):
    """Report whether a row has no keywords left.

    Args:
        tokens: Sized collection of the row's remaining keywords.

    Returns:
        bool: True when ``tokens`` is empty, False when it has any entry.
    """
    return not len(tokens) > 0

# Strip low-frequency / "to_delete" keywords from each row, then flag the rows
# that end up with no keywords at all.
df_yes["tokens_agg_2"] = df_yes["tokens_agg"].apply(remove_kw, args=(kw_low_freq,))
df_yes["to_delete_2"] = df_yes["tokens_agg_2"].apply(check_delete_2)

n_ignored_2 = int((df_yes["to_delete_2"] == True).sum())
print("This many would be ignored: {}".format(n_ignored_2))
print("This many would be kept: {}".format(len(df_yes) - n_ignored_2))