# Keyword extraction for claims and their supporting premises

In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os, json
from helper_function import count_valid_posts, get_claims, get_premises
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [3]:
# Load the labelled thread and pull out its claims and premises.
# The encoding is pinned to UTF-8 because the labels contain non-ASCII text
# (e.g. typographic apostrophes), which a platform-default codec may fail to
# decode or may decode incorrectly.
with open('../data/v4/Should-I-invest-in-Bitcoin_with_labels_v4_70_filled.json', 'r', encoding='utf-8') as f:
    file_content = json.load(f)
# Sanity check on the loaded data before any extraction is attempted.
print('Get valid posts: ', count_valid_posts(file_content))
claims = get_claims(file_content)
premises = get_premises(file_content)


Get valid posts:  (41, 70)


## Keyword extraction algorithms

In [4]:
# from keyword_extractor import TfIdfExtractor

# if_idf = TfIdfExtractor()

In [5]:
# if_idf.extract_keywords(premises)

## KeyBERT Extractor

In [6]:
from keyword_extractor import KeyBERTExtractor

In [7]:
# Instantiate the project's KeyBERTExtractor (imported from keyword_extractor)
# once; the cells below reuse this instance for every extract_keywords call.
keybert_extractor = KeyBERTExtractor()

In [8]:
# keybert_extractor.extract_keywords(premises)

## Extract keywords from the premises under each claim center

In [9]:
# Step 1: index premises by the claim they support.
claims_with_support = {}
for premise_group in get_premises(file_content, return_list_of_strings=False):
    # every group is a list of premise dicts
    for premise in premise_group:
        supported_claim = premise['supportClaim']
        premise_text = premise['content']
        claims_with_support.setdefault(supported_claim, []).append(premise_text)

# Step 2: index claims by the claim center they are attached to.
claim_center_relation = {}
for claim_group in get_claims(file_content, return_list_of_strings=False):
    # every group is a list of claim dicts
    for claim in claim_group:
        claim_text = claim['content']
        center = claim['claimCenter']
        claim_center_relation.setdefault(center, []).append(claim_text)

# Step 3: join the two indexes so each claim center maps to the premises of
# all of its claims. Centers whose claims have no premises keep an empty list.
claimCenter_premise = {
    center: [
        supporting_premise
        for member_claim in member_claims
        if member_claim in claims_with_support
        for supporting_premise in claims_with_support[member_claim]
    ]
    for center, member_claims in claim_center_relation.items()
}


In [10]:
# Build one record per claim center: the center text plus the keywords
# extracted from its pooled premises, ranked from highest to lowest score.
output_data = []
for k, v in claimCenter_premise.items():
    premise_text = ' '.join(v)
    if premise_text.strip():
        # keep_term=10 presumably caps the number of keywords kept per center
        # (TODO confirm against KeyBERTExtractor.extract_keywords).
        kws = keybert_extractor.extract_keywords(premise_text, keep_term=10)
        # Each item is indexed as (keyword, score); sorted() returns a new list
        # so the extractor's own return value is never mutated in place.
        kws = sorted(kws, key=lambda a: a[1], reverse=True)
    else:
        # A center can have no supporting premises at all; there is nothing to
        # extract from an empty document, so skip the model call and keep an
        # explicit empty keyword list for that center.
        kws = []
    term = {
        'content': k,
        'keywords': kws,
    }
    output_data.append(term)

In [21]:
# Spot check: the premises pooled for a single claim center. In the recorded
# run this center has none, so the cell displays an empty list.
claimCenter_premise['It is almost certainly in a bubble.']


[]

In [12]:
# x = keybert_extractor.extract_keywords(' '.join(v), keep_term=15)
# x.sort(key=lambda a : a[1])
# x


In [13]:
# Persist the per-center keyword records as indented JSON.
keywords_path = '../data/newest_data(rolling update)/claim center and statistics/keywords.json'
with open(keywords_path, 'w') as keywords_file:
    json.dump(output_data, keywords_file, indent=4)


In [20]:
# Spot check: run the extractor on the pooled premises of one claim center and
# display the raw (keyword, score) pairs it returns.
spot_check_text = " ".join(claimCenter_premise['That’s up to you.'])
keybert_extractor.extract_keywords(spot_check_text, keep_term=10)


[('cryptocurrencies', 0.5732),
 ('diversified', 0.3917),
 ('minority', 0.2698),
 ('portfolio', 0.4366),
 ('risky', 0.2879),
 ('position', 0.1183),
 ('generally', 0.169),
 ('bitcoin', 0.4283),
 ('assets', 0.3653),
 ('well', 0.0483)]