# Keyword extraction algorithm for claims

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [3]:
import os, json
from helper_function import count_valid_posts, get_claims, get_premises
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [4]:
# Load the labelled posts. The file contains non-ASCII text (e.g. curly
# quotes/apostrophes), so the encoding is pinned to UTF-8 instead of relying
# on the platform's default locale encoding.
with open('../data/v4/Should-I-invest-in-Bitcoin_with_labels_v4_70_filled.json', 'r', encoding='utf-8') as f:
    file_content = json.load(f)

# Report the result of the project helper count_valid_posts for this file.
print('Get valid posts: ', count_valid_posts(file_content))

# Claims and premises as returned by the project helpers with default options.
claims = get_claims(file_content)
premises = get_premises(file_content)


Get valid posts:  (41, 70)


## Keyword extraction algorithms

In [5]:
# from keyword_extractor import TfIdfExtractor

# if_idf = TfIdfExtractor()

In [6]:
# if_idf.extract_keywords(premises)

## KeyBERT Extractor

In [7]:
from keyword_extractor import KeyBERTExtractor

In [8]:
# Instantiate the project's KeyBERTExtractor (imported from keyword_extractor)
# with its default arguments; reused by the extraction cells below.
keybert_extractor = KeyBERTExtractor()

In [9]:
# keybert_extractor.extract_keywords(premises)

## Extract keywords for each claim center

In [10]:
# Step 1: claim text -> list of premise texts that support it.
claims_with_support = {}
for post_premises in get_premises(file_content, return_list_of_strings=False):
    # each item yielded here is a list of premise dicts
    for premise in post_premises:
        supported_claim = premise['supportClaim']
        premise_text = premise['content']
        claims_with_support.setdefault(supported_claim, []).append(premise_text)

# Step 2: claim center -> list of claim texts attached to that center.
claim_center_relation = {}
for post_claims in get_claims(file_content, return_list_of_strings=False):
    # each item yielded here is a list of claim dicts
    for claim in post_claims:
        claim_text = claim['content']
        center = claim['claimCenter']
        claim_center_relation.setdefault(center, []).append(claim_text)

# Step 3: claim center -> all premises supporting any of its claims.
# Centers whose claims have no supporting premises map to an empty list.
claimCenter_premise = {
    center: [
        premise_text
        for claim_text in center_claims
        for premise_text in claims_with_support.get(claim_text, [])
    ]
    for center, center_claims in claim_center_relation.items()
}


In [11]:
# For every claim center, run keyword extraction over the concatenation of its
# supporting premises and store the (keyword, score) pairs ordered from the
# highest score to the lowest.
output_data = []
for center, supporting_premises in claimCenter_premise.items():
    merged_text = ' '.join(supporting_premises)
    ranked_keywords = sorted(
        keybert_extractor.extract_keywords(merged_text, keep_term=10),
        key=lambda pair: pair[1],
        reverse=True,
    )
    output_data.append({'content': center, 'keywords': ranked_keywords})

In [12]:
# Inspect the premises collected for one specific claim center.
claimCenter_premise['It is almost certainly in a bubble.']


['One of the proof cryptocurrencies are not ready yet, is the extremely fluctuating value that solely driven by “demand” and “potential”, instead of real-world implementation.']

In [13]:
# x = keybert_extractor.extract_keywords(' '.join(v), keep_term=15)
# x.sort(key=lambda a : a[1])
# x


In [14]:
# Persist the per-claim-center keyword list as pretty-printed JSON.
keywords_path = '../data/newest_data(rolling update)/claim center and statistics/keywords.json'
with open(keywords_path, 'w') as out_file:
    json.dump(output_data, out_file, indent=4)


In [15]:
# Spot-check the extractor on a single claim center. The result is shown in
# the order the extractor returns it (this cell does not sort by score).
# NOTE(review): keep_term=10 presumably caps the number of keywords - confirm
# against keyword_extractor.
keybert_extractor.extract_keywords(" ".join(claimCenter_premise['That’s up to you.']), keep_term=10)


[('cryptocurrencies', 0.5732),
 ('diversified', 0.3917),
 ('minority', 0.2698),
 ('portfolio', 0.4366),
 ('risky', 0.2879),
 ('position', 0.1183),
 ('generally', 0.169),
 ('bitcoin', 0.4283),
 ('assets', 0.3653),
 ('well', 0.0483)]

## Reformat: group keywords by sentiment

In [21]:
# Hand-assigned sentiment label for claim-center sentences.
# Each sentence is compared by exact string equality against the 'content'
# field of the entries in output_data, so the text (including typographic
# apostrophes and original spelling) must not be edited here.
sentence_sentiment = {
    'neutral' : [
        "It’s not too late to invest.",
        "That’s up to you.",
        "It depends what your level of disposable income is, how great your assets are, and what other assets you have invested in.",
        "The significant thing is to do your own research and comprehend the dangers.",
        "Invest in Bitcoin, only if you are okay to loss all.",
        "Investing in Bitcoin is viable option especially in a view of current decline of the power of Fiat currencies.",
        "If you are willing to take the risk, first make sure you understand what you are investing in and have a crypto investment strategy",
    ],
    "positive" : [
        "I would say YES!",
        "Of course you should",
    ],
    "negative" : [
        "Bitcoin is pretty useless. But so is gold.",
        "Cryto currency is an extremely high-hazard venture, and CFDs bought on margin are significantly more hazardous.",
        "It is almost certainly in a bubble.",
    ]
}

# Sanity check: print every claim center that has extracted keywords but has
# not been given a sentiment label (prints nothing when all are labelled).
labelled_sentences = {
    sentence
    for sentiment in ('neutral', 'positive', 'negative')
    for sentence in sentence_sentiment[sentiment]
}
for entry in output_data:
    if entry['content'] not in labelled_sentences:
        print(entry['content'])


In [29]:
# Regroup the extracted keywords by sentiment:
#   {sentiment: {'claim_<position in sentence_sentiment[sentiment]>': keywords}}
formated_output = {
    'neutral' : {},
    'positive' : {},
    'negative' : {}
}

# Index the extraction results by claim-center text. setdefault keeps the
# first entry if the same content ever appears more than once.
keywords_by_content = {}
for entry in output_data:
    keywords_by_content.setdefault(entry['content'], entry['keywords'])

for key, value in sentence_sentiment.items():
    for index, cs in enumerate(value):
        # Fail loudly when a labelled sentence has no extraction result.
        # A "-1" not-found sentinel must not be used as a list index: it is a
        # valid index and would silently pick the LAST entry's keywords.
        if cs not in keywords_by_content:
            raise KeyError(f'No extracted keywords found for claim center: {cs!r}')
        formated_output[key][f'claim_{index}'] = keywords_by_content[cs]

In [30]:
# Display the keywords regrouped by sentiment.
formated_output

{'neutral': {'claim_0': [('bitcoin', 0.5662),
   ('investing', 0.4897),
   ('liquidity', 0.341),
   ('inflation', 0.3356),
   ('opportunity', 0.275),
   ('date', 0.1964),
   ('rethink', 0.1926),
   ('open', 0.1817),
   ('know', 0.1504),
   ('everything', 0.1037)],
  'claim_1': [('cryptocurrencies', 0.5732),
   ('portfolio', 0.4366),
   ('bitcoin', 0.4283),
   ('diversified', 0.3917),
   ('assets', 0.3653),
   ('risky', 0.2879),
   ('minority', 0.2698),
   ('generally', 0.169),
   ('position', 0.1183),
   ('well', 0.0483)],
  'claim_2': [('bitcoin', 0.5631),
   ('subjective', 0.3805),
   ('investment', 0.3733),
   ('value', 0.3442),
   ('gamble', 0.3276),
   ('volatile', 0.2612),
   ('risky', 0.2045),
   ('class', 0.1881),
   ('largely', 0.1521),
   ('forces', 0.0714)],
  'claim_3': [('investors', 0.488),
   ('cash', 0.4217),
   ('smart', 0.3561),
   ('encourage', 0.2686),
   ('lose', 0.2541),
   ('chance', 0.2214),
   ('consistently', 0.1785),
   ('stories', 0.1726),
   ('never', 0.152

In [35]:
# Persist the sentiment-grouped keywords as pretty-printed JSON.
formatted_path = '../data/newest_data(rolling update)/claim center and statistics/keywords_formatted_0814_1.json'
with open(formatted_path, 'w') as out_file:
    json.dump(formated_output, out_file, indent=4)
