# Keyword extraction algorithm for claims

In [1]:
# Reload imported modules before each cell runs, so edits to the local helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import os, json
from helper_function import count_valid_posts, get_claims, get_premises
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [3]:
# Path of the annotated-posts JSON file to analyse. Exactly one assignment is
# active; the other datasets are kept as commented alternatives so switching
# is a one-line change instead of relying on "last assignment wins".
# data_file_path = '../data/v4/Should-I-invest-in-Bitcoin_with_labels_v4_70_filled.json'
# data_file_path = '../data/v4/Would-you-get-into-a-self-driving-car_v4_70_filled.json'
data_file_path = "../data/v5/bitcoin-invest-aligned.json"
# data_file_path = "../data/v5/auto-driving-aligned.json"

# Read explicitly as UTF-8: the claim texts contain non-ASCII characters
# (e.g. the typographic apostrophe in "It’s"), and open() would otherwise use
# the platform's locale encoding, which can silently mis-decode them.
with open(data_file_path, 'r', encoding='utf-8') as f:
    file_content = json.load(f)

# Report what count_valid_posts returns for the loaded file, then collect the
# claims and premises with the helpers' default return format.
print('Get valid posts: ', count_valid_posts(file_content))
claims = get_claims(file_content)
premises = get_premises(file_content)


Get valid posts:  (41, 41)


## Keyword extraction algorithms

In [4]:
# from keyword_extractor import TfIdfExtractor

# if_idf = TfIdfExtractor()

In [5]:
# if_idf.extract_keywords(premises)

## KeyBERT Extractor

In [6]:
from keyword_extractor import KeyBERTExtractor

In [7]:
# Single extractor instance, created with default settings and reused by the
# keyword-extraction cells below.
keybert_extractor = KeyBERTExtractor()

In [8]:
# keybert_extractor.extract_keywords(premises)

## Extract keywords for each claim center

In [9]:
# Step 1: map each claim text to the premise texts that support it.
# With return_list_of_strings=False the helper yields lists of dict objects;
# only the 'supportClaim' and 'content' fields are read here.
claims_with_support = {}
for premise_group in get_premises(file_content, return_list_of_strings=False):
    for premise in premise_group:
        supported_claim, premise_text = premise['supportClaim'], premise['content']
        claims_with_support.setdefault(supported_claim, []).append(premise_text)

# Step 2: map each claim center to the claim texts attached to it
# (fields read: 'content' and 'claimCenter').
claim_center_relation = {}
for claim_group in get_claims(file_content, return_list_of_strings=False):
    for claim in claim_group:
        claim_text, center = claim['content'], claim['claimCenter']
        claim_center_relation.setdefault(center, []).append(claim_text)

# Step 3: for every claim center, concatenate the premises of all its claims
# in claim order. Claims without any supporting premise contribute nothing,
# so a center may end up with an empty list.
claimCenter_premise = {
    center: [
        premise_text
        for claim_text in center_claims
        if claim_text in claims_with_support
        for premise_text in claims_with_support[claim_text]
    ]
    for center, center_claims in claim_center_relation.items()
}


In [10]:
# Show how many supporting premises were collected for each claim center.
for center in claimCenter_premise:
    premise_count = len(claimCenter_premise[center])
    print(center, ": ", premise_count)

It’s not too late to invest :  15
Bitcoin is like digital gold :  15
Invest in Bitcoin, only if you are okay to loss all. :  25
Bitcoin is a highly risky investment and not fit for everyone. :  17
If you're wondering whether you should invest in Bitcoin, the short answer is yes. :  12
Yes, you should. :  5
Think carefully about why you want to invest in cryptocurrency before you do so. :  11
Bitcoin makes cross-border payments possible, and also provides an easy way for people to escape failed government monetary policy :  6
It should be used to determine the trend: up, down, neutral :  2
This is massive :  0
The significant thing is to do your own research and comprehend the dangers :  5
It is almost certainly in a bubble. :  10
You can see the gains! :  1
this is your opportunity to learn from someone with over a decade of real world experience :  0


In [11]:
# One record per claim center: the center text plus the keywords extracted
# from all of its premises joined into a single space-separated document
# (a center without premises is passed as an empty string).
# Keywords are ordered by their second element (the score), highest first.
# NOTE(review): keep_term=10 presumably caps the number of returned keywords;
# confirm against KeyBERTExtractor.extract_keywords.
output_data = [
    {
        'content': center,
        'keywords': sorted(
            keybert_extractor.extract_keywords(' '.join(center_premises), keep_term=10),
            key=lambda scored_term: scored_term[1],
            reverse=True,
        ),
    }
    for center, center_premises in claimCenter_premise.items()
]

In [12]:
# claimCenter_premise['It is almost certainly in a bubble.']


In [13]:
# x = keybert_extractor.extract_keywords(' '.join(v), keep_term=15)
# x.sort(key=lambda a : a[1])
# x


In [14]:
# Destination of the per-claim-center keyword records; the commented line is
# the equivalent target for the other dataset.
output_data_path = '../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/keywords.json'
# output_data_path = '../data/newest_data(rolling update)/claim center and statistics/automos driving/keywords.json'

# Serialise the records as JSON with a 4-space indent.
with open(output_data_path, 'w') as keywords_file:
    json.dump(output_data, keywords_file, indent=4)


In [15]:
# keybert_extractor.extract_keywords(" ".join(claimCenter_premise['That’s up to you.']), keep_term=10)


## Reformat keywords by stance

In [16]:
# sentence_sentiment = {
#     'neutral' : [
#         "It’s not too late to invest.",
#         "That’s up to you.",
#         "It depends what your level of disposable income is, how great your assets are, and what other assets you have invested in.",
#         "The significant thing is to do your own research and comprehend the dangers.",
#         "Invest in Bitcoin, only if you are okay to loss all.",
#         "Investing in Bitcoin is viable option especially in a view of current decline of the power of Fiat currencies.",
#         "If you are willing to take the risk, first make sure you understand what you are investing in and have a crypto investment strategy",
#     ],
#     "positive" : [
#         "I would say YES!",
#         "Of course you should",
#     ],
#     "negative" : [
#         "Bitcoin is pretty useless. But so is gold.",
#         "Cryto currency is an extremely high-hazard venture, and CFDs bought on margin are significantly more hazardous.",
#         "It is almost certainly in a bubble.",
#     ]
# }

# with open('../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/stance_count.json', 'w') as f:
#     json.dump(sentence_sentiment, f, indent=4)

# Stance file: a JSON object whose 'neutral' / 'positive' / 'negative' entries
# list claim-center texts. The commented line is the path for the other dataset.
sentence_sentiment_path = '../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/stance_count.json'
# sentence_sentiment_path = '../data/newest_data(rolling update)/claim center and statistics/automos driving/stance_count.json'

# Decode explicitly as UTF-8 so a hand-edited file containing raw non-ASCII
# text (e.g. "It’s") is read the same way on every platform.
with open(sentence_sentiment_path, 'r', encoding='utf-8') as f:
    sentence_sentiment = json.load(f)

# Sanity check: print every claim center that received keywords but is not
# assigned to any of the three stances. No output means full coverage.
for i in output_data:
    key = i['content']
    if not any(key in sentence_sentiment[stance] for stance in ('neutral', 'positive', 'negative')):
        print(key)


In [17]:
# Regroup the extracted keywords by stance:
#   {stance: {'claim_<position in the stance list>': keywords}}
formated_output = {
    'neutral' : {},
    'positive' : {},
    'negative' : {}
}

# Lookup from claim-center text to its keywords. setdefault keeps the first
# record if a text ever repeats, matching a first-match linear search.
keywords_by_content = {}
for record in output_data:
    keywords_by_content.setdefault(record['content'], record['keywords'])

for key, value in sentence_sentiment.items():
    for index, cs in enumerate(value):
        # A stance entry with no extracted keywords used to fall back to
        # index -1, silently copying the LAST claim center's keywords into
        # this slot. Fail loudly instead so mismatched texts get fixed.
        if cs not in keywords_by_content:
            raise KeyError(
                f"No keywords found for {key} claim center {cs!r}; "
                "the stance file and the extracted claim centers are out of sync"
            )
        formated_output[key][f'claim_{index}'] = keywords_by_content[cs]

In [18]:
# Display the stance-grouped keywords for visual inspection.
formated_output

{'neutral': {'claim_0': [('bitcoin', 0.5353),
   ('investing', 0.4592),
   ('liquidity', 0.3268),
   ('opportunities', 0.2963),
   ('launch', 0.2276),
   ('date', 0.222),
   ('rethink', 0.1615),
   ('volatile', 0.141),
   ('know', 0.1385),
   ('vessel', 0.1219)],
  'claim_1': [('bitcoin', 0.5359),
   ('inflationary', 0.4149),
   ('investment', 0.3422),
   ('scarcity', 0.3325),
   ('volatility', 0.2618),
   ('unpredictable', 0.2252),
   ('price', 0.2198),
   ('argue', 0.2024),
   ('2017', 0.2002),
   ('concept', 0.168)],
  'claim_2': [('futures', 0.4913),
   ('upside', 0.4035),
   ('forecasters', 0.3118),
   ('bitcoin', 0.2978),
   ('price', 0.1817),
   ('potential', 0.1637),
   ('reason', 0.1472),
   ('realistic', 0.1218),
   ('crash', 0.1109),
   ('first', 0.0378)],
  'claim_3': [('investing', 0.4438),
   ('coin', 0.442),
   ('hub', 0.3013),
   ('vest', 0.2871),
   ('lose', 0.1881),
   ('rise', 0.1653),
   ('exciting', 0.1541),
   ('class', 0.1508),
   ('varied', 0.1172),
   ('never',

In [19]:
# Total number of supporting premises per stance: for each stance, add up the
# premise counts of all claim centers listed under it.
support_statistic = {
    stance: sum(len(claimCenter_premise[center]) for center in centers)
    for stance, centers in sentence_sentiment.items()
}
support_statistic

{'positive': 12, 'neutral': 85, 'negative': 27}

In [20]:
from datetime import date

# Month-and-day stamp (MMDD, zero-padded) of the current local date; it is
# embedded in the name of the formatted output file.
today = date.today()
d1 = format(today, '%m%d')
d1

'0830'

In [21]:
# Date-stamped destination of the stance-grouped keywords; the commented line
# is the equivalent target for the other dataset.
formated_output_path = f'../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/keywords_formatted_{d1}_1.json'
# formated_output_path = f'../data/newest_data(rolling update)/claim center and statistics/automos driving/keywords_formatted_{d1}_1.json'

# Serialise the grouped keywords as JSON with a 4-space indent.
with open(formated_output_path, 'w') as formatted_file:
    json.dump(formated_output, formatted_file, indent=4)
