# Keyword extraction algorithm for claims

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2


In [2]:
import os, json
from helper_function import count_valid_posts, get_claims, get_premises
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [3]:
# Input dataset for this run. Only one path is assigned so it is obvious which
# file is actually loaded. Alternative input (swap the literal to switch topic):
#   '../data/v4/Should-I-invest-in-Bitcoin_with_labels_v4_70_filled.json'
data_file_path = '../data/v4/Would-you-get-into-a-self-driving-car_v4_70_filled.json'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(data_file_path, 'r', encoding='utf-8') as f:
    file_content = json.load(f)

# Report the valid-post count, then pull out the claims and premises that the
# rest of the notebook works on.
print('Get valid posts: ', count_valid_posts(file_content))
claims = get_claims(file_content)
premises = get_premises(file_content)


Get valid posts:  (67, 78)


## Keyword extraction algorithms

In [4]:
# from keyword_extractor import TfIdfExtractor

# if_idf = TfIdfExtractor()

In [5]:
# if_idf.extract_keywords(premises)

## KeyBERT Extractor

In [6]:
from keyword_extractor import KeyBERTExtractor

In [7]:
# Create one KeyBERTExtractor (imported from keyword_extractor) with default
# arguments; the same instance is reused for every extraction below.
keybert_extractor = KeyBERTExtractor()

In [8]:
# keybert_extractor.extract_keywords(premises)

## Extract keywords for each claim center

In [9]:
# Step 1: map each claim's text to the contents of the premises supporting it.
claims_with_support = {}
for premise_group in get_premises(file_content, return_list_of_strings=False):
    # each group is a list of premise dicts
    for premise in premise_group:
        supported_claim = premise['supportClaim']
        premise_text = premise['content']
        claims_with_support.setdefault(supported_claim, []).append(premise_text)

# Step 2: map each claim center to the texts of the claims grouped under it.
claim_center_relation = {}
for claim_group in get_claims(file_content, return_list_of_strings=False):
    # each group is a list of claim dicts
    for claim in claim_group:
        claim_text = claim['content']
        center = claim['claimCenter']
        claim_center_relation.setdefault(center, []).append(claim_text)

# Step 3: for every claim center, gather the premises of all of its claims.
# Claims that have no supporting premise simply contribute nothing.
claimCenter_premise = {
    center: [
        premise_text
        for claim_text in grouped_claims
        for premise_text in claims_with_support.get(claim_text, [])
    ]
    for center, grouped_claims in claim_center_relation.items()
}


In [10]:
# Show how many supporting premises were gathered for each claim center.
for center, supporting_premises in claimCenter_premise.items():
    print(center, ": ", len(supporting_premises))

Yes, I would. :  12
Not at the time of answering. :  8
Yes - in a few years it might be safer than me driving. :  9
Yes, I really would like to have a self-driving car. :  33
Yes. :  13
Yes, as soon as they work and are available at a reasonable price. :  6
No. It is too early in the technology to be able to trust the self-driving car. :  5
I love to drive on a nice day with little or no traffic. At all other times, I'll have my car drive for me. :  9
In the case I've described two paragraphs above, I'm really lost as to what fact based evidence could be used to chose the human. :  6
Nope. Absolutely not! :  8


In [11]:
# For every claim center, join its premises into one document, extract up to
# 10 keywords from it, and store them ordered by descending score (second
# element of each keyword entry).
output_data = []
for center, supporting_premises in claimCenter_premise.items():
    merged_text = ' '.join(supporting_premises)
    ranked_keywords = sorted(
        keybert_extractor.extract_keywords(merged_text, keep_term=10),
        key=lambda entry: entry[1],
        reverse=True,
    )
    output_data.append({
        'content' : center,
        'keywords' : ranked_keywords
    })

In [13]:
# claimCenter_premise['It is almost certainly in a bubble.']


In [14]:
# x = keybert_extractor.extract_keywords(' '.join(v), keep_term=15)
# x.sort(key=lambda a : a[1])
# x


In [15]:
# Destination for the per-claim-center keywords. Only one path is assigned so
# it is obvious which file gets written; keep it in step with the dataset
# loaded at the top of the notebook. Alternative output (bitcoin dataset):
#   '../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/keywords.json'
output_data_path = '../data/newest_data(rolling update)/claim center and statistics/automos driving/keywords.json'

# Write with an explicit encoding so the file does not depend on the
# platform's default text encoding.
with open(output_data_path, 'w', encoding='utf-8') as f:
    json.dump(output_data, fp=f, indent=4)


In [None]:
# keybert_extractor.extract_keywords(" ".join(claimCenter_premise['That’s up to you.']), keep_term=10)


## Reformat keywords by stance

In [18]:
# sentence_sentiment = {
#     'neutral' : [
#         "It’s not too late to invest.",
#         "That’s up to you.",
#         "It depends what your level of disposable income is, how great your assets are, and what other assets you have invested in.",
#         "The significant thing is to do your own research and comprehend the dangers.",
#         "Invest in Bitcoin, only if you are okay to loss all.",
#         "Investing in Bitcoin is viable option especially in a view of current decline of the power of Fiat currencies.",
#         "If you are willing to take the risk, first make sure you understand what you are investing in and have a crypto investment strategy",
#     ],
#     "positive" : [
#         "I would say YES!",
#         "Of course you should",
#     ],
#     "negative" : [
#         "Bitcoin is pretty useless. But so is gold.",
#         "Cryto currency is an extremely high-hazard venture, and CFDs bought on margin are significantly more hazardous.",
#         "It is almost certainly in a bubble.",
#     ]
# }

# with open('../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/stance_count.json', 'w') as f:
#     json.dump(sentence_sentiment, f, indent=4)

# Stance file: maps 'neutral' / 'positive' / 'negative' to lists of claim-center
# texts. Only one path is assigned so it is obvious which file is read; keep it
# in step with the dataset loaded at the top of the notebook.
# Alternative input (bitcoin dataset):
#   '../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/stance_count.json'
sentence_sentiment_path = '../data/newest_data(rolling update)/claim center and statistics/automos driving/stance_count.json'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(sentence_sentiment_path, 'r', encoding='utf-8') as f:
    sentence_sentiment = json.load(f)

# Sanity check: print every claim center that has keywords but is not listed
# under any of the three stances. No output means every one is covered.
for item in output_data:
    claim_center = item['content']
    if (claim_center not in sentence_sentiment['neutral']
            and claim_center not in sentence_sentiment['positive']
            and claim_center not in sentence_sentiment['negative']):
        print(claim_center)


In [19]:
# Regroup the extracted keywords by stance:
#   {stance: {'claim_<position in the stance list>': keywords}}
formated_output = {
    'neutral' : {},
    'positive' : {},
    'negative' : {}
}

# Index the keywords by claim-center text once instead of scanning output_data
# for every claim. setdefault keeps the first entry if a text appears twice,
# which matches a first-match search.
keywords_by_claim = {}
for entry in output_data:
    keywords_by_claim.setdefault(entry['content'], entry['keywords'])

for key, value in sentence_sentiment.items():
    for index, cs in enumerate(value):
        # Fail loudly when a claim center has no extracted keywords. The previous
        # "-1 if not found" sentinel was used as a list index and therefore
        # silently copied the keywords of the last entry in output_data.
        if cs not in keywords_by_claim:
            raise KeyError(
                f'No keywords were extracted for claim center {cs!r} (stance {key!r})'
            )
        formated_output[key][f'claim_{index}'] = keywords_by_claim[cs]

In [20]:
# Display the stance-grouped keyword mapping as the cell output for a visual check.
formated_output

{'neutral': {'claim_0': [('driverless', 0.5171),
   ('driving', 0.4749),
   ('uber', 0.3602),
   ('traffic', 0.2889),
   ('self', 0.2729),
   ('safer', 0.2714),
   ('blind', 0.2163),
   ('guide', 0.1279),
   ('crashes', 0.111),
   ('average', 0.0119)],
  'claim_1': [('driving', 0.4354),
   ('parking', 0.38),
   ('commuting', 0.3387),
   ('self', 0.2139),
   ('expensive', 0.1889),
   ('cab', 0.1598),
   ('unmanned', 0.1371),
   ('san', 0.1347),
   ('cool', 0.0521),
   ('never', 0.0418)],
  'claim_2': [('driving', 0.4886),
   ('self', 0.3348),
   ('legal', 0.1524),
   ('safe', 0.1274),
   ('urban', 0.11),
   ('handle', 0.1008),
   ('meetings', 0.0982),
   ('emails', 0.0896),
   ('would', 0.0499),
   ('liabilities', 0.0466)],
  'claim_3': [('automobiles', 0.4353),
   ('autopilot', 0.3282),
   ('dangers', 0.3203),
   ('inattentive', 0.3114),
   ('maintenance', 0.2775),
   ('lifetimes', 0.2632),
   ('dystopia', 0.2321),
   ('mirror', 0.1936),
   ('crashes', 0.1684),
   ('react', 0.1157)]},


In [21]:
# Total number of supporting premises per stance: add up the premise counts of
# every claim center listed under that stance.
support_statistic = {
    stance: sum(len(claimCenter_premise[center]) for center in centers)
    for stance, centers in sentence_sentiment.items()
}
support_statistic

{'positive': 30, 'negative': 19, 'neutral': 60}

In [25]:
from datetime import date

# Zero-padded month+day stamp of the current date (e.g. '0820'), used to tag
# the output file name.
today = date.today()
d1 = format(today, '%m%d')
d1

'0820'

In [28]:
# Destination for the stance-grouped keywords, tagged with the month-day stamp
# d1. Only one path is assigned so it is obvious which file gets written; keep
# it in step with the dataset loaded at the top of the notebook.
# Alternative output (bitcoin dataset):
#   f'../data/newest_data(rolling update)/claim center and statistics/bitcoin investment/keywords_formatted_{d1}_1.json'
formated_output_path = f'../data/newest_data(rolling update)/claim center and statistics/automos driving/keywords_formatted_{d1}_1.json'

# Write with an explicit encoding so the file does not depend on the
# platform's default text encoding.
with open(formated_output_path, 'w', encoding='utf-8') as f:
    json.dump(formated_output, fp=f, indent=4)
