# Keyword extraction algorithm for claims

In [3]:
import os, json
from helper_function import count_valid_posts, get_claims, get_premises
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [4]:
# Path of the labelled posts file analysed in this notebook.
DATA_PATH = '../data/v3_70_filled/Should-I-invest-in-Bitcoin_with_labels_v3_70_filled.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) so that loading does
# not depend on the platform's locale default; the texts contain non-ASCII
# punctuation such as curly quotes.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    file_content = json.load(f)
print('Get valid posts: ', count_valid_posts(file_content))
# Pull the claim and premise texts out of the loaded structure.
claims = get_claims(file_content)
premises = get_premises(file_content)

Get valid posts:  (41, 70)


## Keyword extraction algorithms

In [19]:
## TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the premises, dropping NLTK's English stop words.
tf_idf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
vectors = tf_idf_vectorizer.fit_transform(premises)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations; `.tolist()`
# keeps `feature_names` a plain list of str, as before.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_idf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tf_idf_vectorizer.get_feature_names()

# Dense copy of the sparse result, used by the ranking cells below.
dense = vectors.todense()


In [20]:
# Inspect the dense TF-IDF matrix built from `vectors`.
dense

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# Column indices of each row ordered by ascending value, so the last column
# holds the index of that row's largest weight.
np.argsort(dense, axis=1)

matrix([[  0, 501, 502, ..., 238, 369, 140],
        [  0, 499, 500, ..., 531, 695, 702],
        [  0, 499, 500, ..., 268, 556, 494],
        ...,
        [  0, 500, 501, ..., 760, 559, 603],
        [  0, 502, 503, ..., 250, 760, 443],
        [  0, 501, 502, ..., 105, 730, 249]])

In [22]:
# Order every row's column indices by ascending weight, then walk backwards
# from the end to keep the three highest-weighted columns (largest first).
ranked_columns = np.argsort(dense, axis=1)
x = np.asarray(ranked_columns[:, :-4:-1])


In [23]:
# Translate each row of column indices in `x` into the corresponding
# vocabulary words from `feature_names`, one list of words per row.
m = [[feature_names[y] for y in i] for i in x]
m

[['closed', 'issue', 'everything'],
 ['understand', 'trouble', 'quickly'],
 ['phrase', 'research', 'focused'],
 ['short', 'term', 'longer'],
 ['sense', 'invest', 'jumped'],
 ['market', 'earn', 'studying'],
 ['gold', 'digital', 'like'],
 ['gold', 'long', 'less'],
 ['verified', 'addition', 'predictable'],
 ['created', 'bitcoins', 'see'],
 ['anywhere', 'world', 'account'],
 ['created', 'bitcoins', 'features'],
 ['failed', 'escape', 'border'],
 ['global', 'information', 'access'],
 ['market', 'price', 'investment'],
 ['hacking', 'exchanges', 'threat'],
 ['little', 'operates', 'stand'],
 ['besides', 'legitimate', 'accepted'],
 ['bankrupt', 'recover', 'corrupts'],
 ['useless', 'pretty', 'gold'],
 ['brand', 'gold', 'world'],
 ['world', 'one', 'fast'],
 ['banks', 'clients', 'hnw'],
 ['gold', 'cared', 'forget'],
 ['previous', 'upside', 'believe'],
 ['futures', 'inception', 'cap'],
 ['value', 'answer', 'considering'],
 ['dollars', 'ecosystem', 'increase'],
 ['addresses', 'active', 'institutional

In [1]:
## KeyBERT extractor
from keybert import KeyBERT


In [2]:
# Instantiate KeyBERT with its default constructor arguments.
keyBert_model = KeyBERT()

In [5]:
# Extract single-word keyphrases from the first premise only, filtering out
# NLTK's English stop words.
keyBert_model.extract_keywords(
    premises[0],
    keyphrase_ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
)


[('bitcoin', 0.544),
 ('issue', 0.2982),
 ('rethink', 0.2884),
 ('irrelevant', 0.2783),
 ('mind', 0.2711)]

In [13]:
# Fetch the stop-word list once: it is identical for every premise, so there is
# no reason to look it up again on each iteration.
english_stop_words = stopwords.words('english')

# One list of single-word (keyword, score) results per premise, in the same
# order as `premises`.
n = [
    keyBert_model.extract_keywords(
        premise, keyphrase_ngram_range=(1, 1), stop_words=english_stop_words)
    for premise in premises
]


In [15]:
# Show the KeyBERT results collected for every premise.
n

[[('bitcoin', 0.544),
  ('issue', 0.2982),
  ('rethink', 0.2884),
  ('irrelevant', 0.2783),
  ('mind', 0.2711)],
 [('bitcoin', 0.5795),
  ('stupid', 0.2992),
  ('concept', 0.2673),
  ('trouble', 0.2479),
  ('scam', 0.241)],
 [('buying', 0.4393),
  ('late', 0.3937),
  ('better', 0.341),
  ('time', 0.326),
  ('phrase', 0.3239)],
 [('bitcoin', 0.5695),
  ('investments', 0.3523),
  ('price', 0.2709),
  ('bought', 0.2561),
  ('sold', 0.2465)],
 [('bitcoin', 0.6451),
  ('cryptocurrency', 0.4745),
  ('invest', 0.3661),
  ('price', 0.3572),
  ('900', 0.2047)],
 [('cryptocurrency', 0.7045),
  ('crypto', 0.6357),
  ('bitcoin', 0.5919),
  ('currency', 0.404),
  ('decentralized', 0.3427)],
 [('bitcoin', 0.6835),
  ('gold', 0.4226),
  ('securely', 0.3601),
  ('digital', 0.2908),
  ('value', 0.2482)],
 [('gold', 0.5289),
  ('currency', 0.518),
  ('monetary', 0.4842),
  ('inflation', 0.4158),
  ('exchange', 0.3269)],
 [('bitcoin', 0.7381),
  ('bitcoins', 0.7235),
  ('monetary', 0.381),
  ('verified',

In [16]:
# Show the raw premise texts that were passed to both extractors.
premises

['What bitcoin has done to date is irrelevant. If you haven’t bought any or don’t understand it then that’s not the issue. The issue is whether you are openminded enough to rethink everything you know about money and investing. If you are, there is opportunity.\n\nIf your mind is closed, the world is closed, and so are the opportunities.',
 'The trouble with bitcoin is it’s not easy to understand. When people don’t understand a concept they quickly dismiss it and call it a scam so they don’t feel stupid.',
 'You’re less worried about whether it’s “too late” or “is now a good time” and focused on doing your research and understanding what you’re buying. This phrase sums it up better than I could.',
 'One of my early investments in bitcoin was short term. I bought a lot and then sold it when the price crashed in 2017. That short-term thinking has meant I’m working a few years longer than I need to.',
 'The price of Bitcoin has jumped 18,749,900% since 2010. It made sense to invest in it 