# Process Web JSON and Find Center Claims

* Load the web JSON file
* Extract claims and premises (data parsing)
* Find center claims

In [None]:
import json
import os

## Load web data and extract claims

In [None]:
from helper_function import count_valid_posts, get_claims
import numpy as np

In [None]:
# Select the dataset to process by leaving exactly one path uncommented.
# data_file_path = '../data/v3_70/Should-I-invest-in-Bitcoin_with_labels_v2_70.json'
# data_file_path = '../data/v3_70/Would-you-get-into-a-self-driving-car.json'
data_file_path = '../data/v3_70/auto-driving-aligned.json'
# data_file_path = '../data/v3_70/bitcoin-invest-aligned.json'
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(data_file_path, 'r', encoding='utf-8') as f:
    json_content = json.load(f)
# Last expression of the cell: its return value is shown as the cell output.
count_valid_posts(json_content)


In [None]:
# Extract the claims from the loaded JSON content.
# NOTE(review): the rest of the notebook indexes `claims` by position and passes
# it to the tokenizers, so it is presumably a list of strings — confirm against
# helper_function.get_claims.
claims = get_claims(json_content)
print(f"get {len(claims)} claims")
# Display the extracted claims as the cell output.
claims

## Sentiment Analysis

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

def preprocess(text):
    """Mask user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every other token that
    starts with 'http' becomes 'http'. Tokens are re-joined with single
    spaces, so the original spacing is preserved.
    """
    def _mask(token):
        # A lone '@' is not a mention and is left untouched.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Local checkpoint directory of the sentiment classifier.
sentiment_model_path = '../pretrained_models/twitter-xlm-roberta-base-sentiment'
# The config provides the id2label mapping used to name the predicted classes.
sentiment_config = AutoConfig.from_pretrained(sentiment_model_path)
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_path)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_path)


In [None]:
## Score every claim with the sentiment classifier

# Mask user mentions and links before tokenizing.
preprocessed_claims = list(map(preprocess, claims))
sentiment_encodings = sentiment_tokenizer(
    preprocessed_claims,
    padding=True,
    truncation=True,
    max_length=20,  # longer inputs are truncated to 20 tokens
    return_tensors='pt',
)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    sentiment_output = sentiment_model(**sentiment_encodings)
sentiment_scores = sentiment_output.logits
# Softmax over the class axis turns each claim's logits into probabilities.
sentiment_probability = F.softmax(sentiment_scores, dim=1).numpy()

## The printed mapping gives the label of each probability column
print(sentiment_config.id2label)

In [None]:
## Sentiment probabilities of the first five claims (one row per claim).
## NOTE(review): the column order follows sentiment_config.id2label; confirm it
## really is positive, neutral, negative.
sentiment_probability[:5]

## Get center claims

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the mask.

    model_output: indexable whose first element holds the token embeddings.
    attention_mask: mask that is broadcast over the embedding axis; positions
        with value 0 contribute nothing to the average.

    Returns the masked sum over axis 1 divided by the mask total, with the
    divisor clamped to at least 1e-9 so a fully masked row yields zeros
    instead of a division by zero.
    """
    token_embeddings = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / mask_total

In [None]:
# Load the sentence-embedding model and its tokenizer from the local checkpoint directory
embedding_model_path = '../pretrained_models/all-MiniLM-L6-v2'
model = AutoModel.from_pretrained(embedding_model_path)
tokenizer = AutoTokenizer.from_pretrained(embedding_model_path)


In [None]:
# Tokenize all claims as one padded batch.
encoded_input = tokenizer(claims, return_tensors='pt', truncation=True, padding=True)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    model_output = model(**encoded_input)
# One vector per claim: mask-weighted mean of its token embeddings.
pooled_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
# L2-normalize each row, then convert to a NumPy array
sentence_embeddings = F.normalize(pooled_embeddings, p=2, dim=1).numpy()

In [None]:
# The embeddings already come out of `.numpy()` as an ndarray; np.array() makes
# a copy of them here.
sentence_embeddings = np.array(sentence_embeddings)
# Display the array shape (one row per claim).
sentence_embeddings.shape

In [None]:
### Use PCA to reduce the dimensionality for visualization
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Project the embeddings onto two principal components and plot them.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(sentence_embeddings)
# Share of the total variance captured by each of the two components.
explained_variance = pca.explained_variance_ratio_
print("Explained variance = ",sum(explained_variance))
# Rows of the transpose are the per-component coordinates.
first_component, second_component = x_pca.T
plt.scatter(x=first_component, y=second_component)
plt.show()

## Clustering

In [None]:
# clustering
from sklearn.cluster import SpectralClustering, AffinityPropagation

In [None]:
# Re-express the embeddings in PCA coordinates before clustering.
# NOTE(review): PCA() is created without n_components, so per the scikit-learn
# documentation it keeps min(n_samples, n_features) components; the dimension
# therefore only shrinks when there are fewer claims than embedding features.
# Confirm that this is intended.

pca = PCA()
sentence_embeddings_distilled = pca.fit_transform(sentence_embeddings)

In [None]:
# Optional step: concatenate the sentiment probabilities as extra features.
# Decided not to use it, so the line below stays commented out.
# sentence_embeddings_distilled = np.concatenate((sentence_embeddings_distilled, sentiment_probability), axis=1)
# Display the shape of the feature matrix that is passed to the clustering step.
sentence_embeddings_distilled.shape

In [None]:
# Cluster the claims. Later cells read `cluster.cluster_centers_indices_` to
# pick one center claim per cluster, so any replacement algorithm must provide
# that attribute as well.
cluster_algo = AffinityPropagation(random_state=43)
# cluster_algo = SpectralClustering(random_state=43, n_clusters=18)
# cluster = cluster_algo.fit(sentence_embeddings)
cluster = cluster_algo.fit(sentence_embeddings_distilled)
cluster_labels = np.unique(cluster.labels_)
# len() reports the cluster count as a plain integer; formatting `.shape`
# would print a tuple such as "(18,)".
print("Clustered into {} clusters with labels {}".format(
    len(cluster_labels), cluster_labels))


In [None]:
# Number of claims per cluster label, printed as {label: size}.
unique, count = np.unique(cluster.labels_, return_counts=True)
print(dict(zip(unique, count)))


In [None]:
# Group the claim texts by cluster: {cluster id: [claims in that cluster]}.
n_clusters = len(np.unique(cluster.labels_))
claim_clusters = {cluster_id: [] for cluster_id in range(n_clusters)}
# labels_ holds one cluster id per claim, in the same order as `claims`.
for claim_text, label in zip(claims, cluster.labels_):
    claim_clusters[label].append(claim_text)
claim_clusters

In [None]:
# Label every cluster center with its most probable sentiment.
#   claimCenter_polarty:    center claim text -> sentiment label
#   center_claims_relation: cluster id        -> center claim text
claimCenter_polarty = {}
center_claims_relation = {}
print(sentiment_config.id2label)
# NOTE(review): if a literal 'Yes' center claim should always count as
# 'Positive', apply that override after the loop below; a check placed before
# the loop can never fire because the dict is still empty at that point.
for i, term in enumerate(cluster.cluster_centers_indices_):
    # `term` is the position of this cluster's center within `claims`.
    center_claim = claims[term]
    center_claims_relation[i] = center_claim
    print(f"{i}:\t{center_claim}", end='\t')
    # The highest-probability column selects the sentiment label.
    polarity_index = np.argmax(sentiment_probability[term])
    polarity_label = sentiment_config.id2label[polarity_index]
    print(polarity_label, sentiment_probability[term])
    claimCenter_polarty[center_claim] = polarity_label


## Assign center claims

In [None]:
# Attach the center claim and its sentiment to every claim entry of the JSON.
# `count` is the running claim position; it indexes cluster.labels_.
# NOTE(review): this assumes the claims are visited here in the same order in
# which get_claims() returned them — confirm against helper_function.
count = 0
center_indices = cluster.cluster_centers_indices_
for answer in json_content['answers']:
    # Falsy answers are skipped.
    if not answer:
        continue
    for claim_entry in answer['claim']:
        # cluster label -> index of that cluster's center -> claim text
        claim_center = claims[center_indices[cluster.labels_[count]]]
        claim_entry['claimCenter'] = claim_center
        claim_entry['claimSentiment'] = claimCenter_polarty[claim_center]
        count += 1


In [None]:
# Display the center claim assigned to the last claim handled by the loop above.
claim_center

In [None]:
# Select the output file by leaving exactly one path uncommented; it should
# match the dataset chosen when loading.
# output_file_path = '../data/v3_70_filled/Should-I-invest-in-Bitcoin_with_labels_v3_70_filled.json'
# output_file_path = '../data/v3_70_filled/Would-you-get-into-a-self-driving-car_v3_70_filled.json'
output_file_path = '../data/v3_70_filled/auto-driving-aligned.json'
# output_file_path = '../data/v3_70_filled/bitcoin-invest-aligned.json'
# ensure_ascii=False writes non-ASCII characters unescaped, so the file is
# opened as UTF-8 explicitly instead of with the platform's default encoding.
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_content, fp=f, indent=4, ensure_ascii=False)


In [None]:
# Export the clustering results of one task as three JSON files.
# NOTE(review): "automos" looks like a typo of "autonomous", but the value is
# part of an on-disk directory name, so it is kept as is — confirm before
# renaming.
task = "automos driving"
# task = "bitcoin investment"
stats_dir = f"../data/newest_data(rolling update)/claim center and statistics/{task}"

# Group the center claims by lower-cased sentiment label: label -> [claims].
# Built before any file is opened so that a failure here cannot leave a
# truncated output file behind.
inv_map = {}
for k, v in claimCenter_polarty.items():
    inv_map.setdefault(v.lower(), []).append(k)

# ensure_ascii=False writes non-ASCII characters unescaped, so every file is
# opened as UTF-8 explicitly instead of with the platform's default encoding.
with open(f"{stats_dir}/claim_clusters.json", 'w', encoding='utf-8') as f:
    json.dump(claim_clusters, f, indent=4, ensure_ascii=False)
with open(f"{stats_dir}/stance_count.json", 'w', encoding='utf-8') as f:
    json.dump(inv_map, fp=f, indent=4, ensure_ascii=False)
with open(f"{stats_dir}/center_claims_relation.json", 'w', encoding='utf-8') as f:
    json.dump(center_claims_relation, f, indent=4, ensure_ascii=False)
