# Semantics Aggregation Notebook
This notebook will search through relevant tagworks output files, specifically form files, and aggregate them based on article. In essence, this notebook maps all tagworks form files for a specific article to a single csv that contains the following columns:

* article id
* article sha256
* Credibility Indicator Category 
* Start 
* End 
* Case Number 
* Indices of Label in Article

Each row in the csv represents a line in the article, which is either a quoted source, an assertion, an argument, or a claim that needs fact-checking.

In [1]:
import json
import csv
import os
import glob
import pandas as pd
import numpy as np

In [2]:
def read_csv_from_path(path):
    """Parse the csv located at ``path`` with pandas and return the DataFrame."""
    frame = pd.read_csv(path)
    return frame

In [3]:
# Load the tag export into a DataFrame. The path is relative, so the file must
# sit in the kernel's current working directory.
raw_data = read_csv_from_path('Covid_Form1.0.adjudicated-2020-10-04T2314-Tags.csv')

In [4]:
# Preview the first rows of the loaded table.
raw_data.head()

Unnamed: 0,article_batch_name,article_number,article_filename,article_sha256,article_text_length,tua_group_uuid,tua_group_name,tua_batch_uuid,tua_batch_name,tua_batch_final,...,tua_uuid,namespace,topic_name,case_number,answer_uuid,extra,highlight_count,start_pos,end_pos,target_text
0,CovidArticles/USMightBeComplementingIran.txt,100054,USMightBeComplementingIran.txt,7360da3cdcf83a48e365821654ef0750810f3483efb8e2...,1870,a8428c88-cd9d-4318-a5cf-3fd0f25a9ce3,Covid_Form1.0.adjudicated,e52b4e75-0712-4fc6-98bf-d19cb253c381,Adjudicator Nick Adams HLTR source task 858 ar...,True,...,fc9c2896-5771-45f3-8541-0ddc30b5a589,Form.2020_03txt,Arguments,1,,{},1,1747,1868,"So far, as many as 988 people have died from t..."
1,CovidArticles/USMightBeComplementingIran.txt,100054,USMightBeComplementingIran.txt,7360da3cdcf83a48e365821654ef0750810f3483efb8e2...,1870,a8428c88-cd9d-4318-a5cf-3fd0f25a9ce3,Covid_Form1.0.adjudicated,e52b4e75-0712-4fc6-98bf-d19cb253c381,Adjudicator Nick Adams HLTR source task 858 ar...,True,...,6bd59682-6d58-4f7b-9df2-8a0913e404fc,Form.2020_03txt,Arguments,5,,{},2,1104,1397,The US reinstated its sanctions against Iran i...
2,CovidArticles/USMightBeComplementingIran.txt,100054,USMightBeComplementingIran.txt,7360da3cdcf83a48e365821654ef0750810f3483efb8e2...,1870,a8428c88-cd9d-4318-a5cf-3fd0f25a9ce3,Covid_Form1.0.adjudicated,e52b4e75-0712-4fc6-98bf-d19cb253c381,Adjudicator Nick Adams HLTR source task 858 ar...,True,...,6bd59682-6d58-4f7b-9df2-8a0913e404fc,Form.2020_03txt,Arguments,5,,{},2,1400,1488,"Washington, meanwhile, claims that it has exem..."
3,CovidArticles/USMightBeComplementingIran.txt,100054,USMightBeComplementingIran.txt,7360da3cdcf83a48e365821654ef0750810f3483efb8e2...,1870,a8428c88-cd9d-4318-a5cf-3fd0f25a9ce3,Covid_Form1.0.adjudicated,e52b4e75-0712-4fc6-98bf-d19cb253c381,Adjudicator Nick Adams HLTR source task 858 ar...,True,...,41406259-44fd-4345-a576-d5c29c08901b,Form.2020_03txt,Arguments,6,,{},2,1243,1397,Tehran sued Washington at the International Co...
4,CovidArticles/USMightBeComplementingIran.txt,100054,USMightBeComplementingIran.txt,7360da3cdcf83a48e365821654ef0750810f3483efb8e2...,1870,a8428c88-cd9d-4318-a5cf-3fd0f25a9ce3,Covid_Form1.0.adjudicated,e52b4e75-0712-4fc6-98bf-d19cb253c381,Adjudicator Nick Adams HLTR source task 858 ar...,True,...,41406259-44fd-4345-a576-d5c29c08901b,Form.2020_03txt,Arguments,6,,{},2,1543,1744,The Islamic Republic has written to the United...


In [5]:
# List the column names available in raw_data.
raw_data.columns

Index(['article_batch_name', 'article_number', 'article_filename',
       'article_sha256', 'article_text_length', 'tua_group_uuid',
       'tua_group_name', 'tua_batch_uuid', 'tua_batch_name', 'tua_batch_final',
       'source_task_uuid', 'tua_uuid', 'namespace', 'topic_name',
       'case_number', 'answer_uuid', 'extra', 'highlight_count', 'start_pos',
       'end_pos', 'target_text'],
      dtype='object')

In [6]:
# Show only the group-name and batch-name columns.
raw_data[['tua_group_name', 'tua_batch_name']]

Unnamed: 0,tua_group_name,tua_batch_name
0,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 858 ar...
1,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 858 ar...
2,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 858 ar...
3,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 858 ar...
4,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 858 ar...
...,...,...
121,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 866 ar...
122,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 866 ar...
123,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 866 ar...
124,Covid_Form1.0.adjudicated,Adjudicator Nick Adams HLTR source task 866 ar...


In [7]:
# First ten rows of the article number, start/end position and case number columns.
raw_data[['article_number', 'start_pos', 'end_pos', 'case_number']].head(10)

Unnamed: 0,article_number,start_pos,end_pos,case_number
0,100054,1747,1868,1
1,100054,1104,1397,5
2,100054,1400,1488,5
3,100054,1243,1397,6
4,100054,1543,1744,6
5,100054,337,445,1
6,100054,545,588,3
7,100054,152,333,4
8,100054,595,776,4
9,100054,888,1100,5


In [8]:
# Prototype of the index expansion: for every row, list each integer from
# start_pos through end_pos (end_pos included), then show the first five results.
raw_data[['article_number', 'start_pos', 'end_pos', 'case_number']].apply(lambda x: list(range(x['start_pos'], x['end_pos'] + 1)), axis=1).head()

0    [1747, 1748, 1749, 1750, 1751, 1752, 1753, 175...
1    [1104, 1105, 1106, 1107, 1108, 1109, 1110, 111...
2    [1400, 1401, 1402, 1403, 1404, 1405, 1406, 140...
3    [1243, 1244, 1245, 1246, 1247, 1248, 1249, 125...
4    [1543, 1544, 1545, 1546, 1547, 1548, 1549, 155...
dtype: object

In [90]:
def simple_data_from_raw_data(raw_data, article_id, output_dir="eta_forms/"):
    """
    Build the simplified label table for one article and save it as a csv.

    Selects article_number, article_sha256, topic_name, start_pos, end_pos and
    case_number from raw_data, renames them to 'Article ID', 'Article sha256',
    'Credibility Indicator Category', 'Start', 'End' and 'Case Number', keeps
    only the rows whose 'Article ID' equals article_id, and adds an
    'Indices of Label in Article' column listing every integer from Start
    through End (End included).

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the six source columns named above (a missing one raises
        KeyError from the column selection).
    article_id
        Value compared against article_number; also used as the csv file stem.
    output_dir : str, optional
        Directory that receives ``<article_id>.csv``. It is created when it
        does not exist yet. Defaults to "eta_forms/".

    Returns
    -------
    pandas.DataFrame
        The per-article table that was written to disk. When no row matches
        article_id, a one-row "no_article" placeholder is printed about,
        written and returned instead.

    NOTE(review): the placeholder uses a different column set ('Question
    Number', 'Answer Number', ...) than the normal result. It is kept as-is
    for backward compatibility; confirm what downstream readers expect.
    """
    simple_data = raw_data[['article_number', 'article_sha256', 'topic_name', 'start_pos', 'end_pos', 'case_number']]
    simple_data = simple_data.rename(columns={
        'article_number': 'Article ID',
        'article_sha256': 'Article sha256',
        'topic_name': 'Credibility Indicator Category',
        'start_pos': 'Start',
        'end_pos': 'End',
        'case_number': 'Case Number',
    })

    # Filter once; the copy lets us add a column without touching raw_data.
    sub_simple_data = simple_data[simple_data['Article ID'] == article_id].copy()

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target exists before writing (an empty output_dir means "current dir").
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    file_name = os.path.join(output_dir, str(article_id) + ".csv")

    if sub_simple_data.empty:
        new_df = pd.DataFrame([["no_article", 0, 0, 0, 0, 0, 0]], columns=['Credibility Indicator Category', 'Question Number', 'Answer Number','Point Recommendation', 'Credibility Indicator Name', 'Start', 'End'])
        print("This article_id is not in the csv")
        new_df.to_csv(file_name)
        return new_df

    # Expand each (Start, End) pair into the full list of covered indices.
    label_indices = [
        list(range(start, end + 1))
        for start, end in zip(sub_simple_data['Start'], sub_simple_data['End'])
    ]
    sub_simple_data['Indices of Label in Article'] = pd.Series(label_indices, index=sub_simple_data.index)
    sub_simple_data.to_csv(file_name)
    return sub_simple_data
    

In [91]:
# Build the simplified table for article 2005 (this also writes it to
# eta_forms/2005.csv), then display it.
simple_data = simple_data_from_raw_data(raw_data, 2005)
simple_data

Unnamed: 0,Article ID,Article sha256,Credibility Indicator Category,Start,End,Case Number,Indices of Label in Article
42,2005,47990959103662e94e796d979018922afddc880fb4b867...,Arguments,113,259,1,"[113, 114, 115, 116, 117, 118, 119, 120, 121, ..."
43,2005,47990959103662e94e796d979018922afddc880fb4b867...,Arguments,262,370,2,"[262, 263, 264, 265, 266, 267, 268, 269, 270, ..."
44,2005,47990959103662e94e796d979018922afddc880fb4b867...,Arguments,758,879,2,"[758, 759, 760, 761, 762, 763, 764, 765, 766, ..."
45,2005,47990959103662e94e796d979018922afddc880fb4b867...,Arguments,882,1512,2,"[882, 883, 884, 885, 886, 887, 888, 889, 890, ..."
46,2005,47990959103662e94e796d979018922afddc880fb4b867...,Arguments,1624,2436,2,"[1624, 1625, 1626, 1627, 1628, 1629, 1630, 163..."
47,2005,47990959103662e94e796d979018922afddc880fb4b867...,Arguments,2440,3146,3,"[2440, 2441, 2442, 2443, 2444, 2445, 2446, 244..."
48,2005,47990959103662e94e796d979018922afddc880fb4b867...,Assertions,3726,3811,1,"[3726, 3727, 3728, 3729, 3730, 3731, 3732, 373..."
49,2005,47990959103662e94e796d979018922afddc880fb4b867...,Assertions,3240,3291,2,"[3240, 3241, 3242, 3243, 3244, 3245, 3246, 324..."
50,2005,47990959103662e94e796d979018922afddc880fb4b867...,Needs Fact-Check,3294,3608,1,"[3294, 3295, 3296, 3297, 3298, 3299, 3300, 330..."
51,2005,47990959103662e94e796d979018922afddc880fb4b867...,Needs Fact-Check,3612,3724,2,"[3612, 3613, 3614, 3615, 3616, 3617, 3618, 361..."


In [71]:
# Distinct article ids present in simple_data.
simple_data['Article ID'].unique()

array([2005])

In [72]:
# Shape of the subset for article id 48; zero rows means that id is absent from simple_data.
simple_data[simple_data['Article ID'] == 48].shape

(0, 7)

In [34]:
def convert_to_csv_per_article(simple_data, article_id, output_dir="eta_forms/"):
    """
    Write the rows of simple_data that belong to one article to a csv file.

    Parameters
    ----------
    simple_data : pandas.DataFrame
        Must contain an 'Article ID' column.
    article_id
        Value compared against 'Article ID'; also used as the csv file stem.
    output_dir : str, optional
        Directory that receives ``<article_id>.csv``. It is created when it
        does not exist yet. Defaults to "eta_forms/".

    Returns
    -------
    None
        The function only writes a file. When no row matches article_id it
        prints a notice and writes a one-row "no_article" placeholder instead.

    NOTE(review): the placeholder's columns ('Question Number', 'Answer
    Number', ...) do not match simple_data's columns. It is kept unchanged for
    backward compatibility; confirm what downstream readers expect.
    """
    # Filter once and reuse the result for both the emptiness check and the export.
    sub_simple_data = simple_data[simple_data['Article ID'] == article_id]

    # DataFrame.to_csv does not create missing directories, so make sure the
    # target exists before writing (an empty output_dir means "current dir").
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    file_name = os.path.join(output_dir, str(article_id) + ".csv")

    if sub_simple_data.empty:
        new_df = pd.DataFrame([["no_article", 0, 0, 0, 0, 0, 0]], columns=['Credibility Indicator Category', 'Question Number', 'Answer Number','Point Recommendation', 'Credibility Indicator Name', 'Start', 'End'])
        print("This article_id is not in the csv")
        new_df.to_csv(file_name)
    else:
        sub_simple_data.to_csv(file_name)

In [36]:
# Export article 33; simple_data holds no such id, so this exercises the
# "not in the csv" branch (prints the notice and writes the placeholder file).
convert_to_csv_per_article(simple_data, 33)

This article_id is not in the csv
