# Datahunt Aggregation Notebook
The purpose of this notebook is to collect rows associated with datahunt files associated with a specific article.
It reads in datahunt csvs and outputs a single csv corresponding to a single article with the following columns:/
* Credibility Indicator Category 	
* Question Number 	
* Answer Number 	
* Point Recommendation 	
* Credibility Indicator Name 	
* Start 	
* End

Currently, this datahunt files only contain useful information about article 100059

In [1]:
import json
import csv
import os
import glob
import pandas as pd
import numpy as np
import re
from collections import Counter

Read in all the datahunt files from the datahunt folder.

In [2]:
path =  'datahunts'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

raw_data = pd.concat(li, axis=0, ignore_index=True)

In [3]:
raw_data['article_number'].unique()

array([100059])

In [4]:
if not (raw_data[(raw_data["start_pos"] == -1) & (raw_data["end_pos"] == -1)].head()).empty:
    print('Warning: there are rows in this file that have invalid start and end indices.'+
          'This means their data may correspond to invalid sections of the article.')

## Branch 1

We'll pick up the:

* Contributor ID column
* article's sha256
* article number
* question text
* answer text
* start of highlight in the article
* end of highlight in the article

In [5]:
clean_data = raw_data[["contributor_uuid", "article_sha256", "article_number", "question_text", "answer_text", "start_pos", "end_pos"]]
clean_data.head(3)

Unnamed: 0,contributor_uuid,article_sha256,article_number,question_text,answer_text,start_pos,end_pos
0,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,Is a general or singular causal claim made? Hi...,"General Causation (In general, X causes Y.)",3197,3311
1,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What evidence is given for the primary causal ...,Correlation,3375,3384
2,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What evidence is given for the primary causal ...,Correlation,3375,3528


## Select Functions

In [6]:
# Selects rows with the inputted user_id. 
def select_user_id(df, user_id):
    return df[df["contributor_uuid"] == user_id]

In [7]:
# Selects rows with the inputted article_id. 
def select_article_id(df, article_id):
    return df[df["article_number"] == article_id]

In [8]:
# Takes out invalid start_pos and end_pos indices from the dataframe. 
def select_valid_indices(df):
    return df[(df["start_pos"] != 0) | (df["start_pos"] != -1) | (df["end_pos"] != 0) | (df["start_pos"] != -1)]

## Convert to CSV

In [9]:
# Converts dataframe to CSV.
def convert_to_csv(df, category, arg):
    if category == "user_id":
        df = select_user_id(df, arg)
        name = str(df["contributor_uuid"][0])
    elif category == "article_id":
        df = select_article_id(df, arg)
        name = str(df["article_number"][0])
    elif category == "valid_indices":
        df = select_valid_indices(df)
        name = "valid"
    else:
        raise ValueError("Invalid category type")
        
    # "[articleid]_[userid]_user_contributions.csv"
    df.to_csv("aggregate_datahunts/" + name + "_user_contributions.csv")

In [10]:
# Converts dataframe to CSV, putting article_number and contributor_uuid in the name. 
def convert_to_csv_user_article(df, user_id, article_id):
    df = select_user_id(df, user_id)
    df = select_article_id(df, article_id)
    df = select_valid_indices(df)
    name = str(df["article_number"][0]) + "_" + str(df["contributor_uuid"][0])
    df.to_csv("aggregate_datahunts/" + name + "_user_contributions.csv")

### Test

In [11]:
select_article_id(clean_data, 100059).head(3)

Unnamed: 0,contributor_uuid,article_sha256,article_number,question_text,answer_text,start_pos,end_pos
0,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,Is a general or singular causal claim made? Hi...,"General Causation (In general, X causes Y.)",3197,3311
1,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What evidence is given for the primary causal ...,Correlation,3375,3384
2,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What evidence is given for the primary causal ...,Correlation,3375,3528


In [12]:
convert_to_csv(clean_data, "article_id", 100059)

FileNotFoundError: [Errno 2] No such file or directory: 'aggregate_datahunts/100059_user_contributions.csv'

In [None]:
convert_to_csv(clean_data, "user_id", "85579cf2-e01c-45c5-b9e7-34b40467148d")

In [13]:
clean_data[clean_data["article_number"] == 2005]

Unnamed: 0,contributor_uuid,article_sha256,article_number,question_text,answer_text,start_pos,end_pos


In [14]:
clean_data

Unnamed: 0,contributor_uuid,article_sha256,article_number,question_text,answer_text,start_pos,end_pos
0,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,Is a general or singular causal claim made? Hi...,"General Causation (In general, X causes Y.)",3197,3311
1,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What evidence is given for the primary causal ...,Correlation,3375,3384
2,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What evidence is given for the primary causal ...,Correlation,3375,3528
3,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,How representative is the sample of the popula...,Not at all representative,0,0
4,85579cf2-e01c-45c5-b9e7-34b40467148d,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,What is the best alternative explanation you c...,"Even as the author says, secondary factors can...",0,0
...,...,...,...,...,...,...,...
1245,bd786026-bad5-4fa8-9a3a-38ca03a16412,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,"Given the information you have, how likely is ...",Very Unlikely,0,0
1246,bd786026-bad5-4fa8-9a3a-38ca03a16412,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,"Given the information you have, how likely is ...",Somewhat Unlikely,0,0
1247,bd786026-bad5-4fa8-9a3a-38ca03a16412,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,Does the author suggest in any way that furthe...,"Yes, implicitly",1023,1106
1248,bd786026-bad5-4fa8-9a3a-38ca03a16412,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,100059,"How difficult was this task for you, on the wh...",6,0,0


In [15]:
convert_to_csv_user_article(clean_data, "85579cf2-e01c-45c5-b9e7-34b40467148d", 100059)

FileNotFoundError: [Errno 2] No such file or directory: 'aggregate_datahunts/100059_85579cf2-e01c-45c5-b9e7-34b40467148d_user_contributions.csv'

## Branch 2
### Points based on Topic Name, Question Number, Answer Number

In [16]:
weight_key = pd.read_csv('weight_key.csv')

In [17]:
"""
create_eta_datahunt will create Explore The Article datahunt csvs containing the 
predicted individual contribution for each question asked by Tagworks.
    @param raw_data: the dataframe returned after aggregating datahunt csvs
    @param weight_key: a weight key that connects a question and answer to a score
    @param article_number: the article to create the eta_datahunt file for
    @param contributor_id: the contributor requesting the data
    @return: None. Writes a dataframe of the proper format to be fed into Visualization.html. Contains the predicted point values and labels for the individual contributions to Tagworks. This csv file is in eta_datahunts.

"""
def create_eta_datahunt(raw_data, weight_key, article_number, contributor_id):
    raw_data = raw_data.loc[raw_data["article_number"] == article_number]
    if raw_data.empty:
        new_df = pd.DataFrame([["no_article", 0, 0, 0, 0, 0, 0]], columns=['Credibility Indicator Category', 'Question Number', 'Answer Number','Point Recommendation', 'Credibility Indicator Name', 'Start', 'End'])
    else:
        raw_data = raw_data.loc[raw_data["contributor_uuid"] == contributor_id]
        if raw_data.empty:
            new_df = pd.DataFrame([['no_user', 0, 0, 0, 0, 0, 0]], columns=['Credibility Indicator Category', 'Question Number', 'Answer Number','Point Recommendation', 'Credibility Indicator Name', 'Start', 'End'])
        else:
            new_df = pd.DataFrame()
            for row_num in raw_data.index:
                new_row = new_from_row(raw_data, row_num)
                if (new_row.empty):
                    continue
                else:
                    new_df = new_df.append(new_row)
    new_df.to_csv("eta_datahunts/" + str(article_number) + "_" +
                  str(contributor_id) +
                  "_user_contributions.csv")

In [18]:
"""

Extracts question and answer number of a topic. 
    @param df: the raw_data dataframe
    @param row_number: the row of df that we are trying to extract the question and answer from
    @return: the row from the df with the question and answer values appended to the row.
    

"""

def get_TopicQA(df, row_number):
    select_df = df[['topic_name', 'question_label', 'answer_label', 'start_pos', 'end_pos']].copy()
    select_df['topic_name'] = select_df['topic_name'].apply(lambda x: x.split(' ')[0])
    select_df['question_label'] = select_df['question_label'].apply(lambda x: re.findall('Q\d+', x)[0])
    select_df['answer_label'] = select_df['answer_label'].apply(lambda x: re.findall('A\d+', x)[0])
    
    return select_df[select_df.index == row_number]

need to get points and label from weight key csv
want point value (col F), topic (category), subset (col G - label) 
 - extract this from weight_key csv (do string parsing - slice out the “Specialist” part)
into new data frame —> csv — different csv name from previous ones

In [19]:
"""
Gets points and label from csv.
    @param df: the raw_data dataframe
    @param row_number: the row of df that we are trying to extract the question and answer from
    @return: the row from the df with the points, labels, start and end indices appended to the row.

"""

def new_from_row(df, row_number):
    
    TQA_row = get_TopicQA(df, row_number)
    TQA_question = int(TQA_row[TQA_row.index == row_number]['question_label'][row_number][1:])
    TQA_answer = int(TQA_row[TQA_row.index == row_number]['answer_label'][row_number][1:])
    TQA_schema = TQA_row[TQA_row.index == row_number]['topic_name'][row_number]
    new_df = weight_key[(weight_key['Question_Number'] == float(TQA_question)) & (weight_key['Answer_Number'] == float(TQA_answer)) & (weight_key['Schema'] == TQA_schema)]
    if new_df.empty:
        print("There is no algorithm output for question ", TQA_question, 
              ",answer ", TQA_answer, ", and category ", TQA_schema)
        return new_df
    else:
        new_df = new_df.drop(columns=['Question_Number_V2', 'Question_Type', 'Key Question', 'answer_uuid'])
        new_df = new_df.rename(columns={'Question_Number': 'Question Number', 'Answer_Number':'Answer Number', 'Point_Recommendation': 'Point Recommendation', "Schema": "Credibility Indicator Category", "Label": "Credibility Indicator Name"})
        new_df['Start'] = np.array(TQA_row['start_pos'])
        new_df['End'] = np.array(TQA_row['end_pos'])
        return new_df
    

In [20]:
create_eta_datahunt(raw_data, weight_key, 100059, "a")