In [10]:
import os


def print_directory_structure(path, level=0):
    """Recursively print the directory tree rooted at ``path``.

    Each entry is printed on its own line, indented by two spaces per nesting
    level and prefixed with ``[DIR]`` or ``[FILE]``. Entries are listed in
    sorted order so the output is reproducible across runs and filesystems.

    Args:
        path: Directory whose contents should be listed.
        level: Current nesting depth; controls the indentation. Callers
            normally leave this at the default of 0.

    A directory that cannot be listed because of missing permissions is
    reported as ``[Permission Denied]`` instead of raising.
    """
    indent = '  ' * level
    try:
        # os.listdir returns entries in arbitrary order; sort them so the
        # printed tree is deterministic.
        entries = sorted(os.listdir(path))
    except PermissionError:
        print(indent + "[Permission Denied]")
        return

    for item in entries:
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            print(indent + f"[DIR] {item}")
            print_directory_structure(item_path, level + 1)
        else:
            print(indent + f"[FILE] {item}")


# Dataset root, given relative to the notebook's working directory.
start_path = "../dataset"
print_directory_structure(path=start_path)

[DIR] data
  [FILE] test-00000-of-00001.parquet
  [FILE] train-00000-of-00001.parquet
[FILE] dataset_dict.json
[DIR] full_test_set
  [FILE] dataset_dict.json
  [DIR] train
    [FILE] data-00000-of-00001.arrow
    [FILE] dataset_info.json
    [FILE] state.json
[DIR] full_train_set
  [FILE] dataset_dict.json
  [DIR] train
    [FILE] data-00000-of-00001.arrow
    [FILE] dataset_info.json
    [FILE] state.json
[DIR] test_set
  [FILE] dataset_dict.json
  [DIR] train
    [FILE] data-00000-of-00001.arrow
    [FILE] dataset_info.json
    [FILE] state.json
[DIR] train
  [FILE] dataset_dict.json
  [DIR] train
    [FILE] data-00000-of-00001.arrow
    [FILE] dataset_info.json
    [FILE] state.json


In [11]:
from datasets import Dataset, DatasetDict

root = "../dataset"

# Each split directory holds its single Arrow shard inside an inner "train"
# folder; the DatasetDict key is the same as the split directory name.
ds = DatasetDict({
    split_dir: Dataset.from_file(f"{root}/{split_dir}/train/data-00000-of-00001.arrow")
    for split_dir in ("train", "test_set", "full_train_set", "full_test_set")
})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'interview_question', 'interview_answer', 'label', 'url', 'inaudible', 'multiple_questions', 'affirmative_questions'],
        num_rows: 3448
    })
    test_set: Dataset({
        features: ['Unnamed: 0', 'question', 'interview_question', 'interview_answer', 'label', 'url', 'Annotator1', 'Annotator2', 'Annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions'],
        num_rows: 308
    })
    full_train_set: Dataset({
        features: ['title', 'date', 'president', 'url', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'label', 'annotator_id', 'inaudible', 'multiple_questions', 'affirmative_questions'],
        num_rows: 3448
    })
    full_test_set: Dataset({
        features: ['Unnamed: 0', 'interview_question', 'interview_answer', 'question', 'Annotator1', 'Annotator2', 'Annotator3', 'Label', 'url', 'inaudible', 'multiple_questions', 'affirmative_questions

In [12]:
import pandas as pd

# Convert every split to a pandas DataFrame, in the same order as the
# DatasetDict keys.
df_train, df_test, df_full_train, df_full_test = (
    ds[split_key].to_pandas()
    for split_key in ("train", "test_set", "full_train_set", "full_test_set")
)

In [13]:
print("Train Set Info:")

print("\n\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that printed a stray "None" after the report).
df_train.info()

print("\n\nShape:")
print(df_train.shape)

print("\n\nHead:")
# Last expression of the cell: rendered as a rich table by Jupyter.
df_train.head()

Train Set Info:


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448 entries, 0 to 3447
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   question               3448 non-null   object
 1   interview_question     3448 non-null   object
 2   interview_answer       3448 non-null   object
 3   label                  3448 non-null   object
 4   url                    3448 non-null   object
 5   inaudible              3448 non-null   bool  
 6   multiple_questions     3448 non-null   bool  
 7   affirmative_questions  3448 non-null   bool  
dtypes: bool(3), object(5)
memory usage: 144.9+ KB
None


Shape:
(3448, 8)


Head:


Unnamed: 0,question,interview_question,interview_answer,label,url,inaudible,multiple_questions,affirmative_questions
0,How would you respond to the accusation that t...,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
1,Do you think President Xi is being sincere abo...,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",General,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
2,Do you believe the country's slowdown and gro...,Q. No worries. Do you believe the country's sl...,"Look, I think China has a difficult economic p...",Partial/half-answer,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
3,Are you worried about the meeting between Pre...,Q. No worries. Do you believe the country's sl...,"Look, I think China has a difficult economic p...",Dodging,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
4,Is the President's engagement with Asian coun...,"Q. I can imagine. It is evening, I'd like to r...","Well, I hope I get to see Mr. Xi sooner than l...",Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,False


In [14]:
print("Test Set Info:")

print("\n\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that printed a stray "None" after the report).
df_test.info()

print("\n\nShape:")
print(df_test.shape)

print("\n\nHead:")
# Last expression of the cell: rendered as a rich table by Jupyter.
df_test.head()

Test Set Info:


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Unnamed: 0             308 non-null    int64 
 1   question               308 non-null    object
 2   interview_question     308 non-null    object
 3   interview_answer       308 non-null    object
 4   label                  308 non-null    object
 5   url                    308 non-null    object
 6   Annotator1             308 non-null    object
 7   Annotator2             308 non-null    object
 8   Annotator3             308 non-null    object
 9   inaudible              308 non-null    bool  
 10  multiple_questions     308 non-null    bool  
 11  affirmative_questions  308 non-null    bool  
dtypes: bool(3), int64(1), object(8)
memory usage: 22.7+ KB
None


Shape:
(308, 12)


Head:


Unnamed: 0.1,Unnamed: 0,question,interview_question,interview_answer,label,url,Annotator1,Annotator2,Annotator3,inaudible,multiple_questions,affirmative_questions
0,0,Inquiring about the status or information reg...,"Q. What about the redline, sir?","Well, the world has made it clear that these t...",Indirect,https://www.presidency.ucsb.edu/documents/the-...,Dodging,General,Dodging,False,False,True
1,1,Will you invite them to the White House to neg...,Q. Will you invite them to the White House to ...,I think that anytime and anyplace that they ar...,Indirect,https://www.presidency.ucsb.edu/documents/the-...,Deflection,General,General,False,False,False
2,2,Why was it necessary for Japan to drop the thr...,"Q. Harsh. Mr. President, Japan has dropped the...",I think that the purpose of the U.N. Security ...,Indirect,https://www.presidency.ucsb.edu/documents/the-...,Explicit,Implicit,Implicit,False,False,False
3,3,When will we see this resolution?,Q. The Lebanese Prime Minister is demanding a ...,I'll let Condi talk about the details of what ...,Indirect,https://www.presidency.ucsb.edu/documents/the-...,Explicit,General,General,False,False,False
4,4,Updating the figure of Iraqi deaths,"Q. Thank you, Mr. President. Back on Iraq, a g...","No, I don't consider it a credible report; nei...",Indirect,https://www.presidency.ucsb.edu/documents/the-...,Dodging,Implicit,Dodging,False,False,True


In [15]:
print("Full Train Set Info:")

print("\n\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that printed a stray "None" after the report).
df_full_train.info()

print("\n\nShape:")
print(df_full_train.shape)

print("\n\nHead:")
# Last expression of the cell: rendered as a rich table by Jupyter.
df_full_train.head()

Full Train Set Info:


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448 entries, 0 to 3447
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   title                  3448 non-null   object
 1   date                   3448 non-null   object
 2   president              3448 non-null   object
 3   url                    3448 non-null   object
 4   interview_question     3448 non-null   object
 5   interview_answer       3448 non-null   object
 6   gpt3.5_summary         3448 non-null   object
 7   gpt3.5_prediction      3448 non-null   object
 8   question               3448 non-null   object
 9   label                  3448 non-null   object
 10  annotator_id           3448 non-null   int64 
 11  inaudible              3448 non-null   bool  
 12  multiple_questions     3448 non-null   bool  
 13  affirmative_questions  3448 non-null   bool  
dtypes: bool(3), int64(1), object(10)
memory usa

Unnamed: 0,title,date,president,url,interview_question,interview_answer,gpt3.5_summary,gpt3.5_prediction,question,label,annotator_id,inaudible,multiple_questions,affirmative_questions
0,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",The question consists of 2 parts: \n1. How wou...,Question part: 1. How would you respond to the...,How would you respond to the accusation that t...,Explicit,85,False,False,False
1,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",The question consists of 2 parts: \n1. How wou...,Question part: 1. How would you respond to the...,Do you think President Xi is being sincere abo...,General,85,False,False,False
2,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,Q. No worries. Do you believe the country's sl...,"Look, I think China has a difficult economic p...",The question consists of two parts:\n\n1. Q1: ...,Question part: Q1 - Do you believe the country...,Do you believe the country's slowdown and gro...,Partial/half-answer,85,False,False,False
3,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,Q. No worries. Do you believe the country's sl...,"Look, I think China has a difficult economic p...",The question consists of two parts:\n\n1. Q1: ...,Question part: Q1 - Do you believe the country...,Are you worried about the meeting between Pre...,Dodging,85,False,False,False
4,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,"Q. I can imagine. It is evening, I'd like to r...","Well, I hope I get to see Mr. Xi sooner than l...",The question consists of 3 parts:\n1. Is the P...,Question part: 1. Is the President's engagemen...,Is the President's engagement with Asian coun...,Explicit,85,False,False,False


In [16]:
print("Full Test Set Info:")

print("\n\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that printed a stray "None" after the report).
df_full_test.info()

print("\n\nShape:")
print(df_full_test.shape)

print("\n\nHead:")
# Last expression of the cell: rendered as a rich table by Jupyter.
df_full_test.head()

Full Test Set Info:


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Unnamed: 0             308 non-null    int64 
 1   interview_question     308 non-null    object
 2   interview_answer       308 non-null    object
 3   question               308 non-null    object
 4   Annotator1             308 non-null    object
 5   Annotator2             308 non-null    object
 6   Annotator3             308 non-null    object
 7   Label                  308 non-null    object
 8   url                    308 non-null    object
 9   inaudible              308 non-null    bool  
 10  multiple_questions     308 non-null    bool  
 11  affirmative_questions  308 non-null    bool  
dtypes: bool(3), int64(1), object(8)
memory usage: 22.7+ KB
None


Shape:
(308, 12)


Head:


Unnamed: 0.1,Unnamed: 0,interview_question,interview_answer,question,Annotator1,Annotator2,Annotator3,Label,url,inaudible,multiple_questions,affirmative_questions
0,0,"Q. What about the redline, sir?","Well, the world has made it clear that these t...",Inquiring about the status or information reg...,Dodging,General,Dodging,Indirect,https://www.presidency.ucsb.edu/documents/the-...,False,False,True
1,1,Q. Will you invite them to the White House to ...,I think that anytime and anyplace that they ar...,Will you invite them to the White House to neg...,Deflection,General,General,Indirect,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
2,2,"Q. Harsh. Mr. President, Japan has dropped the...",I think that the purpose of the U.N. Security ...,Why was it necessary for Japan to drop the thr...,Explicit,Implicit,Implicit,Indirect,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
3,3,Q. The Lebanese Prime Minister is demanding a ...,I'll let Condi talk about the details of what ...,When will we see this resolution?,Explicit,General,General,Indirect,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
4,4,"Q. Thank you, Mr. President. Back on Iraq, a g...","No, I don't consider it a credible report; nei...",Updating the figure of Iraqi deaths,Dodging,Implicit,Dodging,Indirect,https://www.presidency.ucsb.edu/documents/the-...,False,False,True


In [17]:
# For every split, print a banner, the column names, and the first record.
for split_name, split in ds.items():
    print(
        "\n==============================",
        f"Split: {split_name}",
        "==============================",
        sep="\n",
    )

    df = split.to_pandas()

    print("Columns:", df.columns.tolist(), sep="\n")
    print("\nFirst row:", df.iloc[0], sep="\n")


Split: train
Columns:
['question', 'interview_question', 'interview_answer', 'label', 'url', 'inaudible', 'multiple_questions', 'affirmative_questions']

First row:
question                 How would you respond to the accusation that t...
interview_question       Q. Of the Biden administration. And accused th...
interview_answer         Well, look, first of all, theI am sincere abou...
label                                                             Explicit
url                      https://www.presidency.ucsb.edu/documents/the-...
inaudible                                                            False
multiple_questions                                                   False
affirmative_questions                                                False
Name: 0, dtype: object

Split: test_set
Columns:
['Unnamed: 0', 'question', 'interview_question', 'interview_answer', 'label', 'url', 'Annotator1', 'Annotator2', 'Annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions'

## df_train
### real training data

Contains:
* the questions
* the answer
* clarity label
    - explicit / indirect / evasive
* a few helpful tags (`inaudible`, etc.)

To be used in **Task 1 (Clarity Classification)** - train the model


## df_test
### public test set

Contains:
* clarity label
* evasion labels (`Annotator1`, `Annotator2`, `Annotator3`)

To be used in both **Task 1 (Clarity Classification) and Task 2 (Evasive Technique Classification)** - test and evaluate the model


## df_full_train
### train + extra info

Same as train but with:
* president name
* date
* GPT Summary
* GPT Prediction
* interview title
* annotator id

To be used in **Task 1 (Clarity Classification)** - train the model. It still cannot be used for **Task 2 (Evasive Technique Classification)**.

## df_full_test
### test + extra info

Same as test but with:
* clarity label
* evasion labels
* metadata

No direct use