## CommonSenseQA 

## Imports and Dataset Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings

# Ignore all warnings for the rest of the session.
# Note: no category is given, so this also hides pandas deprecation/future warnings.
warnings.filterwarnings("ignore")

In [2]:
#location of the training split on disk (JSON Lines file)
file_path=r"C:\Users\Bia\OneDrive\Ambiente de Trabalho\TESEEE\DATASETS TESE\CommomSenseQA\train_rand_split.jsonl"

#read the file line by line, one JSON object per row of the DataFrame
df_CSQA = pd.read_json(file_path, lines=True)

## Re-Structure Dataset

In [3]:
#display the loaded DataFrame to inspect its columns and raw values
df_CSQA

Unnamed: 0,answerKey,id,question
0,A,075e483d21c29a511267ef62bedc0461,"{'question_concept': 'punishing', 'choices': [..."
1,B,61fe6e879ff18686d7552425a36344c8,"{'question_concept': 'people', 'choices': [{'l..."
2,A,4c1cb0e95b99f72d55c068ba0255c54d,"{'question_concept': 'choker', 'choices': [{'l..."
3,D,02e821a3e53cb320790950aab4489e85,"{'question_concept': 'highway', 'choices': [{'..."
4,C,23505889b94e880c3e89cff4ba119860,"{'question_concept': 'fox', 'choices': [{'labe..."
...,...,...,...
9736,E,f1b2a30a1facff543e055231c5f90dd0,"{'question_concept': 'going public', 'choices'..."
9737,D,a63b4d0c0b34d6e5f5ce7b2c2c08b825,"{'question_concept': 'chair', 'choices': [{'la..."
9738,A,22d0eea15e10be56024fd00bb0e4f72f,"{'question_concept': 'jeans', 'choices': [{'la..."
9739,A,7c55160a4630de9690eb328b57a18dc2,"{'question_concept': 'well', 'choices': [{'lab..."


In [4]:
#the 'id' column is not needed, so keep every column except it
df_CSQA = df_CSQA.drop(columns=['id'])

In [5]:
#count the missing values in each column
df_CSQA.isnull().sum()

answerKey    0
question     0
dtype: int64

In [6]:
#look at one raw entry of the 'question' column (row label 1)
df_CSQA.at[1, 'question']

{'question_concept': 'people',
 'choices': [{'label': 'A', 'text': 'race track'},
  {'label': 'B', 'text': 'populated areas'},
  {'label': 'C', 'text': 'the desert'},
  {'label': 'D', 'text': 'apartment'},
  {'label': 'E', 'text': 'roadblock'}],
 'stem': 'Sammy wanted to go to where the people were.  Where might he go?'}

In [7]:
#work on an independent (deep) copy so the loaded frame stays untouched
df_test = df_CSQA.copy(deep=True)

In [8]:
#add empty (None-filled) columns for the question concept, the question text (stem) and the choices
columns = ['Concept', 'Question', 'Choices']
df_test = df_test.assign(**dict.fromkeys(columns))

In [9]:
#the helper list of new column names is no longer needed
del columns

In [10]:
#see how it went: preview the first rows with the new, still empty columns
df_test.head()

Unnamed: 0,answerKey,question,Concept,Question,Choices
0,A,"{'question_concept': 'punishing', 'choices': [...",,,
1,B,"{'question_concept': 'people', 'choices': [{'l...",,,
2,A,"{'question_concept': 'choker', 'choices': [{'l...",,,
3,D,"{'question_concept': 'highway', 'choices': [{'...",,,
4,C,"{'question_concept': 'fox', 'choices': [{'labe...",,,


In [11]:
#populate each new column with the values inside the 'question' dictionaries
#start with Concept
#Series.map visits every dictionary once and writes the whole column in one step,
#instead of iterating with iterrows and setting one cell at a time through .loc
df_test['Concept'] = df_test['question'].map(lambda question: question['question_concept'])

In [12]:
#check if this worked: Concept should now be filled in for every previewed row
df_test.head()

Unnamed: 0,answerKey,question,Concept,Question,Choices
0,A,"{'question_concept': 'punishing', 'choices': [...",punishing,,
1,B,"{'question_concept': 'people', 'choices': [{'l...",people,,
2,A,"{'question_concept': 'choker', 'choices': [{'l...",choker,,
3,D,"{'question_concept': 'highway', 'choices': [{'...",highway,,
4,C,"{'question_concept': 'fox', 'choices': [{'labe...",fox,,


In [13]:
#since it worked, we do the same for the other 2 columns
#each column is built in a single vectorized pass over the 'question' dictionaries:
#'stem' holds the question text and 'choices' the list of answer options
df_test['Question'] = df_test['question'].map(lambda question: question['stem'])
df_test['Choices'] = df_test['question'].map(lambda question: question['choices'])

In [14]:
#check again: Question and Choices should now be filled in as well
df_test.head()

Unnamed: 0,answerKey,question,Concept,Question,Choices
0,A,"{'question_concept': 'punishing', 'choices': [...",punishing,The sanctions against the school were a punish...,"[{'label': 'A', 'text': 'ignore'}, {'label': '..."
1,B,"{'question_concept': 'people', 'choices': [{'l...",people,Sammy wanted to go to where the people were. ...,"[{'label': 'A', 'text': 'race track'}, {'label..."
2,A,"{'question_concept': 'choker', 'choices': [{'l...",choker,To locate a choker not located in a jewelry bo...,"[{'label': 'A', 'text': 'jewelry store'}, {'la..."
3,D,"{'question_concept': 'highway', 'choices': [{'...",highway,Google Maps and other highway and street GPS s...,"[{'label': 'A', 'text': 'united states'}, {'la..."
4,C,"{'question_concept': 'fox', 'choices': [{'labe...",fox,"The fox walked from the city into the forest, ...","[{'label': 'A', 'text': 'pretty flowers.'}, {'..."


In [15]:
#look at one entry of the 'Choices' column (row label 1)
df_test.at[1, 'Choices']

[{'label': 'A', 'text': 'race track'},
 {'label': 'B', 'text': 'populated areas'},
 {'label': 'C', 'text': 'the desert'},
 {'label': 'D', 'text': 'apartment'},
 {'label': 'E', 'text': 'roadblock'}]

In [16]:
#the raw 'question' dictionaries are no longer needed, so keep every column except it
df_test = df_test.drop(columns=['question'])

In [17]:
#add one empty (None-filled) column per answer option
columns_choices = list('ABCDE')
df_test = df_test.assign(**dict.fromkeys(columns_choices))

In [18]:
#the helper list of option column names is no longer needed
del columns_choices

In [19]:
#assign each option to the right column: A, B, C, D or E
#every choice is matched by its own 'label' rather than by its position in the list,
#so a text always lands in the column named after its label
option_labels = ['A', 'B', 'C', 'D', 'E']
choice_texts = pd.DataFrame(
    [{choice['label']: choice['text'] for choice in choices} for choices in df_test['Choices']],
    index=df_test.index,
    columns=option_labels,
)
#a label that is missing from a record is left as NaN, which the null check on df_test will reveal
for label in option_labels:
    df_test[label] = choice_texts[label]

In [20]:
#check if everything worked as planned: columns A-E should hold the option texts
df_test.head()

Unnamed: 0,answerKey,Concept,Question,Choices,A,B,C,D,E
0,A,punishing,The sanctions against the school were a punish...,"[{'label': 'A', 'text': 'ignore'}, {'label': '...",ignore,enforce,authoritarian,yell at,avoid
1,B,people,Sammy wanted to go to where the people were. ...,"[{'label': 'A', 'text': 'race track'}, {'label...",race track,populated areas,the desert,apartment,roadblock
2,A,choker,To locate a choker not located in a jewelry bo...,"[{'label': 'A', 'text': 'jewelry store'}, {'la...",jewelry store,neck,jewlery box,jewelry box,boutique
3,D,highway,Google Maps and other highway and street GPS s...,"[{'label': 'A', 'text': 'united states'}, {'la...",united states,mexico,countryside,atlas,oceans
4,C,fox,"The fox walked from the city into the forest, ...","[{'label': 'A', 'text': 'pretty flowers.'}, {'...",pretty flowers.,hen house,natural habitat,storybook,dense forest


In [22]:
#number of repeated questions, followed by the missing values per column
(df_test['Question'].duplicated().sum(), df_test.isnull().sum())

(0,
 answerKey    0
 Concept      0
 Question     0
 A            0
 B            0
 C            0
 D            0
 E            0
 dtype: int64)

In [21]:
#the options now live in columns A-E, so keep every column except 'Choices'
df_test = df_test.drop(columns=['Choices'])

In [23]:
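#change 1st letter of Concept values to upper case - edit: not needed, so the line below stays disabled
#note: the Concept outputs shown below are capitalized, i.e. they were produced while this line was still active
#df_test['Concept'] = df_test['Concept'].str.capitalize()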

In [24]:
#give the answer key column a more readable name: 'Correct Option'
df_test = df_test.rename(columns={'answerKey': 'Correct Option'})

In [25]:
#how often does each concept occur (most frequent first)
concept_counts = df_test['Concept'].value_counts()
concept_counts

Person              277
People              191
Human                83
Water                48
Cat                  44
                   ... 
Wing                  1
Sports                1
Route                 1
Commit to memory      1
Puzzle                1
Name: Concept, Length: 2151, dtype: int64

In [26]:
#print every distinct concept once, in order of first appearance
for concept in df_test['Concept'].drop_duplicates():
    print(concept)

Punishing
People
Choker
Highway
Fox
Cable
Drawstring bag
Mold
Fountain pen
Restaurant
Grape
Getting divorce
Bench
Cooling off
House
Marsh
Grill
Illness
Pizzeria
Eating
Playing soccer
Back
Connection
Accelerator
Lying
Run errands
Rest
Snake
Stabbing to death
Dust
Express information
Canal
Camper
Paper
Thick
Sun
Seafood restaurant
President
Watching tv
Bald eagle
Free
Driving
Eating dinner
Chatting with friends
Sunshine
Skiing
Distance
Eating lunch
Mandatory
Person
Learning language
Killing people
Fiddle
Exhaustion
Money
Drugs
Beaver
Booze
Fungus
Attending school
Cut
Sex
Ball
Head
Call
Bookshop
Advertisement
Playing tennis
Toll road
Clown
Reduce
Losing consciousness
Meeting people
Metal rod
Communicating
Dog
Book
Pencils
Buying products
Getting paid
Possible
Meat
Cello
Bee
Getting drunk
Listening
Dream
Garden
Cat
Losing weight
Skin
Doing housework
Calculator
Waiter
Glad
Hear testimony
Temple
Folding chair
Geese
Punching
Small dog
Death
Staying in bed
Lizard
Food
Heart
Alcohol
Jar
Blowfis

Keyboard instrument
Fact
Bell
Address label
Calendar
Bookshelf
Corn
Dead body
Pretending
Homeowner
Conductor
Brick row house
Bus
Cup
Riding horse
Flawless
Row
Louisiana
Knowledge
Go on stage
Credit card
Shades
Apple
Cleaning house
Socks
Courthouse
Wheel
Valley
Bowl
Mirror
Die
Bath
Medicine
Socialising
Projectile ball
Playing basketball
Car park
Pillowcase
Going to work
Newspapers
Danger
Choose
Foot
Mine
Frogs
Chain
Make bread
Go to work
Escape
Drill
Side
Harp
Town
Turn
Apartment
Guns
Nerve
Wrestle
Smart
Bass
Parking area
Live
Continent
Buy house
Antibiotic
Sleep
Courtyard
Ceiling
Forest
Passengers
Plane
String
Oven
Bugle
Rocket launcher
Answer
Round brush
Corporeal
Eloquent
Terrace
Showroom
Live life
Below
Hair gel
Parent
Bigger
Positive
Anybody
Clean clothes
Washing machine
Chocolate
Eat ice cream
Taking final exams
Taking break
Drinks
Fast food restaurant
Slope
Referee
Eat hamburger
Crossing street
Kissing
Numbers
Car keys
Skate
Dirty
High rise
Laser
Write
Programs
Toothpaste
Bum
Pla

In [27]:
#number of distinct concepts
df_test['Concept'].unique().size

2151

In [28]:
#the concept column is not useful for our purposes, so keep every column except it
df_test = df_test.drop(columns=['Concept'])

In [29]:
#check changes: preview the first rows of the restructured frame
df_test.head()

Unnamed: 0,Correct Option,Question,A,B,C,D,E
0,A,The sanctions against the school were a punish...,ignore,enforce,authoritarian,yell at,avoid
1,B,Sammy wanted to go to where the people were. ...,race track,populated areas,the desert,apartment,roadblock
2,A,To locate a choker not located in a jewelry bo...,jewelry store,neck,jewlery box,jewelry box,boutique
3,D,Google Maps and other highway and street GPS s...,united states,mexico,countryside,atlas,oceans
4,C,"The fox walked from the city into the forest, ...",pretty flowers.,hen house,natural habitat,storybook,dense forest


In [30]:
#everything is alright, so store the result as an independent (deep) copy under its final name
ComSenQA_train = df_test.copy(deep=True)

In [31]:
#df_test was copied into ComSenQA_train, so the working copy can be removed
del df_test

In [32]:
#reorder columns: question first, then options A-E, answer key last
new_order = ['Question', *'ABCDE', 'Correct Option']
ComSenQA_train = ComSenQA_train.loc[:, new_order]

In [33]:
#check the new column order on the first rows
ComSenQA_train.head()

Unnamed: 0,Question,A,B,C,D,E,Correct Option
0,The sanctions against the school were a punish...,ignore,enforce,authoritarian,yell at,avoid,A
1,Sammy wanted to go to where the people were. ...,race track,populated areas,the desert,apartment,roadblock,B
2,To locate a choker not located in a jewelry bo...,jewelry store,neck,jewlery box,jewelry box,boutique,A
3,Google Maps and other highway and street GPS s...,united states,mexico,countryside,atlas,oceans,D
4,"The fox walked from the city into the forest, ...",pretty flowers.,hen house,natural habitat,storybook,dense forest,C


In [34]:
#reset the index to the default integer index, discarding the old one
ComSenQA_train=ComSenQA_train.reset_index(drop=True)
#optional shift to a 1-based index - currently disabled
#NOTE(review): the previews displayed further down show an index starting at 1, so they
#appear to come from a run where this line was active - confirm which one is intended
#ComSenQA_train.index=ComSenQA_train.index + 1

In [35]:
#check the index on the first rows
ComSenQA_train.head()

Unnamed: 0,Question,A,B,C,D,E,Correct Option
1,The sanctions against the school were a punish...,ignore,enforce,authoritarian,yell at,avoid,A
2,Sammy wanted to go to where the people were. ...,race track,populated areas,the desert,apartment,roadblock,B
3,To locate a choker not located in a jewelry bo...,jewelry store,neck,jewlery box,jewelry box,boutique,A
4,Google Maps and other highway and street GPS s...,united states,mexico,countryside,atlas,oceans,D
5,"The fox walked from the city into the forest, ...",pretty flowers.,hen house,natural habitat,storybook,dense forest,C


In [36]:
#build the question-only dataset: question text plus options, without the correct answer
question_columns = ['Question', 'A', 'B', 'C', 'D', 'E']
ComSenQA_questions_final = ComSenQA_train.loc[:, question_columns].copy()

In [37]:
#check that the question-only dataset has no 'Correct Option' column
ComSenQA_questions_final.head()

Unnamed: 0,Question,A,B,C,D,E
1,The sanctions against the school were a punish...,ignore,enforce,authoritarian,yell at,avoid
2,Sammy wanted to go to where the people were. ...,race track,populated areas,the desert,apartment,roadblock
3,To locate a choker not located in a jewelry bo...,jewelry store,neck,jewlery box,jewelry box,boutique
4,Google Maps and other highway and street GPS s...,united states,mexico,countryside,atlas,oceans
5,"The fox walked from the city into the forest, ...",pretty flowers.,hen house,natural habitat,storybook,dense forest


In [38]:
#export these DataFrames to CSV (disabled - uncomment the lines below to write the files)
#ComSenQA_train.to_csv('ComSenQA_train.csv',index=False)
#ComSenQA_questions_final.to_csv('ComSenQA_questions_final.csv',index=False)