# SciQ Dataset After Categories Were Added

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings

# Ignore all warnings for the rest of the session.
# NOTE(review): this blanket filter may also hide pandas' SettingWithCopy /
# chained-assignment warnings triggered by the in-place column edits further
# down -- confirm that silencing those is intended.
warnings.filterwarnings("ignore")

## Clean Dataset

In [2]:
#path of the workbook that holds the SciQ training questions with their Sub-Branch labels
excel_file_path = 'SciQ_train_categories.xlsx'

#load the workbook into the working DataFrame
SciQ_train = pd.read_excel(io=excel_file_path)

In [3]:
SciQ_train

Unnamed: 0,I,Sub-Branch,Question,Correct Option,A,B,C,D
0,1,Microbiology,What type of organism is commonly used in prep...,C,gymnosperms,protozoa,mesophilic organisms,viruses
1,2,Atmospheric Science,What phenomenon makes global winds blow northe...,B,centrifugal effect,coriolis effect,muon effect,tropical effect
2,3,Thermodynamics,Changes from a less-ordered state to a more-or...,C,reactive,unbalanced,exothermic,endothermic
3,4,Nuclear Physics,What is the least dangerous radioactive decay?,B,gamma decay,alpha decay,beta decay,zeta decay
4,5,Volcanology,Kilauea in hawaii is the world’s most continuo...,C,carbon and smog,greenhouse gases,smoke and ash,magma
...,...,...,...,...,...,...,...,...
11674,11675,Biochemistry,The enzyme pepsin plays an important role in t...,D,protons,proteins,lipids,peptides
11675,11676,Nuclear Physics,What remains a constant of radioactive substan...,C,volatility,temperature,rate of decay,acidity
11676,11677,Ecology,"Terrestrial ecosystems, also known for their d...",D,monomes,bisomes,substrates,biomes
11677,11678,empty,High explosives create shock waves that exceed...,B,ion speed,supersonic,light speed,turbulence


In [4]:
#check duplicate questions
SciQ_train['Question'].duplicated().sum()

70

In [5]:
#remove duplicate questions: only the first occurrence of each question text is kept
SciQ_train = SciQ_train.drop_duplicates(subset='Question', keep='first')

In [6]:
#verify
SciQ_train['Question'].duplicated().sum()

0

In [7]:
#drop the 'I' column (a 1-based row counter in the loaded data) and renumber the rows 0..n-1
#errors='ignore' keeps the cell re-runnable: once 'I' has been removed, running the
#cell again is a no-op instead of raising KeyError
SciQ_train = (
    SciQ_train
    .drop(columns=['I'], errors='ignore')
    .reset_index(drop=True)
)

In [10]:
SciQ_train #inspect the frame after the 'I' column was dropped and the index reset

Unnamed: 0,Sub-Branch,Question,Correct Option,A,B,C,D
0,Microbiology,What type of organism is commonly used in prep...,C,gymnosperms,protozoa,mesophilic organisms,viruses
1,Atmospheric Science,What phenomenon makes global winds blow northe...,B,centrifugal effect,coriolis effect,muon effect,tropical effect
2,Thermodynamics,Changes from a less-ordered state to a more-or...,C,reactive,unbalanced,exothermic,endothermic
3,Nuclear Physics,What is the least dangerous radioactive decay?,B,gamma decay,alpha decay,beta decay,zeta decay
4,Volcanology,Kilauea in hawaii is the world’s most continuo...,C,carbon and smog,greenhouse gases,smoke and ash,magma
...,...,...,...,...,...,...,...
11604,Biochemistry,The enzyme pepsin plays an important role in t...,D,protons,proteins,lipids,peptides
11605,Nuclear Physics,What remains a constant of radioactive substan...,C,volatility,temperature,rate of decay,acidity
11606,Ecology,"Terrestrial ecosystems, also known for their d...",D,monomes,bisomes,substrates,biomes
11607,empty,High explosives create shock waves that exceed...,B,ion speed,supersonic,light speed,turbulence


In [11]:
#check for 'empty' sub-branch 
(SciQ_train['Sub-Branch'] == 'empty').sum()

1400

In [12]:
#delete rows where Sub-Branch is 'empty'
#.copy() turns the filtered result into an independent frame, so the column edits
#made in later cells act on SciQ_train itself and not on a slice of the previous frame
SciQ_train = SciQ_train.loc[SciQ_train['Sub-Branch'] != 'empty'].copy()

In [13]:
SciQ_train

Unnamed: 0,Sub-Branch,Question,Correct Option,A,B,C,D
0,Microbiology,What type of organism is commonly used in prep...,C,gymnosperms,protozoa,mesophilic organisms,viruses
1,Atmospheric Science,What phenomenon makes global winds blow northe...,B,centrifugal effect,coriolis effect,muon effect,tropical effect
2,Thermodynamics,Changes from a less-ordered state to a more-or...,C,reactive,unbalanced,exothermic,endothermic
3,Nuclear Physics,What is the least dangerous radioactive decay?,B,gamma decay,alpha decay,beta decay,zeta decay
4,Volcanology,Kilauea in hawaii is the world’s most continuo...,C,carbon and smog,greenhouse gases,smoke and ash,magma
...,...,...,...,...,...,...,...
11603,Biochemistry,What is the major intracellular cation?,B,magnesium,potassium,sodium,glucose
11604,Biochemistry,The enzyme pepsin plays an important role in t...,D,protons,proteins,lipids,peptides
11605,Nuclear Physics,What remains a constant of radioactive substan...,C,volatility,temperature,rate of decay,acidity
11606,Ecology,"Terrestrial ecosystems, also known for their d...",D,monomes,bisomes,substrates,biomes


In [14]:
#check if it was done correctly
(SciQ_train['Sub-Branch'] == 'empty').sum()

0

In [15]:
SciQ_train['Sub-Branch'].isna().sum()

0

### Create column with Main Branch

In [16]:
#first: see all the sub branches and check for errors
SciQ_train['Sub-Branch'].unique()

array(['Microbiology', 'Atmospheric Science', 'Thermodynamics',
       'Nuclear Physics', 'Volcanology', 'Astronomy', 'Chemistry',
       'Evolutionary Biology', 'Botany', 'Physical Chemistry', 'Genetics',
       'Nutrition', 'Biochemistry', 'Inorganic Chemistry',
       'Anatomy/Physiology', 'Reproductive Biology',
       'Environmental Science', 'Scientific Method',
       'Molecular Chemistry', 'Geology', 'Zoology', 'Ecology',
       'Cell Biology', 'Agroecology', 'Mechanics', 'Optics', 'Geography',
       'Paleontology', 'Electricity', 'Organic Chemistry',
       'Molecular Biology', 'Health Science', 'Protistology', 'Taxonomy',
       'Sedimentology', 'Oceanography', 'Atomic Physics',
       'Electromagnetism', 'Medicine', 'Analytical Chemistry',
       'Neuroscience', 'Mineralogy', 'Particle Physics', 'Marine Biology',
       'Developmental Biology', 'Ornithology', 'Seismology',
       'Mycobiology', 'Electricity ', 'Mycology', 'Wave Mechanics',
       'Mathematics', 'Acoustics',

In [17]:
#second: check how many values each sub branch has
SciQ_train['Sub-Branch'].value_counts()

Anatomy/Physiology       1148
Inorganic Chemistry       861
Cell Biology              670
Biochemistry              580
Zoology                   567
Ecology                   567
Botany                    479
Genetics                  413
Geology                   369
Physical Chemistry        354
Mechanics                 303
Organic Chemistry         276
Astronomy                 274
Reproductive Biology      274
Thermodynamics            256
Geography                 250
Neuroscience              243
Electromagnetism          198
Microbiology              187
Evolutionary Biology      187
Atmospheric Science       177
Marine Biology            172
Environmental Science     140
Nuclear Physics           140
Optics                    126
Atomic Physics            121
Oceanography               93
Wave Mechanics             90
Scientific Method          89
Developmental Biology      69
Volcanology                63
Mycology                   61
Paleontology               58
Seismology

In [18]:
#third: fix errors and aggregate branches
#every label on the left is folded into the branch on the right
_sub_branch_merges = {
    #put ornithology in zoology
    'Ornithology': 'Zoology',
    #put protistology in microbiology
    'Protistology': 'Microbiology',
    #put sedimentology, volcanology, paleontology (and its misspelling),
    #seismology and mineralogy in geology
    'Sedimentology': 'Geology',
    'Volcanology': 'Geology',
    'Paleontology': 'Geology',
    'Paleaontology': 'Geology',
    'Seismology': 'Geology',
    'Mineralogy': 'Geology',
    #put mycology and its misspellings (mycobiology, mricobiology) in microbiology
    'Mycology': 'Microbiology',
    'Mycobiology': 'Microbiology',
    'Mricobiology': 'Microbiology',
    #put wave mechanics in acoustics
    'Wave Mechanics': 'Acoustics',
}

#the replaced column is assigned back to the frame: calling
#SciQ_train['Sub-Branch'].replace(..., inplace=True) is a chained in-place edit that
#only touches a temporary Series when pandas Copy-on-Write is active, leaving the
#frame unchanged without any error
SciQ_train = SciQ_train.assign(
    **{'Sub-Branch': SciQ_train['Sub-Branch'].replace(_sub_branch_merges)}
)

#NOTE(review): the unique() output above lists 'Electricity ' (trailing space) next to
#'Electricity'; the two are still counted as different labels here -- decide whether
#they should be merged before the low-count filter is applied

In [19]:
#join Nuclear and Particle Physics because they are taught together
#the result is assigned back instead of using replace(..., inplace=True) on the
#selected column, which does not reach the frame when pandas Copy-on-Write is active
SciQ_train = SciQ_train.assign(
    **{'Sub-Branch': SciQ_train['Sub-Branch'].replace(
        ['Nuclear Physics', 'Particle Physics'], 'Nuclear and Particle Physics')}
)


In [20]:
#correct the misspelled labels 'Nueroscience' and 'Atronomy'
#the corrected column is assigned back instead of using replace(..., inplace=True) on
#the selected column, which does not reach the frame when pandas Copy-on-Write is active
SciQ_train = SciQ_train.assign(
    **{'Sub-Branch': SciQ_train['Sub-Branch'].replace(
        {'Nueroscience': 'Neuroscience', 'Atronomy': 'Astronomy'})}
)

In [21]:
#fourth: how many of each value do we have?
SciQ_train['Sub-Branch'].value_counts()

Anatomy/Physiology              1148
Inorganic Chemistry              861
Cell Biology                     670
Geology                          595
Biochemistry                     580
Zoology                          569
Ecology                          567
Botany                           479
Genetics                         413
Physical Chemistry               354
Mechanics                        303
Organic Chemistry                276
Astronomy                        275
Reproductive Biology             274
Thermodynamics                   256
Microbiology                     255
Geography                        250
Neuroscience                     244
Electromagnetism                 198
Nuclear and Particle Physics     187
Evolutionary Biology             187
Atmospheric Science              177
Marine Biology                   172
Environmental Science            140
Acoustics                        132
Optics                           126
Atomic Physics                   121
O

In [22]:
#fifth: remove categories with low count
#a sub-branch is kept only when it has more than 90 questions

#number of questions per sub-branch
value_counts = SciQ_train['Sub-Branch'].value_counts()

#labels of the sub-branches whose count is above 90
valid_values = value_counts.loc[value_counts.gt(90)].index

#trial result stored under a separate name; SciQ_train is not reassigned in this cell
SciQ_filter = SciQ_train.loc[SciQ_train['Sub-Branch'].isin(valid_values)]

In [23]:
SciQ_filter['Sub-Branch'].value_counts()

Anatomy/Physiology              1148
Inorganic Chemistry              861
Cell Biology                     670
Geology                          595
Biochemistry                     580
Zoology                          569
Ecology                          567
Botany                           479
Genetics                         413
Physical Chemistry               354
Mechanics                        303
Organic Chemistry                276
Astronomy                        275
Reproductive Biology             274
Thermodynamics                   256
Microbiology                     255
Geography                        250
Neuroscience                     244
Electromagnetism                 198
Evolutionary Biology             187
Nuclear and Particle Physics     187
Atmospheric Science              177
Marine Biology                   172
Environmental Science            140
Acoustics                        132
Optics                           126
Atomic Physics                   121
O

In [24]:
#since it worked, apply the same filter to the original df
#.copy() makes the filtered result an independent frame, so the 'Main Branch' column
#added in a later cell is written to SciQ_train itself and not to a slice of the
#unfiltered frame
SciQ_train = SciQ_train[SciQ_train['Sub-Branch'].isin(valid_values)].copy()

In [25]:
#testing
SciQ_train['Sub-Branch'].value_counts()

Anatomy/Physiology              1148
Inorganic Chemistry              861
Cell Biology                     670
Geology                          595
Biochemistry                     580
Zoology                          569
Ecology                          567
Botany                           479
Genetics                         413
Physical Chemistry               354
Mechanics                        303
Organic Chemistry                276
Astronomy                        275
Reproductive Biology             274
Thermodynamics                   256
Microbiology                     255
Geography                        250
Neuroscience                     244
Electromagnetism                 198
Evolutionary Biology             187
Nuclear and Particle Physics     187
Atmospheric Science              177
Marine Biology                   172
Environmental Science            140
Acoustics                        132
Optics                           126
Atomic Physics                   121
O

In [26]:
SciQ_train

Unnamed: 0,Sub-Branch,Question,Correct Option,A,B,C,D
0,Microbiology,What type of organism is commonly used in prep...,C,gymnosperms,protozoa,mesophilic organisms,viruses
1,Atmospheric Science,What phenomenon makes global winds blow northe...,B,centrifugal effect,coriolis effect,muon effect,tropical effect
2,Thermodynamics,Changes from a less-ordered state to a more-or...,C,reactive,unbalanced,exothermic,endothermic
3,Nuclear and Particle Physics,What is the least dangerous radioactive decay?,B,gamma decay,alpha decay,beta decay,zeta decay
4,Geology,Kilauea in hawaii is the world’s most continuo...,C,carbon and smog,greenhouse gases,smoke and ash,magma
...,...,...,...,...,...,...,...
11603,Biochemistry,What is the major intracellular cation?,B,magnesium,potassium,sodium,glucose
11604,Biochemistry,The enzyme pepsin plays an important role in t...,D,protons,proteins,lipids,peptides
11605,Nuclear and Particle Physics,What remains a constant of radioactive substan...,C,volatility,temperature,rate of decay,acidity
11606,Ecology,"Terrestrial ecosystems, also known for their d...",D,monomes,bisomes,substrates,biomes


In [27]:
#next: make lists with the sub-branches of the same main branch
#there will be 5 main branches: chemistry, physics, biology, earth sciences and environmental science
chem = [
    'Inorganic Chemistry',
    'Physical Chemistry',
    'Organic Chemistry',
]
physics = [
    'Mechanics',
    'Astronomy',
    'Thermodynamics',
    'Electromagnetism',
    'Nuclear and Particle Physics',
    'Acoustics',
    'Optics',
    'Atomic Physics',
]
biology = [
    'Anatomy/Physiology',
    'Cell Biology',
    'Biochemistry',
    'Zoology',
    'Ecology',
    'Botany',
    'Genetics',
    'Reproductive Biology',
    'Neuroscience',
    'Microbiology',
    'Evolutionary Biology',
    'Marine Biology',
]
earth_science = [
    'Geology',
    'Geography',
    'Atmospheric Science',
    'Oceanography',
]
environm_science = [
    'Environmental Science',
]

In [28]:
#check if everything is listed
#all values of the 5 lists created above, in alphabetical order
mega_list = sorted(chem + physics + biology + earth_science + environm_science)

#distinct values of the Sub-Branch column, also in alphabetical order
sub_branch = sorted(SciQ_train['Sub-Branch'].unique().tolist())

In [29]:
#now compare the two
sub_branch==mega_list 

True

In [30]:
len(sub_branch)

28

In [31]:
#the two comparison lists are no longer needed, so remove them from the namespace
del mega_list, sub_branch

In [32]:
#Function to determine values for future column Main Branch based on column Sub-Branch
def assign_category(value):
    """Return the main branch that the sub-branch label ``value`` belongs to.

    The label is looked up in the module-level lists ``chem``, ``biology``,
    ``physics``, ``earth_science`` and ``environm_science``, in that order.

    Parameters
    ----------
    value : object
        A single entry of the 'Sub-Branch' column.

    Returns
    -------
    str
        One of 'Chemistry', 'Biology', 'Physics', 'Earth Sciences' or
        'Environmental Science'.

    Raises
    ------
    ValueError
        If ``value`` is in none of the five lists. Previously every such value
        (for example a misspelled or newly added sub-branch) fell through to
        'Environmental Science' and was mislabelled without any error.
    """
    #(main branch, member sub-branches), checked in the original order
    branch_members = (
        ('Chemistry', chem),
        ('Biology', biology),
        ('Physics', physics),
        ('Earth Sciences', earth_science),
        ('Environmental Science', environm_science),
    )
    for main_branch, members in branch_members:
        if value in members:
            return main_branch
    raise ValueError(f"Sub-Branch {value!r} is not assigned to any main branch")

In [33]:
#apply function to dataframe
#assign() returns a new frame with 'Main Branch' appended as the last column; this
#avoids writing through .loc into a frame that was produced by boolean filtering
SciQ_train = SciQ_train.assign(
    **{'Main Branch': SciQ_train['Sub-Branch'].apply(assign_category)}
)

In [34]:
SciQ_train

Unnamed: 0,Sub-Branch,Question,Correct Option,A,B,C,D,Main Branch
0,Microbiology,What type of organism is commonly used in prep...,C,gymnosperms,protozoa,mesophilic organisms,viruses,Biology
1,Atmospheric Science,What phenomenon makes global winds blow northe...,B,centrifugal effect,coriolis effect,muon effect,tropical effect,Earth Sciences
2,Thermodynamics,Changes from a less-ordered state to a more-or...,C,reactive,unbalanced,exothermic,endothermic,Physics
3,Nuclear and Particle Physics,What is the least dangerous radioactive decay?,B,gamma decay,alpha decay,beta decay,zeta decay,Physics
4,Geology,Kilauea in hawaii is the world’s most continuo...,C,carbon and smog,greenhouse gases,smoke and ash,magma,Earth Sciences
...,...,...,...,...,...,...,...,...
11603,Biochemistry,What is the major intracellular cation?,B,magnesium,potassium,sodium,glucose,Biology
11604,Biochemistry,The enzyme pepsin plays an important role in t...,D,protons,proteins,lipids,peptides,Biology
11605,Nuclear and Particle Physics,What remains a constant of radioactive substan...,C,volatility,temperature,rate of decay,acidity,Physics
11606,Ecology,"Terrestrial ecosystems, also known for their d...",D,monomes,bisomes,substrates,biomes,Biology


In [35]:
#column order wanted in the final table
desired_order = [
    'Question',
    'A', 'B', 'C', 'D',
    'Correct Option',
    'Main Branch',
    'Sub-Branch',
]
#select the columns in that order
SciQ_train = SciQ_train[desired_order]

In [36]:
#renumber the rows 0..n-1, then shift by one so the first question is number 1
SciQ_train = SciQ_train.reset_index(drop=True)
SciQ_train.index += 1

In [37]:
SciQ_train

Unnamed: 0,Question,A,B,C,D,Correct Option,Main Branch,Sub-Branch
1,What type of organism is commonly used in prep...,gymnosperms,protozoa,mesophilic organisms,viruses,C,Biology,Microbiology
2,What phenomenon makes global winds blow northe...,centrifugal effect,coriolis effect,muon effect,tropical effect,B,Earth Sciences,Atmospheric Science
3,Changes from a less-ordered state to a more-or...,reactive,unbalanced,exothermic,endothermic,C,Physics,Thermodynamics
4,What is the least dangerous radioactive decay?,gamma decay,alpha decay,beta decay,zeta decay,B,Physics,Nuclear and Particle Physics
5,Kilauea in hawaii is the world’s most continuo...,carbon and smog,greenhouse gases,smoke and ash,magma,C,Earth Sciences,Geology
...,...,...,...,...,...,...,...,...
9898,What is the major intracellular cation?,magnesium,potassium,sodium,glucose,B,Biology,Biochemistry
9899,The enzyme pepsin plays an important role in t...,protons,proteins,lipids,peptides,D,Biology,Biochemistry
9900,What remains a constant of radioactive substan...,volatility,temperature,rate of decay,acidity,C,Physics,Nuclear and Particle Physics
9901,"Terrestrial ecosystems, also known for their d...",monomes,bisomes,substrates,biomes,D,Biology,Ecology


In [38]:
#create dataframe with only questions and options (no right answer and no branches)
wanted_columns = ['Question'] + ['A', 'B', 'C', 'D']
#new df restricted to the wanted columns
SciQ_questions_final = SciQ_train.loc[:, wanted_columns]

In [39]:
#check results
SciQ_questions_final

Unnamed: 0,Question,A,B,C,D
1,What type of organism is commonly used in prep...,gymnosperms,protozoa,mesophilic organisms,viruses
2,What phenomenon makes global winds blow northe...,centrifugal effect,coriolis effect,muon effect,tropical effect
3,Changes from a less-ordered state to a more-or...,reactive,unbalanced,exothermic,endothermic
4,What is the least dangerous radioactive decay?,gamma decay,alpha decay,beta decay,zeta decay
5,Kilauea in hawaii is the world’s most continuo...,carbon and smog,greenhouse gases,smoke and ash,magma
...,...,...,...,...,...
9898,What is the major intracellular cation?,magnesium,potassium,sodium,glucose
9899,The enzyme pepsin plays an important role in t...,protons,proteins,lipids,peptides
9900,What remains a constant of radioactive substan...,volatility,temperature,rate of decay,acidity
9901,"Terrestrial ecosystems, also known for their d...",monomes,bisomes,substrates,biomes


## Export resulting Dataframes

In [40]:
#export the dataframes as csv files
#NOTE(review): both calls below are commented out, so running this cell writes nothing.
#If they are re-enabled, index=False leaves the 1-based question numbering set on
#SciQ_train out of the CSV files -- confirm that this is intended.
#SciQ_train.to_csv('SciQ_train.csv',index=False)
#SciQ_questions_final.to_csv('SciQ_questions_final.csv',index=False)