In [1]:
import pandas as pd

In [2]:
# Read the three input CSVs in order: the 2015-2023 MCAS file, the
# question-level KC result file, and the coherence-map table.
kc_df_data_mcas, kc_df_data_gms8k, notation_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'api/common_core_data/Data/2015-2023-MCAS-3-6.csv',
        'api/common_core_data/Test_Result/KC_question_level_gpt_35_turbo.csv',
        'api/common_core_data/Knowledge_Base/Coherence_map_normal.csv',
    )
)

In [3]:
# Distinct 'Subject' values among the rows whose 'Grade' equals 3.
notation_df.loc[notation_df['Grade'].eq(3), 'Subject'].unique()

array(['Operations and Algebraic Thinking',
       'Number and Operations in Base Ten',
       'Number and Operations-Fractions', 'Measurement and Data',
       'Geometry'], dtype=object)

In [4]:
# Missing grades become 0 so the whole column can be stored as integers;
# the vectorised cast replaces a per-element `int(x)` lambda.
kc_df_data_gms8k['Grade applied'] = kc_df_data_gms8k['Grade applied'].fillna(0).astype(int)

In [5]:
# First five rows whose 'Grade applied' is 3, 4, 5 or 6, limited to the
# four columns shared with the MCAS frame.
kc_df_data_gms8k.loc[
    kc_df_data_gms8k['Grade applied'].isin([3, 4, 5, 6]),
    ['Question', 'Grade applied', 'KCs applied', 'Source'],
].head(5)

Unnamed: 0,Question,Grade applied,KCs applied,Source
0,Natalia sold clips to 48 of her friends in Apr...,3,"['3.OA.A.3', '2.OA.A.1']",GSM8K-TRAIN
1,Weng earns $12 an hour for babysitting. Yester...,4,"['4.NF.C.6', '4.NF.B.4.c', '4.MD.A.2']",GSM8K-TRAIN
2,Betty is saving money for a new wallet which c...,3,"['3.OA.A.3', '2.NBT.B.5']",GSM8K-TRAIN
3,"Julie is reading a 120-page book. Yesterday, s...",3,"['3.OA.A.3', '2.OA.A.1', '3.NBT.A.2']",GSM8K-TRAIN
4,James writes a 3-page letter to 2 different fr...,4,"['3.OA.A.3', '4.NBT.B.5']",GSM8K-TRAIN


In [6]:
# Tag every MCAS row with its origin.
kc_df_data_mcas['Source'] = 'MCAS'
# Rename 'KC applied' to 'KCs applied' so the column name matches the one used
# on the other frame. Reassigning the result avoids `inplace=True`.
kc_df_data_mcas = kc_df_data_mcas.rename(columns={'KC applied': 'KCs applied'})

kc_df_data_mcas[['Question', 'Grade applied', 'KCs applied','Source']].head(5)

Unnamed: 0,Question,Grade applied,KCs applied,Source
0,The graph below shows the numbers of laps four...,3,3.MD.3,MCAS
1,Ms. Garcia wrote the sentence shown in the box...,3,3.OA.1,MCAS
2,Ms. Shaw has a quilt that is in the shape of a...,3,3.MD.8,MCAS
3,Which of these sentences matches the expressio...,3,3.OA.2,MCAS
4,What is the missing number that makes the numb...,3,3.NBT.2,MCAS


In [7]:
# Number of rows in the MCAS frame.
kc_df_data_mcas.shape[0]

585

In [8]:
# Stack the MCAS rows on top of the grade 3-6 rows of the other frame,
# keeping only the four shared columns, renumber the index from 0, and
# store 'Grade applied' as strings.
kc_df = pd.concat(
    [
        kc_df_data_mcas[['Question', 'Grade applied', 'KCs applied', 'Source']],
        kc_df_data_gms8k.loc[
            kc_df_data_gms8k['Grade applied'].isin([3, 4, 5, 6]),
            ['Question', 'Grade applied', 'KCs applied', 'Source'],
        ],
    ],
    ignore_index=True,
).astype({'Grade applied': str})

In [9]:
# Persist the combined frame; the index is not written to the file.
kc_df.to_csv(
    path_or_buf='api/common_core_data/Data/Grade_data_Train.csv',
    index=False,
)

# Train - Test Split (no separate validation set is created in this section)

In [10]:
# Load the combined grade dataset from its CSV file.
data = pd.read_csv(
    filepath_or_buffer='api/common_core_data/Data/Grade_data_Train.csv',
)

In [11]:
from pathlib import Path

from sklearn.model_selection import StratifiedShuffleSplit

# Encode each distinct 'Grade applied' value as an integer class id.
# NOTE(review): the grade -> code mapping is not saved anywhere; presumably
# 3..6 map to 0..3 -- confirm before decoding predictions.
data['label'] = data['Grade applied'].astype('category').cat.codes

seed = 50
# One shuffled split that holds out 10% of the rows, stratified on the label.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=seed)

# n_splits=1 yields exactly one (train, test) pair of positional indices, so
# take it directly instead of looping. `test_valid_idx` keeps its original
# name for compatibility, but it holds test indices only: no validation
# subset is produced here.
train_idx, test_valid_idx = next(splitter.split(data, data['label']))
train_set = data.iloc[train_idx]
test_set = data.iloc[test_valid_idx]

# to_csv does not create missing folders, so make sure the target exists.
output_dir = Path('Grade_classification/data')
output_dir.mkdir(parents=True, exist_ok=True)

# Only the question text and its integer label are exported.
train_set[['Question', 'label']].to_csv(f'Grade_classification/data/{seed}_train_set.csv', index=False)
test_set[['Question', 'label']].to_csv(f'Grade_classification/data/{seed}_test_set.csv', index=False)


In [12]:
# (rows, columns) of the train split followed by the test split.
tuple(split.shape for split in (train_set, test_set))

((551, 5), (62, 5))

In [14]:
from tabulate import tabulate
import pyperclip

# Number of rows per 'Grade applied' value in each split.
train_counts = train_set['Grade applied'].value_counts()
test_counts = test_set['Grade applied'].value_counts()

# Combine the counts into a single DataFrame
counts_df = pd.DataFrame({
    'Train': train_counts,
    'Test': test_counts,
}).fillna(0).astype(int)  # Fill NaNs with 0 and convert to integer type

# Sort the DataFrame by the 'Train' column in descending order
counts_df = counts_df.sort_values(by='Train', ascending=False)

# Append the column sums as a 'Total' row; done after sorting so it stays last.
counts_df.loc['Total'] = counts_df.sum()

# Render the table once in tabulate's 'pipe' format.
table = tabulate(counts_df, headers='keys', tablefmt='pipe')

# Print the table
print(table)

# Copy the table to the clipboard
pyperclip.copy(table)


| Grade applied   |   Train |   Test |
|:----------------|--------:|-------:|
| 3               |     140 |     16 |
| 6               |     139 |     15 |
| 4               |     138 |     16 |
| 5               |     134 |     15 |
| Total           |     551 |     62 |
