In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyperclip
from tabulate import tabulate

In [2]:
kc_df_data_mcas = pd.read_csv('api/common_core_data/Data/2015-2023-MCAS-3-6.csv')
kc_df_data_gms8k = pd.read_csv('api/common_core_data/Test_Result/KC_question_level_gpt_35_turbo.csv')
notation_df = pd.read_csv('api/common_core_data/Knowledge_Base/Coherence_map_normal.csv')

In [3]:
kc_df_data_asdiv = pd.read_csv('api/common_core_data/Data/ASDiv-100-3rd.csv')

kc_df_data_asdiv_final = pd.DataFrame(columns=kc_df_data_asdiv.columns)

rows_to_append = []
for index, row in kc_df_data_asdiv.iterrows():
    kc = row['KC']
    kc_list = kc.split(',')
    kc_list = [kc.strip() for kc in kc_list]
    for kc in kc_list:
        new_row = row.copy()
        new_row['KC'] = kc
        rows_to_append.append(new_row)

kc_df_data_asdiv_final = pd.concat([kc_df_data_asdiv_final, pd.DataFrame(rows_to_append)], ignore_index=True)
kc_df_data_asdiv_final['KC'].value_counts()
kc_df_data_asdiv_final.to_csv('api/common_core_data/Data/ASDiv-100-3rd-final.csv', index=False)

In [4]:
# All ASDiv data for training and evaluating
kc_df_data_asdiv_1 = pd.read_csv('api/common_core_data/Data/ASDiv-100-1st-final.csv')
kc_df_data_asdiv_2 = pd.read_csv('api/common_core_data/Data/ASDiv-100-2nd-final.csv')
kc_df_data_asdiv_3 = pd.read_csv('api/common_core_data/Data/ASDiv-100-3rd-final.csv')
kc_df_data_asdiv = pd.concat([kc_df_data_asdiv_1, kc_df_data_asdiv_2, kc_df_data_asdiv_3], ignore_index=True)

kc_df_data_asdiv = kc_df_data_asdiv.rename(columns={'KC': 'KC applied'})
kc_df_data_asdiv = kc_df_data_asdiv.rename(columns={'Question_ID': 'id'})
# kc_df_data_asdiv['Grade applied'] = kc_df_data_asdiv['KC applied'].apply(lambda x: x.split('.')[0])
del kc_df_data_asdiv['Unnamed: 7']

In [5]:
kc_df_data_asdiv['KC applied'].value_counts()
# Draw this into a table using tabulate
table = tabulate(kc_df_data_asdiv['KC applied'].value_counts().reset_index(), tablefmt='pipe', headers=['KC', 'Count'])
print(table)

|    | KC        |   Count |
|---:|:----------|--------:|
|  0 | 2.OA.A.1  |      53 |
|  1 | 3.OA.A.3  |      35 |
|  2 | 4.NBT.B.6 |      26 |
|  3 | K.OA.A.2  |      23 |
|  4 | 1.OA.A.1  |      23 |
|  5 | 6.NS.B.4  |      22 |
|  6 | 4.MD.A.2  |      20 |
|  7 | 5.NBT.B.6 |      18 |
|  8 | 2.NBT.B.7 |      15 |
|  9 | 6.EE.B.6  |      15 |
| 10 | 6.RP.A.3  |      13 |
| 11 | 3.OA.A.8  |      10 |
| 12 | 4.NBT.B.5 |       6 |
| 13 | 6.RP.A.1  |       6 |
| 14 | 4.MD.A.3  |       6 |
| 15 | 4.OA.A.3  |       5 |
| 16 | 3.OA.D.8  |       5 |
| 17 | 8.EE.C.8  |       5 |
| 18 | 2.NBT.B.5 |       5 |
| 19 | 1.OA.A.2  |       4 |
| 20 | 4.NBT.B.4 |       4 |
| 21 | 3.MD.D.8  |       3 |
| 22 | 3.NBT.A.2 |       2 |
| 23 | 3.NBT.A.3 |       2 |
| 24 | 3.OA.D.9  |       2 |
| 25 | 2.NBT.B.6 |       1 |
| 26 | 3.NF.A.3  |       1 |
| 27 | 5.NBT.B.5 |       1 |
| 28 | 6.EE.C.9  |       1 |
| 29 | 3.OA.A.9  |       1 |
| 30 | 6.SP.B.5  |       1 |


In [6]:
kc_counts = kc_df_data_asdiv['KC applied'].value_counts()
small_kc = kc_counts[kc_counts < 5].index
# Take all asdiv_choosen_KC with larger than 
asdiv_choosen_KC = kc_df_data_asdiv[~kc_df_data_asdiv['KC applied'].isin(small_kc)]
asdiv_choosen_KC['KC applied'].value_counts()
choosen_kc = asdiv_choosen_KC['KC applied'].unique()
choosen_kc

array(['6.RP.A.1', '1.OA.A.1', '4.NBT.B.6', 'K.OA.A.2', '3.OA.A.8',
       '4.MD.A.3', '4.MD.A.2', '2.OA.A.1', '6.RP.A.3', '6.NS.B.4',
       '2.NBT.B.7', '6.EE.B.6', '3.OA.A.3', '5.NBT.B.6', '4.NBT.B.5',
       '4.OA.A.3', '8.EE.C.8', '2.NBT.B.5', '3.OA.D.8'], dtype=object)

In [7]:
# Change the column name from sub_code to full_code
for i in range(0, kc_df_data_mcas.shape[0]):
    kc = kc_df_data_mcas.iloc[i,:]['KC applied']
    if kc in notation_df['Sub Code'].values:
        print(f'KC: {kc} at index {i} is sub code format')
        kc_df_data_mcas.at[i, 'KC applied'] = notation_df[notation_df['Sub Code'] == kc ]['Full Code'].values[0]

KC: 3.MD.3 at index 0 is sub code format
KC: 3.OA.1 at index 1 is sub code format
KC: 3.MD.8 at index 2 is sub code format
KC: 3.OA.2 at index 3 is sub code format
KC: 3.NBT.2 at index 4 is sub code format
KC: 3.NBT.1 at index 5 is sub code format
KC: 3.MD.1 at index 6 is sub code format
KC: 3.MD.6 at index 7 is sub code format
KC: 3.OA.5 at index 8 is sub code format
KC: 3.G.2 at index 9 is sub code format
KC: 3.OA.7 at index 10 is sub code format
KC: 3.NBT.2 at index 11 is sub code format
KC: 3.NF.3 at index 12 is sub code format
KC: 3.OA.6 at index 13 is sub code format
KC: 3.G.2 at index 14 is sub code format
KC: 3.MD.7 at index 15 is sub code format
KC: 3.NF.2 at index 16 is sub code format
KC: 3.OA.4 at index 17 is sub code format
KC: 4.OA.3 at index 18 is sub code format
KC: 4.NBT.3 at index 19 is sub code format
KC: 4.OA.4 at index 20 is sub code format
KC: 4.NF.1 at index 21 is sub code format
KC: 4.NBT.5 at index 22 is sub code format
KC: 4.NF.2 at index 23 is sub code format

In [8]:
def format_question(row):
    return f"Question: {row['Question']}\nSolution: {row['Solution']}\nSteps: {row['Step']}"

def training_format_question(df):
    df['Formatted_Question'] = df.apply(format_question, axis=1)
    output = df[['id', 'Formatted_Question', 'KC applied', 'Source']]
    output.rename(columns={'Formatted_Question': 'Question'}, inplace=True)
    return output

In [9]:
asdiv_data = training_format_question(asdiv_choosen_KC)

filtered_kc_df_data_mcas = kc_df_data_mcas[kc_df_data_mcas['KC applied'].isin(choosen_kc)]
filtered_kc_df_data_mcas['Source'] = 'MCAS'

# Concat filtered_kc_df_data_mcas and kc_df_data_asdiv
kc_df_data = pd.concat([filtered_kc_df_data_mcas[['id','Question', 'KC applied', 'Source']], asdiv_data[['id', 'Question', 'KC applied', 'Source']]], ignore_index=True)
kc_df_data = kc_df_data.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Formatted_Question'] = df.apply(format_question, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output.rename(columns={'Formatted_Question': 'Question'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_kc_df_data_mcas['Source'] = 'MCAS'


In [10]:
from sklearn.model_selection import StratifiedShuffleSplit
data = kc_df_data.copy()
data['label'] = data['KC applied'].astype('category').cat.codes

asdiv_data = data[data['Source'] == 'ASDiv']
mcas_data = data[data['Source'] == 'MCAS']

asdiv_data = asdiv_data.reset_index(drop=True)
mcas_data = mcas_data.reset_index(drop=True)

In [11]:
asdiv_data

Unnamed: 0,id,Question,KC applied,Source,label
0,1666,Question: A small school has 55 students. If 1...,6.RP.A.1,ASDiv,15
1,1284,Question: Tom used 2 batteries on his flashlig...,1.OA.A.1,ASDiv,0
2,1145,Question: Collin baked 9 brownies with nuts on...,6.RP.A.1,ASDiv,15
3,1592,Question: There are 396 students going to a tr...,4.NBT.B.6,ASDiv,10
4,1366,Question: Joan is baking a cake. The recipe ca...,K.OA.A.2,ASDiv,18
...,...,...,...,...,...
306,1235,Question: Bianca wanted to drink exactly 5 bot...,K.OA.A.2,ASDiv,18
307,440,Question: If Lewis earns $1367.00 every week d...,4.NBT.B.5,ASDiv,9
308,1584,Question: Kaleb has to sell 710 chocolate bars...,4.NBT.B.6,ASDiv,10
309,1839,Question: A pet store took ten birds out of a ...,1.OA.A.1,ASDiv,0


In [12]:
# Create a mapping of categories to their numerical codes
category_mapping = dict(enumerate(data['KC applied'].astype('category').cat.categories))

# Convert the mapping to a DataFrame
mapping_df = pd.DataFrame(list(category_mapping.items()), columns=['label', 'KC applied'])

In [13]:
mapping_df

Unnamed: 0,label,KC applied
0,0,1.OA.A.1
1,1,2.NBT.B.5
2,2,2.NBT.B.7
3,3,2.OA.A.1
4,4,3.OA.A.3
5,5,3.OA.A.8
6,6,3.OA.D.8
7,7,4.MD.A.2
8,8,4.MD.A.3
9,9,4.NBT.B.5


# Train - Test split

In [14]:
asdiv_data

Unnamed: 0,id,Question,KC applied,Source,label
0,1666,Question: A small school has 55 students. If 1...,6.RP.A.1,ASDiv,15
1,1284,Question: Tom used 2 batteries on his flashlig...,1.OA.A.1,ASDiv,0
2,1145,Question: Collin baked 9 brownies with nuts on...,6.RP.A.1,ASDiv,15
3,1592,Question: There are 396 students going to a tr...,4.NBT.B.6,ASDiv,10
4,1366,Question: Joan is baking a cake. The recipe ca...,K.OA.A.2,ASDiv,18
...,...,...,...,...,...
306,1235,Question: Bianca wanted to drink exactly 5 bot...,K.OA.A.2,ASDiv,18
307,440,Question: If Lewis earns $1367.00 every week d...,4.NBT.B.5,ASDiv,9
308,1584,Question: Kaleb has to sell 710 chocolate bars...,4.NBT.B.6,ASDiv,10
309,1839,Question: A pet store took ten birds out of a ...,1.OA.A.1,ASDiv,0


In [15]:
seed = 42
# Initialize StratifiedShuffleSplit
grouped = asdiv_data.groupby('Question')

# Extract groups and their labels
groups = [group for _, group in grouped]
labels = [group['label'].iloc[0] for group in groups]

# Initialize StratifiedShuffleSplit
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

# Split the groups into training and test sets
for train_idx, test_valid_idx in splitter.split(groups, labels):
    train_groups = [groups[i] for i in train_idx]
    test_valid_groups = [groups[i] for i in test_valid_idx]

# Concatenate the groups back into DataFrames
train_set_asdiv = pd.concat(train_groups).reset_index(drop=True)
test_set_asdiv = pd.concat(test_valid_groups).reset_index(drop=True)



train_set = train_set_asdiv
test_set_mcas = mcas_data

# train_set = pd.concat([train_set, mcas_data], ignore_index=True)

train_set[['Question', 'label']].to_csv(f'KC_classification/data_first_ver/{seed}_train_set.csv', index=False)
test_set_asdiv[['Question', 'label']].to_csv(f'KC_classification/data_first_ver/{seed}_test_set_asdiv.csv', index=False)
test_set_mcas[['Question', 'label']].to_csv(f'KC_classification/data_first_ver/{seed}_test_set_mcas.csv', index=False)


In [16]:
train_set[['Question', 'label']]

Unnamed: 0,Question,label
0,Question: Paul got a box of 479 crayons for hi...,2
1,Question: At Jake's stables there are 10 stall...,16
2,Question: Sally memorized eight poems. After s...,18
3,Question: A restaurant offers diet soda and re...,0
4,Question: Mrs. Hilt reads 13 books on every da...,4
...,...,...
243,"Question: In Rina's desk drawer, there are 18 ...",16
244,Question: Adam had seventy-nine dollars saved ...,3
245,Question: It takes eight grams of plastic to m...,10
246,Question: Martha bought 18 small cakes. She ha...,4


In [17]:
from tabulate import tabulate
import pyperclip
import pandas as pd


# Get the value counts for 'KC applied' in train and test sets
train_asdiv_counts = train_set_asdiv['KC applied'].value_counts()
test_asdiv_counts = test_set_asdiv['KC applied'].value_counts()
mcas_counts = mcas_data['KC applied'].value_counts()



# Combine the counts into a single DataFrame
counts_df = pd.DataFrame({
    'Train': train_asdiv_counts,
    'Test(MCAS)': mcas_counts,
    'Test(ASDiv 20%)': test_asdiv_counts
}).fillna(0).astype(int)  # Fill NaNs with 0 and convert to integer type

# Sort the DataFrame by the 'Train' column in descending order
counts_df = counts_df.sort_values(by='Train', ascending=False)

# Reset index to facilitate merging
counts_df = counts_df.reset_index().rename(columns={'index': 'KC applied'})

# Merge with the mapping_df to add the label information
merged_df = counts_df.merge(mapping_df, on='KC applied')

# Sort the DataFrame by 'label' in ascending order
merged_df = merged_df.sort_values(by='label')

# Add a total row for numeric columns
total_row = merged_df[['Train', 'Test(MCAS)', 'Test(ASDiv 20%)']].sum().to_frame().T
total_row['KC applied'] = 'Total'
total_row['label'] = ''

# Append the total row to the merged DataFrame
merged_df = pd.concat([merged_df, total_row], ignore_index=True)

# Reorder the columns to place 'label' and 'KC applied' at the front
merged_df = merged_df[['label', 'KC applied', 'Train', 'Test(MCAS)', 'Test(ASDiv 20%)']]

# Drop the default index column
merged_df = merged_df.reset_index(drop=True)

# Format the DataFrame using tabulate
table = tabulate(merged_df, headers='keys', tablefmt='pipe', showindex=False)

# Print the table
print(table)


| label   | KC applied   |   Train |   Test(MCAS) |   Test(ASDiv 20%) |
|:--------|:-------------|--------:|-------------:|------------------:|
| 0       | 1.OA.A.1     |      17 |            0 |                 6 |
| 1       | 2.NBT.B.5    |       4 |            0 |                 1 |
| 2       | 2.NBT.B.7    |      12 |            0 |                 3 |
| 3       | 2.OA.A.1     |      42 |            0 |                11 |
| 4       | 3.OA.A.3     |      27 |            4 |                 8 |
| 5       | 3.OA.A.8     |       8 |            0 |                 2 |
| 6       | 3.OA.D.8     |       4 |            5 |                 1 |
| 7       | 4.MD.A.2     |      16 |            1 |                 4 |
| 8       | 4.MD.A.3     |       5 |            7 |                 1 |
| 9       | 4.NBT.B.5    |       5 |            5 |                 1 |
| 10      | 4.NBT.B.6    |      21 |            3 |                 5 |
| 11      | 4.OA.A.3     |       4 |            5 |             

In [18]:
merged_df[['label', 'KC applied']]
# Extract the mapping from 'label' to 'KC applied'
label_to_id = dict(zip(merged_df['label'], merged_df['KC applied']))

# Extract the mapping from 'KC applied' to 'label'
id_to_label = dict(zip(merged_df['KC applied'], merged_df['label']))

# Print the mappings
print("Label to ID Mapping:")
print(label_to_id)

print("\nID to Label Mapping:")
print(id_to_label)


Label to ID Mapping:
{0: '1.OA.A.1', 1: '2.NBT.B.5', 2: '2.NBT.B.7', 3: '2.OA.A.1', 4: '3.OA.A.3', 5: '3.OA.A.8', 6: '3.OA.D.8', 7: '4.MD.A.2', 8: '4.MD.A.3', 9: '4.NBT.B.5', 10: '4.NBT.B.6', 11: '4.OA.A.3', 12: '5.NBT.B.6', 13: '6.EE.B.6', 14: '6.NS.B.4', 15: '6.RP.A.1', 16: '6.RP.A.3', 17: '8.EE.C.8', 18: 'K.OA.A.2', '': 'Total'}

ID to Label Mapping:
{'1.OA.A.1': 0, '2.NBT.B.5': 1, '2.NBT.B.7': 2, '2.OA.A.1': 3, '3.OA.A.3': 4, '3.OA.A.8': 5, '3.OA.D.8': 6, '4.MD.A.2': 7, '4.MD.A.3': 8, '4.NBT.B.5': 9, '4.NBT.B.6': 10, '4.OA.A.3': 11, '5.NBT.B.6': 12, '6.EE.B.6': 13, '6.NS.B.4': 14, '6.RP.A.1': 15, '6.RP.A.3': 16, '8.EE.C.8': 17, 'K.OA.A.2': 18, 'Total': ''}


# Full training data

In [19]:
asdiv_data[['Question', 'label']].to_csv('KC_classification/data_second_ver/full_train_set_19.csv', index=False)

In [20]:
asdiv_data

Unnamed: 0,id,Question,KC applied,Source,label
0,1666,Question: A small school has 55 students. If 1...,6.RP.A.1,ASDiv,15
1,1284,Question: Tom used 2 batteries on his flashlig...,1.OA.A.1,ASDiv,0
2,1145,Question: Collin baked 9 brownies with nuts on...,6.RP.A.1,ASDiv,15
3,1592,Question: There are 396 students going to a tr...,4.NBT.B.6,ASDiv,10
4,1366,Question: Joan is baking a cake. The recipe ca...,K.OA.A.2,ASDiv,18
...,...,...,...,...,...
306,1235,Question: Bianca wanted to drink exactly 5 bot...,K.OA.A.2,ASDiv,18
307,440,Question: If Lewis earns $1367.00 every week d...,4.NBT.B.5,ASDiv,9
308,1584,Question: Kaleb has to sell 710 chocolate bars...,4.NBT.B.6,ASDiv,10
309,1839,Question: A pet store took ten birds out of a ...,1.OA.A.1,ASDiv,0


In [21]:
full_asdiv_data = training_format_question(kc_df_data_asdiv)
full_asdiv_data['label'] = full_asdiv_data['KC applied'].astype('category').cat.codes
full_asdiv_data[['Question', 'label']].to_csv('KC_classification/data_second_ver/full_train_set_31.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output.rename(columns={'Formatted_Question': 'Question'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_asdiv_data['label'] = full_asdiv_data['KC applied'].astype('category').cat.codes


In [22]:
# Create a mapping of categories to their numerical codes
category_mapping = dict(enumerate(full_asdiv_data['KC applied'].astype('category').cat.categories))

# Convert the mapping to a DataFrame
mapping_df = pd.DataFrame(list(category_mapping.items()), columns=['label', 'KC applied'])

id_to_label = dict(zip(mapping_df['KC applied'], mapping_df['label']))


print("\nID to Label Mapping:")
print(id_to_label)
pyperclip.copy(str(id_to_label))
id_to_label


ID to Label Mapping:
{'1.OA.A.1': 0, '1.OA.A.2': 1, '2.NBT.B.5': 2, '2.NBT.B.6': 3, '2.NBT.B.7': 4, '2.OA.A.1': 5, '3.MD.D.8': 6, '3.NBT.A.2': 7, '3.NBT.A.3': 8, '3.NF.A.3': 9, '3.OA.A.3': 10, '3.OA.A.8': 11, '3.OA.A.9': 12, '3.OA.D.8': 13, '3.OA.D.9': 14, '4.MD.A.2': 15, '4.MD.A.3': 16, '4.NBT.B.4': 17, '4.NBT.B.5': 18, '4.NBT.B.6': 19, '4.OA.A.3': 20, '5.NBT.B.5': 21, '5.NBT.B.6': 22, '6.EE.B.6': 23, '6.EE.C.9': 24, '6.NS.B.4': 25, '6.RP.A.1': 26, '6.RP.A.3': 27, '6.SP.B.5': 28, '8.EE.C.8': 29, 'K.OA.A.2': 30}


{'1.OA.A.1': 0,
 '1.OA.A.2': 1,
 '2.NBT.B.5': 2,
 '2.NBT.B.6': 3,
 '2.NBT.B.7': 4,
 '2.OA.A.1': 5,
 '3.MD.D.8': 6,
 '3.NBT.A.2': 7,
 '3.NBT.A.3': 8,
 '3.NF.A.3': 9,
 '3.OA.A.3': 10,
 '3.OA.A.8': 11,
 '3.OA.A.9': 12,
 '3.OA.D.8': 13,
 '3.OA.D.9': 14,
 '4.MD.A.2': 15,
 '4.MD.A.3': 16,
 '4.NBT.B.4': 17,
 '4.NBT.B.5': 18,
 '4.NBT.B.6': 19,
 '4.OA.A.3': 20,
 '5.NBT.B.5': 21,
 '5.NBT.B.6': 22,
 '6.EE.B.6': 23,
 '6.EE.C.9': 24,
 '6.NS.B.4': 25,
 '6.RP.A.1': 26,
 '6.RP.A.3': 27,
 '6.SP.B.5': 28,
 '8.EE.C.8': 29,
 'K.OA.A.2': 30}

In [23]:
kc_df_data_asdiv_4 = pd.read_csv('api/common_core_data/Data/ASDiv-100-4th-test.csv')
kc_df_data_asdiv_4 = kc_df_data_asdiv_4.rename(columns={'Question_ID': 'id'})
kc_df_data_asdiv_4['Formatted_Question'] = kc_df_data_asdiv_4.apply(format_question, axis=1)

asdiv_data_4 = kc_df_data_asdiv_4[['id','Formatted_Question', 'Source']]
asdiv_data_4 = asdiv_data_4.rename(columns={'Formatted_Question': 'Question'})
asdiv_data_4[['id','Question']].to_csv('KC_classification/data_second_ver/full_100_test_set.csv', index=False)

In [24]:
kc_df_data_asdiv_4 = pd.read_csv('api/common_core_data/Data/ASDiv-1000-5th.csv')
kc_df_data_asdiv_4 = kc_df_data_asdiv_4.rename(columns={'Question_ID': 'id'})
kc_df_data_asdiv_4['Formatted_Question'] = kc_df_data_asdiv_4.apply(format_question, axis=1)

asdiv_data_4 = kc_df_data_asdiv_4[['id','Formatted_Question', 'Source']]
asdiv_data_4 = asdiv_data_4.rename(columns={'Formatted_Question': 'Question'})
asdiv_data_4[['id','Question']].to_csv('KC_classification/data_second_ver/full_1000_test_set.csv', index=False)