# Functions defined

In [1]:
import pandas as pd

In [2]:
def split_solution_by_word_count(input_df, threshold=700):
    # Function to calculate the word count of a text
    def word_count(text):
        return len(text.split())

    # Copy the input DataFrame to avoid modifying the original DataFrame
    df = input_df.copy()

    # Calculate the word count for each row in the 'solution' column
    df['solution_word_count'] = df['solution'].apply(word_count)

    # Split the dataset based on the word count condition
    below_threshold = df[df['solution_word_count'] < threshold]
    above_or_equal_threshold = df[df['solution_word_count'] >= threshold]

    # Drop the 'solution_word_count' column if you no longer need it
    below_threshold.drop(columns=['solution_word_count'], inplace=True)
    above_or_equal_threshold.drop(columns=['solution_word_count'], inplace=True)

    return below_threshold, above_or_equal_threshold

# Math Dataset
## Use the variable MATH_below_700_words

In [3]:
# This is GSM8K and MATH
extra_dataset = pd.read_csv('/kaggle/input/aimo-external-dataset/external_df.csv')
# Separate into two DataFrames based on the 'source' column
MATH_df = extra_dataset[extra_dataset['source'] == 'MATH']
GSM8K_df = extra_dataset[extra_dataset['source'] == 'GSM8K']

print("Length of MATH_df:", len(MATH_df))
print("Length of GSM8K_df:", len(GSM8K_df))

Length of MATH_df: 12500
Length of GSM8K_df: 8792


In [4]:
MATH_df.head()

Unnamed: 0,problem,level,type,solution,stage,source
0,Kevin Kangaroo begins hopping on a number line...,Level 5,Algebra,Kevin hops $1/3$ of the remaining distance wit...,train,MATH
1,The ratio of the areas of two squares is $\fra...,Level 4,Algebra,We start off by simplifying the ratio $\frac{1...,train,MATH
2,"If $\sqrt{2\sqrt{t-2}} = \sqrt[4]{7 - t}$, the...",Level 4,Algebra,"We raise both sides to the fourth power, which...",train,MATH
3,Let $t(x) = \sqrt{3x+1}$ and $f(x)=5-t(x)$. Wh...,Level 4,Algebra,We first evaluate $f(5) = 5 -t(5) = 5-\sqrt{5\...,train,MATH
4,James has a total of 66 dollars in his piggy b...,Level 2,Algebra,Call the number of one dollar bills $x$ and th...,train,MATH


In [5]:
unique_links = MATH_df['type'].unique()
print(unique_links)
for link in unique_links:
    print(link)

['Algebra' 'Counting & Probability' 'Geometry' 'Intermediate Algebra'
 'Number Theory' 'Prealgebra' 'Precalculus']
Algebra
Counting & Probability
Geometry
Intermediate Algebra
Number Theory
Prealgebra
Precalculus


In [6]:
# Filtered to only use levels 4 and 5 
level_4_5_data = MATH_df[(MATH_df['level'] == 'Level 4') | (MATH_df['level'] == 'Level 5')]
other_levels_data = MATH_df[(MATH_df['level'] != 'Level 4') & (MATH_df['level'] != 'Level 5')]

print("Length of level_4_5_data:", len(level_4_5_data))

# Check the length of other_levels_data DataFrame
print("Length of other_levels_data:", len(other_levels_data))

Length of level_4_5_data: 6532
Length of other_levels_data: 5968


In [7]:
MATH_below_700_words, MATH_above_or_equal_700_words = split_solution_by_word_count(level_4_5_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  below_threshold.drop(columns=['solution_word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  above_or_equal_threshold.drop(columns=['solution_word_count'], inplace=True)


In [8]:
print("Rows in below_700_words (solutions < 700 words):", len(MATH_below_700_words))
print("Rows in above_or_equal_700_words (solutions >= 700 words):", len(MATH_above_or_equal_700_words))

print("Words in first row of below_700_words' solution:", len(MATH_below_700_words.iloc[0]['solution'].split()))
print("Words in first row of above_or_equal_700_words' solution:", len(MATH_above_or_equal_700_words.iloc[0]['solution'].split()))

Rows in below_700_words (solutions < 700 words): 6526
Rows in above_or_equal_700_words (solutions >= 700 words): 6
Words in first row of below_700_words' solution: 91
Words in first row of above_or_equal_700_words' solution: 777


# Load Datasets
## Use the variable below_700_words

In [9]:
# https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize/data

# Load the training data
train_data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')

# Load the test data
test_data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')

# Now you can view the first few rows of each to confirm they're loaded correctly
olympiad_data = pd.concat([train_data, test_data], ignore_index=True)



In [10]:
# Clean data
olympiad_data.dropna()
olympiad_data = olympiad_data.drop(['row_id'], axis=1)
#View 
olympiad_data.head()


Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52.0
1,246d26,Each of the three-digits numbers $111$ to $999...,250.0
2,2fc4ad,Let the `sparkle' operation on positive intege...,702.0
3,430b63,What is the minimum value of $5x^2+5y^2-8xy$ w...,800.0
4,5277ed,There exists a unique increasing geometric seq...,211.0


In [11]:
# No need to do regex
aime_data = pd.read_csv('/kaggle/input/aime-problem-set-1983-2024/AIME_Dataset_1983_2024.csv')
#Fix structure of columns
# rm Year, Problem Number, Part
aime_data = aime_data.drop(['Year', 'Problem Number', 'Part'], axis=1)
aime_data.dropna()
#rename Question: question, Answer:answer, ID:id
aime_data.rename(columns={'ID': 'id','Question': 'problem', 'Answer': 'answer'}, inplace=True)
#View
aime_data.head()


Unnamed: 0,id,problem,answer
0,1983-1,"Let $x$ , $y$ and $z$ all exceed $1$ and let $...",60
1,1983-2,"Let $f(x)=|x-p|+|x-15|+|x-p-15|$ , where $0 < ...",15
2,1983-3,What is the product of the real roots of the e...,20
3,1983-4,A machine-shop cutting tool has the shape of a...,26
4,1983-5,Suppose that the sum of the squares of two com...,4


In [12]:
#clean this
amio_24_data = pd.read_csv('/kaggle/input/aimo-24-processor-art-of-problem-solving/problems.csv')
#View all the different values of the only the links 
#take out unneeded data after each cell run 
# List of patterns to remove rows for
patterns_to_remove = ['AHSME', 'AJHSME', 'USOMO', 'USAMO', 'USAJMO', 'USOJMO']

# Create a boolean mask where True indicates that a row should be deleted
mask = amio_24_data['link'].str.contains('|'.join(patterns_to_remove))

# Invert the mask to keep rows that do not contain any of the patterns
amio_24_data = amio_24_data[~mask]

# Now, 'amio_24_data' contains only the rows where 'link' doesn't include the specified patterns
unique_links = amio_24_data['link'].unique()
print(unique_links)
#Fix structure of columns
#rm link, no
amio_24_data = amio_24_data.drop(['link', 'no'], axis=1)
#drop na
amio_24_data.dropna()
#View
amio_24_data.head()


['https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_1'
 'https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_3'
 'https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_4'
 ...
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_13'
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_14'
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_15']


Unnamed: 0,id,problem,solution,answer
0,1,"What is the ones digit of \[222,222-22,222-2,2...","We can rewrite the expression as \[222,222-(22...",2
1,2,"What is the ones digit of \[222,222-22,222-2,2...","222,222-22,222 = 200,000\n200,000 - 2,222 = 19...",2
2,3,"What is the ones digit of \[222,222-22,222-2,2...","We only care about the unit's digits.\nThus, $...",2
3,4,"What is the ones digit of \[222,222-22,222-2,2...",We just take the units digit of each and subtr...,2
4,5,"Four squares of side length $4, 7, 9,$ and $10...",We work inwards. The area of the outer shaded ...,52


In [13]:
# clean this
amio_data = pd.read_csv('/kaggle/input/amio-parsed-art-of-problem-solving-website/parsed_ArtOfProblemSolving.csv')
amio_data.head()

patt_to_remove = ['AHSME', 'AJHSME', 'USOMO', 'USAMO', 'USAJMO', 'USOJMO']

# Create a boolean mask where True indicates that a row should be deleted
mask = amio_data['link'].str.contains('|'.join(patt_to_remove))

# Invert the mask to keep rows that do not contain any of the patterns
amio_data = amio_data[~mask]

# Now, 'amio_24_data' contains only the rows where 'link' doesn't include the specified patterns
unique_links = amio_data['link'].unique()
print(unique_links)
#Fix structure of columns
#rm link, letter
amio_data = amio_data.drop(['link', 'letter'], axis=1)
#change problem_id to id
amio_data.rename(columns={'problem_id': 'id'}, inplace=True)
#drop na
amio_data.dropna()
#View
amio_data.head()

['https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_1'
 'https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_2'
 'https://artofproblemsolving.com/wiki/index.php/2024_AMC_8_Problems/Problem_3'
 ...
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_13'
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_14'
 'https://artofproblemsolving.com/wiki/index.php/1983_AIME_Problems/Problem_15']


Unnamed: 0,id,problem,solution,answer
0,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...","We can rewrite the expression as \[222,222-(22...",2.0
1,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...","222,222-22,222 = 200,000\n200,000 - 2,222 = 19...",2.0
2,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...","We only care about the unit's digits.\nThus, $...",2.0
3,4ba30954e5f3ca72748b3e145f45b705,"What is the ones digit of \[222,222-22,222-2,2...",We just take the units digit of each and subtr...,2.0
4,085955dda8dfb374689b3f216b54d785,What is the value of this expression in decima...,We see that $\frac{44}{11}$ is $4$ $\frac{110}...,6.54


In [14]:
combined_data = pd.concat([olympiad_data, aime_data, amio_data, amio_24_data], ignore_index=True)
print(f'Length beofre cleaning: {len(combined_data)}')
# Prioritize the rows that have 'solution' filled out 
combined_data_sorted = combined_data.sort_values(by='solution', ascending=False, na_position='last')
# Drop duplicates
df = combined_data_sorted.drop_duplicates(subset=['problem'], keep='first')
print(f'Length after cleaning: {len(df)}')

# Boolean indexing to filter rows where 'solution' column is not empty
non_empty_solution_rows = df[df['solution'].notnull()]
print(f"Length of those with a 'solution' value: {len(non_empty_solution_rows)}")

# Boolean indexing to filter rows where 'solution' column is empty
empty_solution_rows = df[df['solution'].isnull()]

# Display the length after cleaning
print(f"Length of those with no 'solution' value: {len(empty_solution_rows)}")

# Now 'empty_solution_rows' contains only rows where 'solution' column is empty


Length beofre cleaning: 15143
Length after cleaning: 3434
Length of those with a 'solution' value: 2656
Length of those with no 'solution' value: 778


In [15]:
df.head()

Unnamed: 0,id,problem,answer,solution
2439,ac275ec395b0170ff677d97a713b0cbf,"A frog sitting at the point $(1, 2)$ begins a ...",58,this is basically another version of solution ...
13457,5458,"There are real numbers $a, b, c,$ and $d$ such...",330,start off by applying vieta's and you will fin...
8494,4ff7758eeea95adc1120449999fce657,"Find $ax^5 + by^5$ if the real numbers $a,b,x,...",20,recurrence of the form $T_n=AT_{n-1}+BT_{n-2}$...
11260,3261,There are two values of $a$ for which the equa...,16,quadratic equation has exactly one root if and...
13944,5945,The positive integers $N$ and $N^2$ both end i...,937,let $N= 10000t+1000a+100b+10c+d$ for positive ...


In [16]:
empty_solution_rows.head()

Unnamed: 0,id,problem,answer,solution
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52.0,
1,246d26,Each of the three-digits numbers $111$ to $999...,250.0,
2,2fc4ad,Let the `sparkle' operation on positive intege...,702.0,
3,430b63,What is the minimum value of $5x^2+5y^2-8xy$ w...,800.0,
4,5277ed,There exists a unique increasing geometric seq...,211.0,


In [17]:
#If you want to save data
empty_solution_rows.to_csv("empty_solution_rows.csv")

In [18]:
below_700_words, above_or_equal_700_words = split_solution_by_word_count(non_empty_solution_rows)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  below_threshold.drop(columns=['solution_word_count'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  above_or_equal_threshold.drop(columns=['solution_word_count'], inplace=True)


In [19]:
print("Rows in below_700_words (solutions < 700 words):", len(below_700_words))
print("Rows in above_or_equal_700_words (solutions >= 700 words):", len(above_or_equal_700_words))

print("Words in first row of below_700_words' solution:", len(below_700_words.iloc[0]['solution'].split()))
print("Words in first row of above_or_equal_700_words' solution:", len(above_or_equal_700_words.iloc[0]['solution'].split()))

Rows in below_700_words (solutions < 700 words): 2650
Rows in above_or_equal_700_words (solutions >= 700 words): 6
Words in first row of below_700_words' solution: 352
Words in first row of above_or_equal_700_words' solution: 847


# MATH and the other datasets 

In [20]:
# Concatenate the two DataFrames
concatenated_df = pd.concat([MATH_below_700_words, below_700_words], ignore_index=True)
