In [None]:
import re
import pandas as pd

## principles of mathematical analysis

In [None]:
def latex_formulas(file_path, formulas_path):
    """Append each referenced formula's LaTeX after its "(n)" marker and save a copy.

    The first column of ``file_path`` is read as "<chapter>.<item>"; only the
    part before the first dot is used. In every other column, each "(n)" whose
    (chapter, n) pair exists in the ``Chapter`` / ``Formula#`` columns of
    ``formulas_path`` becomes "(n) <$$...$$>", where "..." is the ``latex``
    value of the first matching row with every "$" removed. References without
    a match and non-text cells (numbers, missing values) are left untouched.

    Parameters
    ----------
    file_path : str
        CSV file to annotate.
    formulas_path : str
        CSV file with ``Chapter``, ``Formula#`` and ``latex`` columns.

    Returns
    -------
    str
        Path of the annotated copy ("<name>_modified.csv").

    Raises
    ------
    ValueError
        If a row contains a "(n)" reference but the text before the first dot
        of its first column is not an integer.
    """
    file_to_process = pd.read_csv(file_path)
    formulas_df = pd.read_csv(formulas_path)

    # Formula references look like "(12)".
    pattern = re.compile(r"\((\d+)\)")

    # Normalise the key columns once so ints, floats and digit strings compare equal.
    chapter_keys = pd.to_numeric(formulas_df['Chapter'], errors='coerce')
    formula_keys = pd.to_numeric(formulas_df['Formula#'], errors='coerce')
    latex_cache = {}

    def lookup(chapter, number):
        """Return the raw LaTeX string for (chapter, number), or None if unavailable."""
        key = (chapter, number)
        if key not in latex_cache:
            hits = formulas_df.loc[(chapter_keys == chapter) & (formula_keys == number), 'latex']
            latex = hits.iloc[0] if not hits.empty else None
            # A missing latex value is read as NaN (a float); treat it as "no formula".
            latex_cache[key] = latex if isinstance(latex, str) else None
        return latex_cache[key]

    def expand(match, chapter_text):
        """Return the matched "(n)" followed by its formatted LaTeX, when known."""
        latex = lookup(int(chapter_text), int(match.group(1)))
        if latex is None:
            return match.group(0)
        return match.group(0) + " <$$" + latex.replace("$", "") + "$$>"

    text_columns = file_to_process.columns[1:]  # Skip the first column
    for i, row in file_to_process.iterrows():
        # .iloc avoids the deprecated positional fallback of row[0]; str() copes
        # with identifiers such as 1.1 that pandas parses as floats.
        chapter_text = str(row.iloc[0]).split('.')[0]

        for col in text_columns:
            cell = row[col]
            if not isinstance(cell, str):
                # Leave NaN / numeric cells alone instead of writing back "nan" or "1.0".
                continue

            # One pass over the ORIGINAL text: inserted LaTeX is never re-scanned,
            # so a "(n)" inside a formula is not expanded again.
            annotated = pattern.sub(lambda m: expand(m, chapter_text), cell)
            if annotated != cell:
                file_to_process.at[i, col] = annotated

    # Only rewrite the extension itself, and never fall back to overwriting the input.
    file_path = str(file_path)
    if file_path.endswith('.csv'):
        output_path = file_path[:-len('.csv')] + '_modified.csv'
    else:
        output_path = file_path + '_modified.csv'
    file_to_process.to_csv(output_path, index=False)
    return output_path

# Annotate every statement table in the principles_of_mathematical_analysis
# folder with the LaTeX of the formulas it references.
folder_path = '../../training_data/principles_of_mathematical_analysis/'
file_paths = ['corollaries.csv', 'definitions.csv','propositions.csv', 'theorems_final.csv']
formulas_path = f"{folder_path}formulas.csv"
for file in file_paths:
    # file_path stays a module-level name because later cells read it.
    file_path = f"{folder_path}{file}"
    modified_csv_path = latex_formulas(file_path, formulas_path)
    print("Modified CSV saved to:", modified_csv_path)


## real + functional
```
For each row in each file:
    The row's own number (num.num): compare it with the Type column in formulas.csv
        not found -> skip
        found -> replace
    Find 'Theorem', 'Definition', 'Remark', 'Proposition' or 'Lemma' followed by num.num and an optional (x)
        - num.num is required
        - (x) is optional
            - if x is a letter
                found -> replace
                not found -> convert the letter to a number and replace
            - if there is no (x) -> replace with all matched cells
    Do the same for 'Theorem', 'Definition', 'Remark', 'Proposition' or 'Lemma' followed by $num.num (x)$
```

In [None]:
# restructuring the formulas.csv file
# Load the raw functional-analysis formula table. The statements after
# clean_path() below normalise its Path column and save it as formulas.csv.
# NOTE: folder_path is a module-level name that is also read by a later cell.
folder_path = '../../training_data/functional_analysis/'
df = pd.read_csv(folder_path+'functional_formulas.csv')
def clean_path(path):
    """Reduce a raw ``Path`` value to a single word.

    Drops the first four characters (the leading number prefix), keeps what
    precedes the first underscore, and returns the last space-separated word
    of that remainder.
    """
    without_prefix = path[4:]
    before_underscore = without_prefix.partition("_")[0]
    return before_underscore.rpartition(" ")[2]

# Normalise the Path column with clean_path, discard the 'Type#' column and
# save the result in the same folder as formulas.csv.
# NOTE(review): replace_formulas_num1 / replace_formulas_num2 read a 'Type#'
# column from the formulas file they are given - confirm they are not meant to
# consume this output, which no longer has that column.
df = df.assign(Path=df["Path"].map(clean_path)).drop(columns='Type#')
# index=True writes the row index as a leading unnamed column.
df.to_csv(f"{folder_path}formulas.csv", index=True)
print("Data saved")

In [None]:
def replace_formulas_in_theorems(theorems_path, formulas_path):
    """Append the LaTeX of each "(n)" reference in a statements CSV and save a copy.

    In every column except the first, each "(n)" for which ``formulas_path``
    has a row with ``Formula#`` equal to n becomes "(n) <$$...$$>", where "..."
    is the ``latex`` value of the first such row with every "$" removed.
    References without a match and non-text cells (numbers, missing values)
    are left untouched.

    NOTE(review): the original code guarded each row with a condition that
    compared the first column name with itself and was therefore always true.
    Its comment suggests the lookup was meant to be restricted to formulas
    whose ``Type`` equals the row's own number. That restriction is NOT
    applied here (the lookup stays global by ``Formula#``, as before) because
    the exact format of the two columns cannot be confirmed from this notebook.

    Parameters
    ----------
    theorems_path : str
        CSV file to annotate.
    formulas_path : str
        CSV file with ``Formula#`` and ``latex`` columns.

    Returns
    -------
    str
        Path of the annotated copy ("<name>_modified.csv").
    """
    theorems_df = pd.read_csv(theorems_path)
    formulas_df = pd.read_csv(formulas_path)

    # Formula references look like "(12)".
    pattern = re.compile(r"\((\d+)\)")

    # Normalise the key column once so ints, floats and digit strings compare equal.
    formula_keys = pd.to_numeric(formulas_df['Formula#'], errors='coerce')
    latex_cache = {}

    def lookup(number):
        """Return the raw LaTeX string of the first row with this Formula#, or None."""
        if number not in latex_cache:
            hits = formulas_df.loc[formula_keys == number, 'latex']
            latex = hits.iloc[0] if not hits.empty else None
            # A missing latex value is read as NaN (a float); treat it as "no formula".
            latex_cache[number] = latex if isinstance(latex, str) else None
        return latex_cache[number]

    def expand(match):
        """Return the matched "(n)" followed by its formatted LaTeX, when known."""
        latex = lookup(int(match.group(1)))
        if latex is None:
            return match.group(0)
        return match.group(0) + " <$$" + latex.replace("$", "") + "$$>"

    def annotate(cell):
        # Non-text cells pass through unchanged, so NaN is not turned into "nan".
        # A single pass over the original text means inserted LaTeX is never re-scanned.
        return pattern.sub(expand, cell) if isinstance(cell, str) else cell

    for col in theorems_df.columns[1:]:  # Skip the first column
        theorems_df[col] = theorems_df[col].map(annotate)

    # Derive the output name from the argument, not from a notebook-level variable,
    # and only rewrite the extension itself.
    theorems_path = str(theorems_path)
    if theorems_path.endswith('.csv'):
        modified_path = theorems_path[:-len('.csv')] + '_modified.csv'
    else:
        modified_path = theorems_path + '_modified.csv'
    theorems_df.to_csv(modified_path, index=False)

    return modified_path

In [None]:
# Matches references whose number is written in math mode, e.g.
#   "Theorem $2.3$", "Lemma $2.4 (b)$" or "Remark $5.1$ (2)".
# Groups: 1 = kind, 2 = num.num, 3 = sub-label inside the dollars,
#         4 = sub-label right after the closing dollar.
# A sub-label is a run of digits or a single letter.
pattern = re.compile(
    r"(Theorem|Definition|Remark|Proposition|Lemma)\s+"
    r"\$\s*(\d+\.\d+)\s*"
    r"(?:\((\d+|[A-Za-z])\)\s*\$|\$(?:\s*\((\d+|[A-Za-z])\))?)"
)
def replace_formulas_num2(theorems_path, formulas_path):
    """Append formula LaTeX after "<Kind> $num.num$" references, in place.

    For every text cell outside the first column, each reference matched by
    the module-level ``pattern`` is followed by " <" + formulas + "> ", where
    formulas is a comma-separated list of "<latex>" items taken from the rows
    of ``formulas_path`` whose ``Type#`` equals the kind (e.g. "Theorem") and
    whose ``Type`` equals num.num.

    * Without a sub-label, all formulas of the referenced item are listed.
    * With a sub-label, only rows whose ``Formula#`` equals it are listed. If
      a letter sub-label has no such row, the letter is converted to its
      position in the alphabet (a -> 1, b -> 2, ...) and looked up again.
    * If nothing is found, an empty " <> " marker is inserted.

    Non-text cells (numbers, missing values) are left untouched.

    Parameters
    ----------
    theorems_path : str
        CSV file to annotate; it is overwritten with the result.
    formulas_path : str
        CSV file with ``Type#``, ``Type``, ``Formula#`` and ``latex`` columns.

    Returns
    -------
    str
        ``theorems_path``.
    """
    theorems_df = pd.read_csv(theorems_path)
    # Read identifiers as text so "2.10" is not parsed to 2.1 and "1" does not
    # become "1.0" when the column also has missing values.
    formulas_df = pd.read_csv(formulas_path, dtype=str)

    kinds = formulas_df["Type#"].astype(str).str.strip()
    numbers = formulas_df["Type"].astype(str).str.strip()
    formula_ids = formulas_df["Formula#"].astype(str).str.strip()

    def expand(match):
        """Return the matched reference followed by its formula list."""
        selected = (kinds == match.group(1).strip()) & (numbers == match.group(2).strip())
        subnumber = match.group(3) or match.group(4)

        if subnumber:
            by_label = selected & (formula_ids == subnumber)
            if not by_label.any() and subnumber.isalpha():
                # Letter label not present as such: retry with its ordinal (a -> 1).
                ordinal = str(ord(subnumber.lower()) - ord("a") + 1)
                by_label = selected & (formula_ids == ordinal)
            selected = by_label

        formulas = formulas_df.loc[selected, "latex"].tolist()
        return match.group(0) + " <" + ",".join(f"<{formula}>" for formula in formulas) + "> "

    def annotate(cell):
        # One pass over the original text: every reference is expanded exactly
        # once and inserted text is never re-scanned. NaN / numbers pass through.
        return pattern.sub(expand, cell) if isinstance(cell, str) else cell

    for col in theorems_df.columns[1:]:
        theorems_df[col] = theorems_df[col].map(annotate)

    modified_path = theorems_path
    theorems_df.to_csv(modified_path, index=False)

    return modified_path

In [None]:
def replace_formulas_num1(theorems_path, formulas_path):
    """Append formula LaTeX after plain "<Kind> num.num" references, in place.

    For every text cell outside the first column, each reference such as
    "Theorem 2.3" or "Lemma 2.3 (1)" is followed by " <" + formulas + "> ",
    where formulas is a comma-separated list of "<latex>" items taken from the
    rows of ``formulas_path`` whose ``Type#`` equals the kind (e.g. "Theorem")
    and whose ``Type`` equals num.num. A numeric "(k)" suffix narrows the list
    to rows whose ``Formula#`` equals k. If nothing is found, an empty " <> "
    marker is inserted. Non-text cells (numbers, missing values) are left
    untouched.

    NOTE(review): letter suffixes such as "(a)" are not recognised by this
    pattern; the reference is then treated as having no suffix.

    Parameters
    ----------
    theorems_path : str
        CSV file to annotate; it is overwritten with the result.
    formulas_path : str
        CSV file with ``Type#``, ``Type``, ``Formula#`` and ``latex`` columns.

    Returns
    -------
    str
        ``theorems_path``.
    """
    # Groups: 2 = kind, 3 = num.num, 5 = optional numeric sub-label.
    pattern = re.compile(r"((Theorem|Definition|Remark|Proposition|Lemma)[\s]+)(\d+\.\d+)(\s*\((\d+)\))?")

    theorems_df = pd.read_csv(theorems_path)
    # Read identifiers as text so "2.10" is not parsed to 2.1 and "1" does not
    # become "1.0" when the column also has missing values.
    formulas_df = pd.read_csv(formulas_path, dtype=str)

    kinds = formulas_df["Type#"].astype(str).str.strip()
    numbers = formulas_df["Type"].astype(str).str.strip()
    formula_ids = formulas_df["Formula#"].astype(str).str.strip()

    def expand(match):
        """Return the matched reference followed by its formula list."""
        selected = (kinds == match.group(2).strip()) & (numbers == match.group(3).strip())
        subnumber = match.group(5)
        if subnumber:
            selected = selected & (formula_ids == subnumber.strip())
        # Without a sub-label every formula of the referenced item is listed.

        formulas = formulas_df.loc[selected, "latex"].tolist()
        return match.group(0) + " <" + ",".join(f"<{formula}>" for formula in formulas) + "> "

    def annotate(cell):
        # One pass over the original text: a repeated reference is expanded once
        # per occurrence, and "Theorem 1.2" no longer rewrites the prefix of
        # "Theorem 1.2 (3)" or "Theorem 1.25". NaN / numbers pass through.
        return pattern.sub(expand, cell) if isinstance(cell, str) else cell

    for col in theorems_df.columns[1:]:
        theorems_df[col] = theorems_df[col].map(annotate)

    modified_path = theorems_path
    theorems_df.to_csv(theorems_path, index=False)

    return modified_path


In [None]:
# Annotate the statement tables of both books. Three passes run per file:
# "(n)" references first, then "<Kind> num.num" references, then
# "<Kind> $num.num$" references. The last two passes rewrite the
# "_modified.csv" copy produced by the first one.
folder_paths = ['../../training_data/functional_analysis/', '../../training_data/real_and_complex_analysis/']
file_paths = ['corollaries.csv', 'definitions.csv','propositions.csv', 'theorems_final.csv']
for folder_path in folder_paths:
    # Resolve formulas.csv inside the loop so each book is annotated with its
    # own formulas. Computing it once before the loop used whatever
    # folder_path an earlier cell had left behind for every book.
    formulas_path = folder_path + 'formulas.csv'
    for file in file_paths:
        file_path = folder_path + file
        modified_csv_path = replace_formulas_in_theorems(file_path, formulas_path)
        modified_csv_path = replace_formulas_num1(modified_csv_path, formulas_path)
        modified_csv_path = replace_formulas_num2(modified_csv_path, formulas_path)
        print("Modified CSV saved to:", modified_csv_path)