# baseline creation
This notebook is intended to create a baseline for the self-labeled dataset.

In [1]:
import pandas as pd
import random

In [2]:
# load data set

df = pd.read_csv('./../../datasets/baseline_creation.csv', delimiter = ";")

print(df)

                                      PDF-Name                       Topic  \
0                             ase_combined.pdf  Agile Software Engineering   
1                             ase_combined.pdf  Agile Software Engineering   
2                             ase_combined.pdf  Agile Software Engineering   
3                             ase_combined.pdf  Agile Software Engineering   
4                             ase_combined.pdf  Agile Software Engineering   
...                                        ...                         ...   
5734  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5735  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5736  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5737  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5738  it-security-all-slides_no_duplicates.pdf                 IT-Security   

      Page Number Marked for processing Includes Image Data Inc

In [4]:
# Boolean indexing to filter out specific values from 'feature_column'
filtered_df = df[(df['Marked for processing'] != 'No') & (~df['Marked for processing'].isna())]


# Display the filtered DataFrame
print(filtered_df)

                                      PDF-Name                       Topic  \
5                             ase_combined.pdf  Agile Software Engineering   
6                             ase_combined.pdf  Agile Software Engineering   
7                             ase_combined.pdf  Agile Software Engineering   
8                             ase_combined.pdf  Agile Software Engineering   
9                             ase_combined.pdf  Agile Software Engineering   
...                                        ...                         ...   
5734  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5735  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5736  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5737  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5738  it-security-all-slides_no_duplicates.pdf                 IT-Security   

      Page Number Marked for processing Includes Image Data Inc

In [5]:
# create set of possible question types

question_types = ["What are {}?", "What is {}?", "How is the definition of {}?", "How does {} work?", "How is {} defined?", "What is the idea of {}?", "What do you know about {}?"]

In [6]:
def create_baseline_question(array_of_strings, dataframe, loop_column_name):
    """
    Function to add the baseline question based on the title of the slide.

    Parameters:
        - array_of_strings (list): A list of strings containing placeholders.
        - dataframe (pd.DataFrame): The DataFrame to which the placeholder column will be added.
        - loop_column_name (str): The name of the column in the DataFrame used for the loop.

    Returns:
        - pd.DataFrame: The DataFrame with an additional Baseline Question column.
    """
    # Create an empty list to store the values for the baseline questions
    baseline_questions = []

    # Iterate through each row of the DataFrame
    for index, row in dataframe.iterrows():
        # Select a random placeholder string from the array
        random_placeholder = random.choice(array_of_strings)

        # Get the current value of the loop_column_name for this row
        loop_value = row[loop_column_name]

        # Combine the random placeholder string with the loop_value
        placeholder_value = random_placeholder.format(loop_value)

        # Append the placeholder_value to the list
        baseline_questions.append(placeholder_value)

    # Add the new placeholder column to the DataFrame
    dataframe['Baseline Question'] = baseline_questions

    return dataframe

In [7]:
df_baseline = create_baseline_question(question_types, filtered_df, "Title of the slide")

print(df_baseline)

                                      PDF-Name                       Topic  \
5                             ase_combined.pdf  Agile Software Engineering   
6                             ase_combined.pdf  Agile Software Engineering   
7                             ase_combined.pdf  Agile Software Engineering   
8                             ase_combined.pdf  Agile Software Engineering   
9                             ase_combined.pdf  Agile Software Engineering   
...                                        ...                         ...   
5734  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5735  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5736  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5737  it-security-all-slides_no_duplicates.pdf                 IT-Security   
5738  it-security-all-slides_no_duplicates.pdf                 IT-Security   

      Page Number Marked for processing Includes Image Data Inc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Baseline Question'] = baseline_questions


In [8]:
# store the dataframe to a csv file
df_baseline.to_csv("./../../datasets/baseline.csv", index=False)