# MSC Participants Data Cleaning Pipeline

### 1. Read CSV and show table layout

In [1]:
import pandas as pd

# Load the raw participant registrations into a DataFrame.
participants = pd.read_csv(filepath_or_buffer='./data/raw/participants.csv')

# Preview the first five rows to show the table layout.
participants.head(n=5)

Unnamed: 0,Family ID,ContactName.last,ContactName.first,Email,Center,Address.city,Participants.name.last,Participants.name.first,Participants.gender,Age Group,Grade/Age Category,PAID,Decided
0,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Rangaswamy,Sudharsan,M,Adult,36 to 59 years old,PAID,decided
1,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Usha,F,Adult,36 to 59 years old,PAID,decided
2,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Riti,F,Child,8,PAID,decided
3,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Tanvi,F,Child,2,PAID,decided
4,37,Pandya,Bela,bela.pandya@cmsj.org,cm_San_Jose,San Ramon,Pandya,Bela,F,Adult,36 to 59 years old,PAID,decided


### 2. Show contents of `PAID` and `Decided` columns

In [2]:
# List the distinct payment statuses present in the data.
print("Unique values in PAID column:", participants['PAID'].unique(), sep="\n")

# List the distinct decision statuses; the leading "\n" in the label
# leaves a blank line between the two sections of output.
print("\nUnique values in Decided column:", participants['Decided'].unique(), sep="\n")


Unique values in PAID column:
['PAID' 'PARTLY PAID' 'NOT PAID']

Unique values in Decided column:
['decided' 'duplicate' 'undecided' 'cancelled' 'refunded'
 'partialdonation' 'fulldonation']


### Filter `PAID` and `Decided` columns for desired values

In [3]:
# Keep only rows whose PAID value is "PAID" or "PARTLY PAID", and of those
# only the rows whose Decided value is exactly "decided".
# Each .loc step receives the frame produced by the previous step.
participants = (
    participants
    .loc[lambda frame: frame['PAID'].isin(['PAID', 'PARTLY PAID'])]
    .loc[lambda frame: frame['Decided'] == 'decided']
)

# Preview the filtered table.
participants.head()


Unnamed: 0,Family ID,ContactName.last,ContactName.first,Email,Center,Address.city,Participants.name.last,Participants.name.first,Participants.gender,Age Group,Grade/Age Category,PAID,Decided
0,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Rangaswamy,Sudharsan,M,Adult,36 to 59 years old,PAID,decided
1,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Usha,F,Adult,36 to 59 years old,PAID,decided
2,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Riti,F,Child,8,PAID,decided
3,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Tanvi,F,Child,2,PAID,decided
4,37,Pandya,Bela,bela.pandya@cmsj.org,cm_San_Jose,San Ramon,Pandya,Bela,F,Adult,36 to 59 years old,PAID,decided


### Confirm change in `PAID` and `Decided` columns

In [4]:
# Re-list the distinct PAID values so the effect of the filter can be checked.
print("Unique values in PAID column:", participants['PAID'].unique(), sep="\n")

# Re-list the distinct Decided values; the leading "\n" in the label
# separates this section from the one above with a blank line.
print("\nUnique values in Decided column:", participants['Decided'].unique(), sep="\n")


Unique values in PAID column:
['PAID' 'PARTLY PAID']

Unique values in Decided column:
['decided']


### View all columns

In [5]:
# Display the column labels of the current (filtered) participants table.
participants.columns

Index(['Family ID', 'ContactName.last', 'ContactName.first', 'Email', 'Center',
       'Address.city', 'Participants.name.last', 'Participants.name.first',
       'Participants.gender', 'Age Group', 'Grade/Age Category', 'PAID',
       'Decided'],
      dtype='object')

### 3. Drop Age Group column and confirm change

In [6]:
# Drop the 'Age Group' column; drop() returns a new frame, which is rebound
# to the same name.
participants = participants.drop(columns=['Age Group'])

# Print the remaining column labels to confirm the removal.
print("Columns after removing Age Group:", participants.columns, sep="\n")


Columns after removing Age Group:
Index(['Family ID', 'ContactName.last', 'ContactName.first', 'Email', 'Center',
       'Address.city', 'Participants.name.last', 'Participants.name.first',
       'Participants.gender', 'Grade/Age Category', 'PAID', 'Decided'],
      dtype='object')


### View all values of `Grade/Age Category`

In [7]:
# List the distinct 'Grade/Age Category' labels in sorted order.
print(
    "Unique values in Grade/Age Category column:",
    sorted(participants['Grade/Age Category'].unique()),
    sep="\n",
)


Unique values in Grade/Age Category column:
['1', '10', '11', '12', '2', '3', '30 to 35 years old', '36 to 59 years old', '4', '5', '6', '60 years and above', '7', '8', '9', 'CHYK Non-working', 'CHYK Working', 'Infant/ShishuVihar', 'KG', 'Pre-KG']


### Categorize CHYKs and older as Adults

In [8]:
# Collapse the three age-bracket labels and both CHYK labels into a single
# 'Adult' value. Labels are matched exactly (not by substring); any value
# that is not listed below is left untouched.
participants['Grade/Age Category'] = participants['Grade/Age Category'].replace(
    dict.fromkeys(
        [
            '30 to 35 years old',
            '36 to 59 years old',
            '60 years and above',
            'CHYK Non-working',
            'CHYK Working',
        ],
        'Adult',
    )
)

# Print the sorted distinct values to confirm the conversion.
print(
    "Unique values in Grade/Age Category column after conversion:",
    sorted(participants['Grade/Age Category'].unique()),
    sep="\n",
)


Unique values in Grade/Age Category column after conversion:
['1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9', 'Adult', 'Infant/ShishuVihar', 'KG', 'Pre-KG']


### Rename `Grade/Age Category` to `Category`

In [9]:
# Give the 'Grade/Age Category' column the shorter label 'Category'.
participants = participants.rename(
    {'Grade/Age Category': 'Category'},
    axis='columns',
)

# Print the column labels to confirm the rename.
print("Columns after renaming Grade/Age Category:", participants.columns, sep="\n")


Columns after renaming Grade/Age Category:
Index(['Family ID', 'ContactName.last', 'ContactName.first', 'Email', 'Center',
       'Address.city', 'Participants.name.last', 'Participants.name.first',
       'Participants.gender', 'Category', 'PAID', 'Decided'],
      dtype='object')


### 4. Create empty `checkin` column

In [10]:
# Add a 'checkin' column holding an empty string for every participant
# (an empty string, not a boolean False).
participants = participants.assign(checkin='')

# Print the column labels to confirm the new column is present; the leading
# "\n" in the label puts a blank line before the output.
print("\nDataframe columns after adding checkin:", participants.columns, sep="\n")



Dataframe columns after adding checkin:
Index(['Family ID', 'ContactName.last', 'ContactName.first', 'Email', 'Center',
       'Address.city', 'Participants.name.last', 'Participants.name.first',
       'Participants.gender', 'Category', 'PAID', 'Decided', 'checkin'],
      dtype='object')


### 5. Remove leading and trailing whitespace

In [11]:
# Strip leading/trailing whitespace from every string cell; non-string cells
# (numbers, missing values) pass through unchanged.
#
# DataFrame.applymap is deprecated as of pandas 2.1 in favour of DataFrame.map,
# which older pandas versions do not have. Mapping column by column with
# Series.map applies the same element-wise function and works on both.
participants = participants.apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)

# Preview the cleaned table.
participants.head()

Unnamed: 0,Family ID,ContactName.last,ContactName.first,Email,Center,Address.city,Participants.name.last,Participants.name.first,Participants.gender,Category,PAID,Decided,checkin
0,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Rangaswamy,Sudharsan,M,Adult,PAID,decided,
1,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Usha,F,Adult,PAID,decided,
2,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Riti,F,8,PAID,decided,
3,32,Rangaswamy,Sudharsan,yeomanly@gmail.com,cm_San_Jose,Saratoga,Sudharsan,Tanvi,F,2,PAID,decided,
4,37,Pandya,Bela,bela.pandya@cmsj.org,cm_San_Jose,San Ramon,Pandya,Bela,F,Adult,PAID,decided,


### 6. Save changes to cleaned CSV

In [12]:
# Destination file for the cleaned participant table.
output_path = "./data/cff_participants_cleaned.csv"

# Write the cleaned data; index=False leaves the DataFrame's row index out
# of the file.
participants.to_csv(path_or_buf=output_path, index=False)