# Clean the Messy Scores

## Context:
You’ve received a CSV-like list of student scores exported from an old system.
The data is messy: extra spaces, mixed types, missing values, and out-of-range numbers.

## Task:
Write a short Python script that cleans and summarizes the data.

In [30]:
import pandas as pd
import numpy as np

In [31]:
# Example dataset: raw score strings with stray whitespace, non-numeric
# entries, an empty string, a literal "NaN" and values above 100.
raw_scores = [" 89", "95", "  102", "absent", "76 ", "", " 67", "NaN", " 59 ", "110", "83 "]

# Create a DataFrame
df = pd.DataFrame({'raw_score': raw_scores})

# Writing the CSV is opt-in so that re-running this cell does not touch the
# disk, and the status message only claims success when a file was written.
SAVE_CSV = False
if SAVE_CSV:
    df.to_csv('messy_scores.csv', index=False)
    print("CSV file 'messy_scores.csv' created successfully.")
else:
    print("DataFrame created; CSV export skipped (set SAVE_CSV = True to write 'messy_scores.csv').")
print(df.head())

CSV file 'messy_scores.csv' created successfully.
  raw_score
0        89
1        95
2       102
3    absent
4       76 


In [105]:
def clean_messy_scores(dataframe, score_column, verbose = True):
    """Parse a column of messy score values into floats and summarize them.

    Each value is converted to a string, stripped of surrounding whitespace
    and passed to ``float``. Values that parse to a non-NaN number are kept,
    values that parse to NaN are skipped and counted, and values that cannot
    be parsed at all (e.g. ``"absent"`` or an empty string) are collected as
    invalid. No range check is applied, so a score such as 110 is kept.

    Args:
        dataframe: Object indexable by ``score_column`` that yields the raw
            values when iterated (e.g. a pandas DataFrame).
        score_column: Name of the column holding the raw scores.
        verbose: When true, print the cleaned values and summary statistics.

    Returns:
        A dict with three keys: ``"clean_scores"`` (list of floats, in input
        order), ``"invalid"`` (list of stripped strings that failed to
        parse) and ``"nan_count"`` (number of values that parsed to NaN).
    """
    clean_scores = []
    invalid = []
    # Counted directly while iterating. Deriving it from the list lengths
    # inside the loop counts not-yet-processed rows as NaN and leaves the
    # name unbound when no value parses at all.
    nan_count = 0

    for x in dataframe[score_column]:
        # Remove surrounding spaces before attempting the numeric conversion.
        val = str(x).strip()

        try:
            num = float(val)
        except ValueError:
            invalid.append(val)
            continue

        # "NaN"/"nan" parse successfully as floats, so they need their own check.
        if np.isnan(num):
            nan_count += 1
            continue

        clean_scores.append(num)

    if verbose:
        print("\nDataset successfully cleaned!")
        print(f" • Clean scores: {clean_scores}")
        print(f" • Invalid values dropped: {invalid}")
        print(f" • NaN values skipped: {nan_count}")
        print("\nSummary statistics:")
        print(f"   Count : {len(clean_scores)}")
        if clean_scores:
            print(f"   Mean  : {np.mean(clean_scores):.2f}")
            print(f"   Median: {np.median(clean_scores):.2f}")
            print(f"   Min   : {np.min(clean_scores)}")
            print(f"   Max   : {np.max(clean_scores)}")
        else:
            # np.min/np.max raise on an empty sequence, so report the
            # statistics as unavailable instead of crashing.
            print("   Mean  : n/a")
            print("   Median: n/a")
            print("   Min   : n/a")
            print("   Max   : n/a")
        print('\nReturning all vallues\n')

    return {
        "clean_scores": clean_scores,
        "invalid": invalid,
        "nan_count": nan_count
    }

In [106]:
# Clean the example column; as the last expression of the cell, the returned dict is displayed.
clean_messy_scores(dataframe=df, score_column='raw_score')


Dataset successfully cleaned!
 • Clean scores: [89.0, 95.0, 102.0, 76.0, 67.0, 59.0, 110.0, 83.0]
 • Invalid values dropped: ['absent', '']
 • NaN values skipped: 1

Summary statistics:
   Count : 8
   Mean  : 85.12
   Median: 86.00
   Min   : 59.0
   Max   : 110.0

Returning all vallues



{'clean_scores': [89.0, 95.0, 102.0, 76.0, 67.0, 59.0, 110.0, 83.0],
 'invalid': ['absent', ''],
 'nan_count': 1}