In [None]:
############################################
# 1. IMPORT LIBRARIES & MOUNT (if needed)
############################################

import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt

# Plot defaults for the notebook: seaborn "whitegrid" theme, 100-dpi figures.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.dpi": 100})

We begin by importing the essential libraries used throughout the project:

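pandas and numpy handle data manipulation and numerical operations; random is available for picking random values or making random choices if needed; seaborn and matplotlib are used for data visualization. We also set seaborn's "whitegrid" style and a default figure resolution of 100 DPI.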

In [None]:
############################################
# 2. LOAD DATASET
############################################

# Placeholder strings that pandas should read as missing values (NaN).
missing_values = [
    "Not Available",
    "N/A",
    "na",
    "NaN",
    "nan",
    "NULL",
    "Unknown",
    "--",
    "not available",
    "unknown",
    "null",
    "",
]

# Path to the raw CSV file.
file_path = "/content/Final_corrupted_dataset.csv"

# Load the CSV, mapping every placeholder above to NaN.
df = pd.read_csv(file_path, na_values=missing_values)

# print("Initial dataset shape:", df.shape)
# df.head(10)

We define a list of missing_values placeholders, such as "Not Available", "N/A", and "". Passing it as na_values=missing_values tells pandas to treat any cell matching one of these strings as NaN (a missing value) when reading the CSV. pd.read_csv then loads the file into the DataFrame df. The two commented-out lines can be enabled to print the initial shape of the DataFrame (how many rows/columns we have) and to show the first 10 rows with df.head(10), so we can quickly inspect what the raw data looks like.

In [None]:
############################################
# 3. REMOVE DUPLICATES
############################################

# Drop exact duplicate rows in place; ignore_index=True relabels the
# remaining rows 0..n-1 in the same call.
df.drop_duplicates(inplace=True, ignore_index=True)
print("After removing duplicates, shape =", df.shape)

drop_duplicates scans the DataFrame for any exact duplicate rows and removes them. This prevents counting the same record multiple times. reset_index(drop=True) reassigns a new integer index from 0 to len(df)-1. We print the new shape to confirm how many rows remain after duplicates are removed. This helps keep the data unique and consistent.

In [None]:
############################################
# 4. QUICK INSPECTION
############################################

# print("\n--- INFO ---")
# df.info()

# print("\n--- DESCRIBE (include='all') ---")
# display(df.describe(include='all'))

# print("\n--- SAMPLE ROWS ---")
# display(df.sample(5))

df.info(): Summarizes each column’s data type, the number of non-null values, and the overall memory usage. This helps identify columns with missing data or incorrect data types. df.describe(include='all'): Gives a statistical summary for all columns—both numeric (mean, std, min, max) and categorical (count, unique, top, freq). df.sample(5): Displays 5 random rows so we can see the variety of entries. This can reveal unexpected placeholders or anomalies that might not appear in the first few rows.

In [None]:
############################################
# 5. CONVERT WRITTEN-OUT NUMBERS TO DIGITS
############################################

# Number words for 0-19, which are single irregular words in English.
word_to_number_dict = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
    "fourteen": 14, "fifteen": 15, "sixteen": 16,
    "seventeen": 17, "eighteen": 18, "nineteen": 19,
}
# Multiples of ten from 20 to 90; combined with a 1-9 unit for 21-99.
tens_dict = {
    "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50,
    "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90,
}


def words_to_numbers(value):
    """Convert an English number word (0-99) to an integer if possible.

    Matching ignores case and surrounding whitespace. Compound numbers may
    be written with a hyphen, a space, or both ("sixty-five", "sixty five",
    "sixty - five").

    Args:
        value: A cell value. Anything that is not a ``str`` (numbers, NaN,
            None, ...) is returned untouched.

    Returns:
        The integer for a recognized number word; otherwise ``value``
        exactly as it was passed in.
    """
    if not isinstance(value, str):
        return value  # Already numeric or missing: nothing to convert.

    word = value.strip().lower()

    # Single-word numbers: 0-19 and the round tens.
    if word in word_to_number_dict:
        return word_to_number_dict[word]
    if word in tens_dict:
        return tens_dict[word]

    # Compound numbers: a tens word followed by a unit word, separated by
    # a hyphen and/or whitespace.
    parts = word.replace("-", " ").split()
    if len(parts) == 2:
        tens_word, unit_word = parts
        unit = word_to_number_dict.get(unit_word, 0)
        # Only 1-9 is a valid unit; this rejects nonsense such as
        # "twenty-fifteen" or "twenty-zero" instead of summing them.
        if tens_word in tens_dict and 1 <= unit <= 9:
            return tens_dict[tens_word] + unit

    # Not recognized: hand the original value back unchanged.
    return value

# Columns meant to be numeric that may still hold spelled-out numbers.
possible_text_nums = [
    "Hours_Studied",
    "Sleep_Hours",
    "Previous_Scores",
    "Tutoring_Sessions",
    "Physical_Activity",
    "Exam_Score",
]

# Run the word-to-number conversion on every listed column that is present.
for col in possible_text_nums:
    if col not in df.columns:
        continue  # Column absent from this dataset: nothing to convert.
    df[col] = df[col].apply(words_to_numbers)


Some numeric columns may contain textual representations (e.g., "sixty-five") that should be actual numbers (65). words_to_numbers converts number words from zero to ninety-nine (e.g., "eight", "forty", "twenty-three") into the corresponding integer; any value it does not recognize, including non-string values, is returned unchanged. We apply this function only to columns we suspect might have textual numeric data (like Hours_Studied, Sleep_Hours, etc.). After this step, recognized number words such as "eight" no longer appear as strings in those columns; any remaining text is dealt with by the numeric coercion in the next step.

In [None]:
############################################
# 6. FORCE NUMERIC COLUMNS
############################################

# Columns that should end up with a numeric dtype.
numeric_cols = [
    "Hours_Studied",
    "Attendance",
    "Sleep_Hours",
    "Previous_Scores",
    "Tutoring_Sessions",
    "Physical_Activity",
    "Exam_Score",  # If it exists
]

for col in numeric_cols:
    if col not in df.columns:
        continue  # Skip columns that this dataset does not have.
    # Parse as numbers; entries that cannot be parsed become NaN.
    df[col] = pd.to_numeric(df[col], errors='coerce')


We define numeric_cols as the columns that we want to be numeric. pd.to_numeric with errors='coerce' attempts to convert each entry to a number; anything it cannot parse (e.g., leftover text) becomes NaN. This step is critical if the dataset has inconsistent types. By the end, every listed column that exists in the DataFrame has a numeric dtype, with unparseable entries marked as missing.