In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive when running inside Colab. Outside Colab the
# google.colab package is not importable, so the mount is skipped and the
# load below reports the missing file instead.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

# Load the dataset, treating the placeholder strings below as missing values
file_path = '/content/drive/MyDrive/Colab Notebooks/datasets/corrupted_datasetStudentPerformanceFactors.csv'
missing_values = ["--", "N/A", "NULL", "Not Available", "Unknown", ""]

try:
    df = pd.read_csv(file_path, na_values=missing_values)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the
    # kernel session down, whereas an exception just halts at this cell
    # (and stops "Run All") while keeping the session alive.
    raise

Initial Inspection

In [None]:
# First look at the freshly loaded frame: sample rows, schema, summary
# statistics and dimensions, each under its own printed heading.
print("Original Data - Head:", df.head(), sep="\n")

print("\nOriginal Data - Info:")
df.info()  # info() writes its report itself, so it is not wrapped in print()

print("\nOriginal Data - Describe:", df.describe(include='all'), sep="\n")
print("\nOriginal Data - Shape:", df.shape, sep="\n")

Handle Missing Values and Inconsistencies

In [None]:
# Force these columns to a numeric dtype. errors='coerce' turns every entry
# that cannot be parsed as a number into NaN instead of raising.
for numeric_col in ('Hours_Studied', 'Attendance', 'Previous_Scores'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [None]:
# Try a strict numeric conversion of 'Family_Income'. errors='raise' is used
# on purpose so that unparseable entries surface as a ValueError; the handler
# then lists exactly which distinct values blocked the conversion.
try:
    df['Family_Income'] = pd.to_numeric(df['Family_Income'], errors='raise')
except ValueError as e:
    print(f"\nError converting 'Family_Income' to numeric: {e}")
    income = df['Family_Income']
    # A value is "non-numeric" only if it is present AND fails to parse.
    # Checking the coerced result alone would also flag entries that were
    # already missing, and report NaN as a non-numeric value.
    unparseable = income.notna() & pd.to_numeric(income, errors='coerce').isna()
    print("Unique non-numeric values in 'Family_Income':", income.loc[unparseable].unique())

In [None]:
# Encode the 'Family_Income' labels as integers (Low=1, Medium=2, High=3).
# replace() leaves any value that is not a key of the mapping untouched.
income_codes = {'Low':1, 'Medium':2, 'High':3}
df['Family_Income'] = df['Family_Income'].replace(income_codes)

# Make 'Exam_Score' numeric; entries that cannot be parsed become NaN
df['Exam_Score'] = pd.to_numeric(df['Exam_Score'], errors='coerce')

# Rewrite the capitalised gender labels in upper case; other values are
# left as they are.
gender_labels = {'Male': 'MALE', 'Female': 'FEMALE'}
df['Gender'] = df['Gender'].replace(gender_labels)

In [None]:
# Summary statistics before imputation. Printed explicitly because a bare
# expression is only displayed when it is the last statement of the cell.
print(df.describe())

# Impute numerical missing values with each column's median.
# NOTE(review): 'Sleep_Hours' and 'Physical_Activity' are not passed through
# pd.to_numeric in the cells shown here - presumably they already load as
# numeric; confirm, otherwise median() will fail on them.
median_fill_cols = [
    'Hours_Studied', 'Attendance', 'Previous_Scores',
    'Exam_Score', 'Sleep_Hours', 'Physical_Activity'
]
for col in median_fill_cols:
    df[col] = df[col].fillna(df[col].median())

# Summary statistics after imputation (last expression, so it is displayed)
df.describe()

In [None]:
# Non-null counts before the fill
df.info()

# Fill the remaining gaps in these columns with each column's most frequent
# value (the first entry returned by mode()).
mode_fill_cols = [
    'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
    'Internet_Access', 'Tutoring_Sessions', 'School_Type','Peer_Influence',
    'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home',
    'Gender','Family_Income', 'Teacher_Quality', 'Motivation_Level'
]
for col in mode_fill_cols:
    modes = df[col].mode()
    if modes.empty:
        # mode() ignores NaN, so a column with no non-missing values yields
        # an empty result and indexing it with [0] would raise. Skip the
        # column and say so rather than aborting the whole cell.
        print(f"Warning: column '{col}' has no non-missing values; leaving it unfilled.")
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Non-null counts after the fill
df.info()