In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive (if running in Colab)
# from google.colab import drive
# drive.mount('/content/drive')

# Load the dataset, treating the file's placeholder strings as missing values (NaN).
file_path = '/content/corrupted_datasetStudentPerformanceFactors.csv'
missing_values = ["--", "N/A", "NULL", "Not Available", "Unknown", ""]

try:
    df = pd.read_csv(file_path, na_values=missing_values)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): exit() is meant for scripts and, inside a
    # notebook kernel, ends the session rather than reporting the problem. Every
    # later cell needs `df`, so a visible traceback is the safer failure mode.
    raise FileNotFoundError(
        f"Error: The file '{file_path}' was not found. Please check the path."
    ) from err

1. Initial Inspection

In [None]:
# Snapshot of the raw frame before any cleaning: preview, schema, summary, size.
print("Original Data - Head:", df.head(), sep="\n")
print("\nOriginal Data - Info:")
df.info()  # info() writes its report directly, so it is not wrapped in print()
print("\nOriginal Data - Describe:", df.describe(include='all'), sep="\n")
print("\nOriginal Data - Shape:", df.shape, sep="\n")

2. Data Cleaning

Handle Missing Values and Inconsistencies

In [None]:
# Force these three columns to a numeric dtype; errors='coerce' turns any entry
# that cannot be parsed as a number into NaN.
for numeric_col in ('Hours_Studied', 'Attendance', 'Previous_Scores'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [None]:
# Probe 'Family_Income': attempt a strict numeric conversion (errors='raise') and,
# if it fails, report the values that prevent it so they can be handled explicitly.
try:
    df['Family_Income'] = pd.to_numeric(df['Family_Income'], errors='raise')
except ValueError as conversion_error:
    print(f"\nError converting 'Family_Income' to numeric: {conversion_error}")
    # Rows that a lenient conversion turns into NaN (this also flags rows that
    # were already missing).
    not_numeric = pd.to_numeric(df['Family_Income'], errors='coerce').isna()
    offending_values = df['Family_Income'].loc[not_numeric].unique()
    print("Unique non-numeric values in 'Family_Income':", offending_values)

In [None]:
# Encode the ordinal 'Family_Income' labels as numbers (Low < Medium < High).
income_codes = {'Low': 1, 'Medium': 2, 'High': 3}
# Look each value up individually and keep anything that is not a known label
# as-is, then coerce to numeric. This guarantees a numeric dtype for the later
# histogram/correlation cells (unrecognised labels become NaN and are imputed
# afterwards), does not rely on replace() downcasting the column, and keeps the
# cell safe to re-run: values already encoded as 1/2/3 pass through unchanged.
df['Family_Income'] = pd.to_numeric(
    df['Family_Income'].map(lambda level: income_codes.get(level, level)),
    errors='coerce',
)

# Clean up 'Exam_Score': entries that cannot be parsed as numbers become NaN.
df['Exam_Score'] = pd.to_numeric(df['Exam_Score'], errors='coerce')

# Normalise the capitalisation of the 'Gender' labels (this does not fill in
# missing values; other spellings are left untouched).
df['Gender'] = df['Gender'].replace({'Male': 'MALE', 'Female': 'FEMALE'})

In [None]:
# Summary statistics before imputation. Only a cell's last expression is
# displayed automatically, so this one has to be printed explicitly.
print(df.describe())

# Impute numerical missing values with the median
for col in [
    'Hours_Studied', 'Attendance', 'Previous_Scores',
    'Exam_Score', 'Sleep_Hours', 'Physical_Activity'
]:
    # Coerce first: 'Sleep_Hours' and 'Physical_Activity' are not converted in
    # any earlier cell, and .median() fails on a column that still holds text.
    # Columns that are already numeric pass through unchanged.
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].median())

# Summary statistics after imputation (displayed as the cell's output).
df.describe()

In [None]:
# Non-null counts before filling the categorical gaps.
df.info()

# Columns treated as categorical: each gap is filled with the column's most
# frequent value.
mode_filled_cols = [
    'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
    'Internet_Access', 'Tutoring_Sessions', 'School_Type','Peer_Influence',
    'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home',
    'Gender','Family_Income', 'Teacher_Quality', 'Motivation_Level'
]
for column_name in mode_filled_cols:
    # mode() returns a Series; [0] selects its first entry.
    most_frequent = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(most_frequent)

# Non-null counts after filling, for comparison with the report above.
df.info()

In [None]:
# Show the dtype pandas currently assigns to every column.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Re-inspect the frame after cleaning: schema first, then a preview of the rows.
print("\nCleaned Data - Info:")
df.info()
print("\nCleaned Data - Head:", df.head(), sep="\n")

In [None]:
# Summary of every column after cleaning. A bare expression is displayed only
# when it is the last statement of a cell, so the summary is printed explicitly;
# otherwise its result would be silently discarded.
print(df.describe(include='all'))
print(df.shape)

In [None]:
# Step 3: Feature Engineering
# Ordinal encodings for the two parental attributes; a label missing from a
# mapping yields NaN for that row.
involvement_levels = {'Low': 1, 'Medium': 2, 'High': 3}
education_levels = {'High School': 1, 'College': 2, 'Postgraduate': 3}

df['Parental_Involvement_Score'] = df['Parental_Involvement'].map(involvement_levels)
df['Parental_Education_Score'] = df['Parental_Education_Level'].map(education_levels)

# Combined score: element-wise sum of the two encodings.
df['Combined_Parental_Score'] = df['Parental_Involvement_Score'].add(df['Parental_Education_Score'])

3. Exploratory Data Analysis (EDA)

Univariate Analysis

In [None]:
# Univariate view of the numeric features: one histogram per column.
# `numerical_cols` is reused by the scatter-plot and correlation cells further down.
numerical_cols = ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Family_Income', 'Exam_Score','Combined_Parental_Score']
for feature in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[feature], kde=False, ax=ax)  # kde=False: bars only, no density curve
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Univariate view of the categorical features: one count plot per column.
# `categorical_cols` is reused by the box-plot cell further down.
categorical_cols = ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
                    'Internet_Access', 'Tutoring_Sessions', 'Teacher_Quality', 'School_Type',
                    'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Gender']
for category in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=category, ax=ax)
    ax.set_title(f'Distribution of {category}')
    plt.xticks(rotation=45)  # tilt the category labels on the x-axis
    plt.show()

In [None]:
# Bivariate view: each numeric feature plotted against 'Exam_Score'.
# The target itself is left out so it is never plotted against itself.
predictors = [name for name in numerical_cols if name != 'Exam_Score']
for predictor in predictors:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x=predictor, y='Exam_Score', ax=ax)
    ax.set_title(f'{predictor} vs. Exam_Score')
    plt.show()

In [None]:
# Bivariate view: distribution of 'Exam_Score' within each level of every
# categorical feature.
for category in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x=category, y='Exam_Score', ax=ax)
    ax.set_title(f'{category} vs. Exam_Score')
    plt.xticks(rotation=45)  # tilt the category labels on the x-axis
    plt.show()

In [None]:
# Pairwise correlations between the numeric features, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
correlations = df[numerical_cols].corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)  # two-decimal cell labels
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the target is under /content/drive, but the drive.mount(...) call
# in the first cell is commented out -- confirm Drive is mounted before running.
cleaned_csv_path = '/content/drive/MyDrive/Colab Notebooks/datasets/cleaned_student_performance.csv'
df.to_csv(cleaned_csv_path, index=False)

Code Explanation:

Import Libraries: Imports necessary libraries (pandas, numpy, matplotlib, seaborn).

Load Dataset:

Loads the corrupted_datasetStudentPerformanceFactors.csv file into a pandas DataFrame.

Uses na_values to specify a list of strings that should be treated as missing values (NaN). This is crucial for handling the various inconsistent missing value representations.

Prints the head(), info(), and describe() of the original DataFrame to show the initial state of the data. This helps you understand the columns, data types, and initial summary statistics (including missing value counts).

Data Cleaning:

pd.to_numeric(errors='coerce'): This is the key to handling the mixed data types and non-numeric entries in numerical columns.

errors='coerce' is essential. It tells pandas to convert anything that cannot be converted to a number into NaN. This is how we get rid of the text/symbols in the numerical columns.

After using pd.to_numeric(errors='coerce'), you'll have NaN values wherever there were strings or symbols.

Imputation (Missing Value Handling): After converting columns to numeric and handling inconsistent strings, you'll likely have more NaN values. The code shows how to:

Impute numerical columns with the median (more robust to outliers than the mean). df[col] = df[col].fillna(df[col].median()) fills missing values in the column col with the median of that column and assigns the result back to the column.

Impute categorical columns with the mode (most frequent value). df[col] = df[col].fillna(df[col].mode()[0]) fills missing values in the column col with the most frequent value in that column and assigns the result back to the column. mode() returns a Series, so [0] is used to get the first (and usually only) mode.

Handling the inconsistent values in 'Gender': Used the replace method to rewrite 'Male' and 'Female' as 'MALE' and 'FEMALE', so that the strings are consistent.

Feature Engineering:

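Created two ordinal scores, Parental_Involvement_Score and Parental_Education_Score, by mapping the labels in Parental_Involvement and Parental_Education_Level to the values 1–3, and added them together into a new column, Combined_Parental_Score.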

Verification: df.info() and df.head() are used again to show the cleaned data. This is important for demonstrating the effect of your cleaning steps.

Exploratory Data Analysis (EDA):

Univariate Analysis: Creates histograms and count plots (bar plots for categorical variables) to visualize the distribution of each variable individually. This helps you understand:

Numerical: Shape (skewness, modality), central tendency (where is the "middle"), spread (how wide is the distribution), outliers.

Categorical: Frequency of each category, class imbalance.

Bivariate Analysis:

Scatter Plots: Show the relationship between each numerical feature and the 'Exam_Score'. Look for trends (positive, negative, non-linear).

Box Plots: Show the distribution of 'Exam_Score' for each category of a categorical variable. Look for differences in the median, spread, and presence of outliers across categories.

Correlation Matrix: Calculates and displays the correlation coefficients between all pairs of numerical features. This helps identify strong linear relationships (+1 = perfect positive, -1 = perfect negative, 0 = no linear relationship).

Save Cleaned Data (Optional): Saves the cleaned DataFrame to a new CSV file. This is useful for subsequent steps (like model building) without having to rerun the cleaning process every time.