In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the modified students-performance CSV into the working DataFrame.
dataset_path = 'datasets/StudentsPerformance_modified.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Print the first ten rows rendered as a plain-text table.
first_rows = df.head(10)
print(first_rows.to_string())

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
row_col_counts = df.shape
print("(Rows, Cols) :", row_col_counts)

In [None]:
# Report the total number of cells in the frame (rows x columns).
cell_count = df.size
print("Total cells :", cell_count)

In [None]:
# List the dtype currently assigned to each column.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes this report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
# Data type conversion

# Coerce 'math score' to a numeric dtype: any value that cannot be parsed as a
# number becomes NaN, and the rows holding those NaNs are then dropped.
math_scores = pd.to_numeric(df['math score'], errors="coerce")
df['math score'] = math_scores
df.dropna(subset=["math score"], inplace=True)

# Re-type these columns as categorical.
for categorical_column in ('gender', 'race/ethnicity', 'lunch'):
    df[categorical_column] = df[categorical_column].astype('category')

In [None]:
# Boolean frame with the same shape as df: True wherever a cell is null, False otherwise
df.isna()

In [None]:
# Per-column tally of null cells
null_mask = df.isna()
null_mask.sum()

In [None]:
# Handling null values
#
# The three strategies below are alternatives, not a pipeline. Each one is applied
# to the same input frame and stored under its own name so its effect can be
# inspected; running them one after another on `df` would make every strategy
# after the first a no-op, because no nulls would be left to fill.

# 1] Remove every row that contains at least one null
df_rows_dropped = df.dropna()

# 2] Replace with mean (median / mode work the same way).
#    The fill goes through DataFrame.fillna with a column -> value mapping rather
#    than `df[col].fillna(..., inplace=True)`: the latter operates on a temporary
#    Series, is deprecated in pandas 2.x, and does not update df under Copy-on-Write.
reading_score_mean = df['reading score'].mean()
writing_score_mean = df['writing score'].mean()
df_mean_filled = df.fillna({
    'reading score': reading_score_mean,
    'writing score': writing_score_mean,
})

# 3] Forward fill, then backward fill for anything still null
#    (e.g. leading rows that have no previous value to copy from)
df_neighbour_filled = df.ffill().bfill()

# Carry on with strategy 1, so `df` ends this cell free of nulls, as before.
df.dropna(inplace=True)

In [None]:
# Converting categorical variables to quantitative variables

# 1] One-hot encoding - creates one binary column per category of a categorical
#    variable; a 1 marks the presence of that category and a 0 its absence.
#    The result is kept in a separate frame, so `df` itself is not modified here.
df_encoded = pd.get_dummies(df, columns=['gender', 'lunch', 'race/ethnicity'])
df_encoded.info()
# Print the preview explicitly: a bare expression in the middle of a cell is
# evaluated and discarded, so nothing would be shown.
print(df_encoded.head(5).to_string())

# 2] Label encoding - assigns a unique integer to each category of a categorical variable.
#    This overwrites df['gender'] with the integer codes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
print(df.head(5).to_string())

In [None]:
# Normalization

columns_to_normalize1 = ['math score']

# 1] MinMax normalization - rescales the values of a numerical variable to a
#    fixed range, here 0 to 1.
#    Note: a constant column (max == min) has a zero range, so the division
#    would produce NaN for that column.
for column in columns_to_normalize1:
    column_min = df[column].min()
    column_range = df[column].max() - column_min
    df[column] = (df[column] - column_min) / column_range
# Print the preview explicitly: a bare expression in the middle of a cell is
# evaluated and discarded, so nothing would be shown.
print(df.head(5).to_string())

columns_to_normalize2 = ['reading score']

# 2] Zscore normalization - transforms the values of a numerical variable to
#    have a mean of 0 and a standard deviation of 1.
#    Series.std() uses the sample standard deviation (ddof=1) by default.
for column in columns_to_normalize2:
    column_mean = df[column].mean()
    column_std = df[column].std()
    df[column] = (df[column] - column_mean) / column_std
print(df.head(5).to_string())