<h3><strong>Import Libraries</strong></h3>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import sys
import os

# Add parent directory to path to import src.
# '..' is resolved against the current working directory, so this assumes the
# kernel is started one level below the directory that contains `src/`
# (TODO confirm when launching the notebook from a different location).
sys.path.append(os.path.abspath('..'))
from src.data.cleaning import clean_data

<h3><strong>Import and Verify CSV</strong></h3>

In [None]:
# Read the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../dataset.csv')

# Preview the first rows; a bare last expression is rendered as a rich table,
# whereas print() would fall back to plain, wrapped text.
df.head()

<h3><strong>Data Cleaning & Initial Inspection</strong></h3>

In [None]:
# Shape of the dataframe as (rows, columns), shown as the cell's output value
df.shape

In [None]:
# Number of null values per column, shown as the cell's output value
df.isnull().sum()

In [None]:
# Clean the data using the project pipeline.
# NOTE(review): `df` is overwritten here, so re-running this cell passes the
# already-cleaned frame back into clean_data. Re-run the CSV loading cell first
# unless clean_data is known to be idempotent (not verifiable from this notebook).
df = clean_data(df)

# Preview the cleaned rows with the notebook's rich table rendering
df.head()

In [None]:
# Column names of the cleaned dataframe (kept in `cols` for reuse)
cols = df.columns
cols

In [None]:
# Names of the columns that hold numerical values (this lists names, not a count)
numerical_cols = df.select_dtypes(include=np.number).columns
numerical_cols

In [None]:
# Data type of every column in the dataframe, shown as the cell's output value
df.dtypes

<h3><strong>Exploratory Data Analysis</strong></h3>

In [None]:
# Summary statistics; left as the last expression so the notebook renders the
# result as a rich table rather than plain text.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all numeric columns.
# `co_mtx` is kept as a notebook-level name because later cells read it.
co_mtx = df.corr(numeric_only=True)

plt.figure(figsize=(20, 20))
sns.heatmap(
    data=co_mtx,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
).set_title('Correlation Heatmap Between All Numerical Columns')
plt.show()

In [None]:
# Show features whose correlation with at least one *other* feature is above
# the threshold (probably unnecessary)
threshold = 0.4
abs_co_mtx = co_mtx.abs()

# Exclude self-correlations by position instead of testing `abs() < 1`:
# the value test would also discard a genuinely perfect correlation between two
# different features, which is exactly the kind of pair this view should show.
off_diagonal = ~np.eye(len(co_mtx), dtype=bool)
filtered = (abs_co_mtx > threshold) & off_diagonal
filtered_co_mtx = co_mtx.loc[filtered.any(axis=1), filtered.any(axis=0)]

plt.figure(figsize=(20, 20))
sns.heatmap(filtered_co_mtx, cmap='coolwarm', fmt='.3f', annot=True)
plt.title('Correlation Heatmap of Features with High Correlations')
plt.show()

In [None]:
# Correlation of every numeric feature with the Target column.
# The Target row is dropped (self-correlation) and the result is kept as a
# one-column DataFrame because sns.heatmap needs 2D input.
target_co = co_mtx['Target'].drop('Target').to_frame()

plt.figure(figsize=(5, 15))
sns.heatmap(target_co, cmap='coolwarm', annot=True)
# Titled like the other heatmaps so the figure stands on its own when skimmed
plt.title('Correlation of Features with Target')
plt.show()


In [None]:
# Distribution of Semester 1 Enrolled with Target hue
palette = {0: '#0072B2', 1: '#D55E00', 2: '#F0E442'}

plt.figure(figsize=(10, 10))
fig = sns.histplot(data=df, x='Curricular units 1st sem (enrolled)', hue='Target', palette=palette)
legend_elements = [
    Patch(facecolor='#0072B2', label='Dropout'),
    Patch(facecolor='#D55E00', label='Enrolled'),
    Patch(facecolor='#F0E442', label='Graduate')
]
fig.set_title('Distribution of Semester 1 Enrolled')
fig.legend(handles=legend_elements, title='Target', loc='upper right')
plt.show()

In [None]:
# Distribution of Semester 2 Enrolled with Target hue
palette = {0: '#0072B2', 1: '#D55E00', 2: '#F0E442'}

plt.figure(figsize=(10, 10))
fig = sns.histplot(data=df, x='Curricular units 2nd sem (enrolled)', hue='Target', palette=palette)
legend_elements = [
    Patch(facecolor='#0072B2', label='Dropout'),
    Patch(facecolor='#D55E00', label='Enrolled'),
    Patch(facecolor='#F0E442', label='Graduate')
]
fig.set_title('Distribution of Semester 2 Enrolled')
fig.legend(handles=legend_elements, title='Target', loc='upper right')
plt.show()

In [None]:
# Distribution of Semester 1 Grades with Target hue
palette = {0: '#0072B2', 1: '#D55E00', 2: '#F0E442'}

plt.figure(figsize=(10, 10))
fig = sns.histplot(data=df, x='Curricular units 1st sem (grade)', hue='Target', palette=palette)
legend_elements = [
    Patch(facecolor='#0072B2', label='Dropout'),
    Patch(facecolor='#D55E00', label='Enrolled'),
    Patch(facecolor='#F0E442', label='Graduate')
]
fig.set_title('Distribution of Semester 1 Grades')
fig.legend(handles=legend_elements, title='Target', loc='upper right')
plt.show()

In [None]:
# Distribution of Semester 2 Grades with Target hue
palette = {0: '#0072B2', 1: '#D55E00', 2: '#F0E442'}

plt.figure(figsize=(10, 10))
fig = sns.histplot(data=df, x='Curricular units 2nd sem (grade)', hue='Target', palette=palette)
legend_elements = [
    Patch(facecolor='#0072B2', label='Dropout'),
    Patch(facecolor='#D55E00', label='Enrolled'),
    Patch(facecolor='#F0E442', label='Graduate')
]
fig.set_title('Distribution of Semester 2 Grades')
fig.legend(handles=legend_elements, title='Target', loc='upper right')
plt.show()

In [None]:
# 1st SemesterGrade Distribution by Student Outcome
palette = ['#0072B2','#D55E00','#F0E442']

sns.boxplot(y='Target', x='Curricular units 1st sem (grade)', data=df, palette=palette, orient='h')
plt.title('1st Semester Grade Distribution by Student Outcome')
plt.xlabel('1st Semester Grade Units')
plt.yticks([0,1,2], ['Dropout', 'Enrolled', 'Graduate'])
plt.show()

In [None]:
# 2nd Semester Grade Distribution by Student Outcome
palette = ['#0072B2','#D55E00','#F0E442']

sns.boxplot(y='Target', x='Curricular units 2nd sem (grade)', data=df, palette=palette, orient='h')
plt.title('2nd Semester Grade Distribution by Student Outcome')
plt.xlabel('2nd Semester Grade Units')
plt.yticks([0,1,2], ['Dropout', 'Enrolled', 'Graduate'])
plt.show()

In [None]:
# Tuition fees up to date for each target class
# One colour per Target class, listed in the same order as the legend entries below
palette = ['#0072B2','#D55E00','#F0E442']

# NOTE: despite its name, `fig` holds the object returned by sns.countplot
# (the axes that set_title / legend are called on below), not a Figure.
fig = sns.countplot(x='Tuition fees up to date', data=df, hue='Target', palette=palette)
# Hand-built legend so the classes are listed by name.
# Assumes seaborn assigns the palette colours to the Target values in the order
# Dropout, Enrolled, Graduate — TODO confirm against the Target encoding.
legend_elements = [
    Patch(facecolor='#0072B2', label='Dropout'),
    Patch(facecolor='#D55E00', label='Enrolled'),
    Patch(facecolor='#F0E442', label='Graduate')
]
fig.set_title('Tuition Fees Up-to-Date by Target Class')
fig.legend(handles=legend_elements, title='Target', loc='upper left')
plt.xlabel('Tuition Fees Up-to-Date')
plt.ylabel('Counts')
# Relabel the two x positions; assumes the column is coded 0 = No, 1 = Yes (TODO confirm)
plt.xticks([0, 1], ['No', 'Yes'])