# **STUDENT AI** - EDA
Exploratory Data Analysis

## Objectives

Conduct exploratory analysis of the data to glean insights into how it might be used to solve the business case. The final objective is to identify the significant variables to train the model on.

## Inputs

Cleaned Dataset from previous notebook.

## Outputs

None


### Install new libraries needed for EDA

---

# Import required libraries

In [6]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import ppscore as pps
import pingouin as pg
import pickle
import scipy.stats as stats
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from feature_engine import transformation as vt
from sklearn.pipeline import Pipeline

print('All Libraries Loaded')

# Default matplotlib figure size ([width, height] in inches) for every plot
# that does not request its own size
plt.rcParams.update({'figure.figsize': [3, 3]})
# Named colours for plotting (not referenced in the cells shown here)
colors = ['red', 'blue', 'green', 'orange', 'purple', 'yellow']

All Libraries Loaded


# Load Clean Dataset

### Set the working directory to notebook parent folder
If the output does not match, click **'clear all outputs'** and then **'restart'** the notebook. 
Then run cells from top to bottom.

In [None]:
# Step up one level from the current working directory so that the relative
# 'outputs/...' paths used by the following cells resolve.
# NOTE: each execution of this cell moves up one more level, so run it only
# once per kernel session.
os.chdir(os.path.dirname(os.getcwd()))
current_dir = os.getcwd()
print('If correct, Active Directory should read: /workspace/student-AI')
print(f"Active Directory: {current_dir}")

### Load cleaned dataset

In [None]:
# Location of the cleaned dataset, relative to the current working directory
dataset_path = "outputs/dataset/Expanded_data_with_more_features_clean.csv"
df = pd.read_csv(dataset_path)
# Preview the first rows as the cell output
df.head()

### Add mean score to be able to compare all 3 scores as a single value against the features

In [None]:
# Row-wise average of the three subject scores, rounded to the nearest whole
# number and stored as an integer column
score_columns = ['MathScore', 'ReadingScore', 'WritingScore']
df['MeanScore'] = df[score_columns].mean(axis=1).round().astype(int)
df.head()

## Individual Feature Analysis vs Target Variables (Math, Reading & Writing Score)

### Custom Function to save created plots to disk and overwrite any updates
Saving plots to disk allows later loading in streamlit dashboard 

In [None]:
from pandas.plotting import table

def save_plot(figure, filename, directory='outputs/images/plots'):
    """Save a plot to ``directory/filename``, replacing any earlier version.

    Args:
        figure: Any object exposing ``savefig(path, **kwargs)``, e.g. a
            matplotlib figure or the ``pyplot`` module itself.
        filename: Name of the image file, including its extension.
        directory: Target folder; created (including parents) when missing.

    Returns:
        None. The image is written to disk as a side effect.
    """
    # exist_ok=True replaces the racy "check, then create" sequence
    os.makedirs(directory, exist_ok=True)

    filepath = os.path.join(directory, filename)

    # savefig overwrites an existing file, so no explicit delete is needed;
    # bbox_inches='tight' trims surplus whitespace around the plot
    figure.savefig(filepath, bbox_inches='tight')

def save_df(df, filename, directory='outputs/images/plots'):
    """Pickle an object to ``directory/filename``, replacing any earlier file.

    Args:
        df: Object to serialise; the notebook passes a pandas DataFrame.
        filename: Name of the pickle file, including its extension.
        directory: Target folder; created (including parents) when missing.

    Returns:
        None. The pickle is written to disk as a side effect.
    """
    # exist_ok=True replaces the racy "check, then create" sequence
    os.makedirs(directory, exist_ok=True)

    filepath = os.path.join(directory, filename)

    # Mode 'wb' truncates an existing file, so no explicit delete is needed.
    # NOTE: pickle files must only ever be loaded from trusted locations.
    with open(filepath, 'wb') as file:
        pickle.dump(df, file)

def save_analysis(text, filename, directory='outputs/images/plots'):
    """Write analysis text to ``directory/filename``, replacing any earlier file.

    Args:
        text: The text to store.
        filename: Name of the text file, including its extension.
        directory: Target folder; created (including parents) when missing.

    Returns:
        None. The text file is written to disk as a side effect.
    """
    # exist_ok=True replaces the racy "check, then create" sequence
    os.makedirs(directory, exist_ok=True)

    filepath = os.path.join(directory, filename)

    # Mode 'w' truncates an existing file, so no explicit delete is needed.
    # The explicit encoding keeps the file identical across platforms
    # instead of depending on the locale's default encoding.
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(text)

In [None]:
# Custom function to plot a categorical variable based on its column name
def plot_category_distribution(column):
    """Plot, tabulate and save how the scores vary across one feature.

    Works on the notebook-level ``df``. For the given column it:

    1. draws a count plot and saves it as ``<column>-distribution.png``;
    2. draws one box plot per score (Math, Reading, Writing, Mean) and saves
       the figure as ``<column>-boxplot.png``;
    3. displays the per-category mean scores, sorted by ``MeanScore`` in
       descending order, and pickles that table as ``<column>-data.pkl``;
    4. prints the difference between the highest and lowest category
       ``MeanScore`` ("Mean Range").

    Args:
        column: Name of the column in ``df`` to group by.
    """
    # Score column -> panel title, in plotting order
    score_titles = {
        'MathScore': 'Math Score',
        'ReadingScore': 'Reading Score',
        'WritingScore': 'Writing Score',
        'MeanScore': 'Mean Score',
    }

    sns.set_palette('Set2')

    # --- Frequency of each category ---
    count_fig, count_ax = plt.subplots()
    sns.countplot(x=column, data=df, ax=count_ax)
    count_ax.set_title(f'Distribution of {column}')
    count_ax.set_xlabel('Categories')
    count_ax.set_ylabel('Frequency')
    plt.setp(count_ax.get_xticklabels(), rotation=45, ha='right')
    save_plot(count_fig, f'{column}-distribution.png')
    plt.show()

    # --- One box plot per score, side by side ---
    box_fig, box_axes = plt.subplots(1, len(score_titles), figsize=(12, 4))
    for ax, (score, title) in zip(box_axes, score_titles.items()):
        sns.boxplot(ax=ax, x=column, y=score, data=df)
        ax.set_title(title)
        # Restyle the existing tick labels rather than re-setting them with
        # set_xticklabels(), which can emit a matplotlib warning when the
        # tick positions have not been fixed first
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    box_fig.tight_layout()
    save_plot(box_fig, f'{column}-boxplot.png')
    plt.show()

    # --- Mean scores per category, best-performing category first ---
    # .mean() is the direct equivalent of .agg(np.mean), which newer pandas
    # versions flag with a FutureWarning
    df_sorted = (
        df.groupby(column)[list(score_titles)]
        .mean()
        .sort_values(by='MeanScore', ascending=False)
    )

    # `display` is not imported in this notebook; presumably the rich-output
    # helper provided by the notebook environment
    display(df_sorted.style.background_gradient(cmap='Blues'))
    save_df(df_sorted, f'{column}-data.pkl')

    # Rows are sorted descending, so first/last hold the max/min MeanScore
    spread = df_sorted['MeanScore'].iloc[0] - df_sorted['MeanScore'].iloc[-1]
    print(f'Mean Range: {round(spread, 2)}')



In [None]:
# Count plot, score box plots and per-category mean scores for Gender
plot_category_distribution('Gender')

In [None]:
column = 'Gender'

# Written conclusion for this feature. The literal (including its embedded
# indentation and the markdown-style **not**) is saved verbatim as text in
# save_analysis's default folder, 'outputs/images/plots'.
analysis = f"""This shows that the expected distribution of {column} among 
        students.
        The mean shows that female students on average have better results,
        but specifically for Maths that fact is reversed.
        With a difference of only 3 percent average score, the effect of 
        {column} however, is **not** a major contributing factor to overall 
        score."""

save_analysis(analysis, f'{column}-analysis.txt')

#### The data is well balanced and shows that females have an overall better performance, but tend to have lower maths skills.
**I hypothesize that gender will not have a very large impact on the predicted score.**

In [None]:
# Count plot, score box plots and per-category mean scores for EthnicGroup
plot_category_distribution('EthnicGroup')

#### This data is not balanced, as group C far outnumbers group A. This will need to be adjusted prior to training the models, as it would unfairly give group C a higher significance in predicting student performance.

Group E shows the highest mean score and is significantly better performing in school.<br>
Group A shows the least performance with a mean score almost 10% lower.<br>

**I hypothesize that Ethnic Group should have a significant influence on the predicted score.**

In [None]:
column = 'EthnicGroup'

# Written conclusion for this feature. The literal (including its embedded
# indentation) is saved verbatim as text in save_analysis's default folder,
# 'outputs/images/plots'.
analysis = f"""Shows that group C is the clear majority in this specific school,
    and group A is the clear minority.
    However, Group E, the second minority, is significantly better
    than the other groups, while GroupA performs the worst.
    With a difference of 9 percent on average, it is clear that {column}
    has a significant influence on the overall score of a student."""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for ParentEduc
plot_category_distribution('ParentEduc')

#### This data is not balanced and will need to be adjusted prior to training the models, as it would unfairly give students whose parents have some college education a higher emphasis in predicting student performance.

Parents with a master's degree have the lowest frequency, but tend to have children who outperform all other students.

**I hypothesize that parental education should have a significant influence on the predicted score.**

In [None]:
column = 'ParentEduc'

# Written conclusion for this feature, saved as text for later display.
# Spelling corrected in the saved text ("whose", "master's", "students'").
analysis = """This shows that Parental Education plays a significant role in
 the students' performance with a Mean spread of 10. Students whose parents
have a master's degree on average are 10 percent better than students with 
parents who only completed high school."""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for LunchType
plot_category_distribution('LunchType')

#### This data is not balanced and will need to be adjusted prior to training the models as it will bias the results towards students who receive a standard lunch.

Students who receive a lunch from their parents (standard) are on average 10% better than students who do not. This is a very generalized indicator of the economic well-being of the family.<br>
The assumption can be made that parents who do not need to rely on free school lunch programs can afford not only a bespoke lunch meal, but also other beneficial activities such as purchasing study guides, better access to internet / PCs, tutoring etc.

**I hypothesize that LunchType should have a significant influence on the predicted score.**

In [None]:
column = 'LunchType'

# Written conclusion for this feature, saved as text for later display.
# Possessives corrected in the saved text ("students'", "children's").
analysis = f"""Interestingly the {column} seems to have an influence 
over the students' performance. This most likely can be explained in terms of
socio-economic context - parents who do not need to rely on free lunches most
likely can also afford additional support for their children (eg tutors). They 
might also be in a position to supervise their children's education more if 
they are not having to work significant hours to maintain the family."""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for TransportMeans
plot_category_distribution('TransportMeans')

This data is also not balanced, but shows it has no impact on the performance of the students.

In [None]:
column = 'TransportMeans'

# Written conclusion for this feature; saved verbatim as text in
# save_analysis's default folder, 'outputs/images/plots'.
analysis = """With a Mean spread of 0.07, whether a student takes the bus or
is privately driven to school has no bearing on their performance."""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for TestPrep
plot_category_distribution('TestPrep')

#### This data is not balanced and will need to be adjusted prior to training the models as it will bias the results towards students who did not complete their test prep studies.

Fairly obviously, students who completed the test preparation perform on average 7% better in the end.

**I hypothesize that TestPrep should have an influence on the predicted score.**

In [None]:
column = 'TestPrep'

# Written conclusion for this feature, saved as text for later display.
# Spelling corrected in the saved text ("a measurable impact").
analysis = f"""Rather obviously whether or not a student completed their test
preparation has a 7 percent impact on their overall performance. I expect
{column} will have a measurable impact on the predictive model."""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for
# ParentMaritalStatus
plot_category_distribution('ParentMaritalStatus')

#### This data is not balanced and will need to be adjusted prior to training the models as it will bias the results towards students with married parents.
Also, students with widowed parents seem to be an outlier and might skew the data somewhat.

Based on the mean score, no significant influence of parental marital status can be determined. After confirmation I will likely drop this feature.

**I hypothesize that ParentMaritalStatus should have no influence on the predicted score.**

In [None]:
column = 'ParentMaritalStatus'

# Written conclusion for this feature; saved verbatim as text in
# save_analysis's default folder, 'outputs/images/plots'.
analysis = f"""{column} has no impact on student performance as can be seen 
from a mean range of 0.7"""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for PracticeSport
plot_category_distribution('PracticeSport')

#### This data is not balanced and will need to be adjusted prior to training the models as it will bias the results towards students who sometimes do sports.

Based on the mean score, only a minor impact of sports can be determined. After confirmation I will likely drop this feature, unless other features alone are not sufficient.

**I hypothesize that PracticeSport should have only minor influence on the predicted score.**

In [None]:
column = 'PracticeSport'

# Written conclusion for this feature; saved verbatim (markdown-style bold
# markers included) as text in save_analysis's default folder,
# 'outputs/images/plots'.
analysis = f"""{column} has almost no impact on overall student performance
with a mean range of only **2.78**"""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for IsFirstChild
plot_category_distribution('IsFirstChild')

#### This data is not balanced and will need to be adjusted prior to training the models as it will bias the results towards students who are the first child.

Based on the mean score, only a minor impact of being first child can be determined. After confirmation I will likely drop this feature, unless other features alone are not sufficient.

**I hypothesize that IsFirstChild should have only minor influence on the predicted score.**

In [None]:
column = 'IsFirstChild'

# Written conclusion for this feature; saved verbatim (markdown-style bold
# markers included) as text in save_analysis's default folder,
# 'outputs/images/plots'.
analysis = f"""Whether or not a student {column} has almost no impact on 
overall student performance with a mean range of only **0.39**"""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for WklyStudyHours
plot_category_distribution('WklyStudyHours')

#### This data is not balanced and will need to be adjusted prior to training the models as it will bias the results towards students who study between 5-10 hours.

Based on the mean score, students who study more than 10h per week have on average a 3% better score than students who study <5 hours. This is also not a very significant difference. A small bias towards mathematics can be seen (4% difference vs 2 or 3), indicating that Maths in particular benefits from extra study time.

**I hypothesize that StudyHours should have only minor influence on the predicted score.**

In [None]:
column = 'WklyStudyHours'

# Written conclusion for this feature, saved as text for later display.
# Grammar corrected in the saved text ("of a student", "have", "Students").
analysis = f"""With a mean range of only 3 percent, the weekly study hours of
a student surprisingly have very little influence on their overall performance
in the exam. Students who study more than 10 hours compared to ones that study
less than 5 hours only perform about 3 percent better on average.
That is really surprising."""

save_analysis(analysis, f'{column}-analysis.txt')

In [None]:
# Count plot, score box plots and per-category mean scores for NrSiblings
plot_category_distribution('NrSiblings')

#### This data is not balanced.

Based on the mean score, there is practically no difference between being an only child (0) or having 7 siblings ... this indicates that this feature has no bearing on the overall student performance and can be dropped.

**I hypothesize that NrSiblings has NO influence on the predicted score.**

In [None]:
column = 'NrSiblings'

# Written conclusion for this feature, saved as text for later display.
# Spelling corrected in the saved text ("students' performance", "variable",
# "contributor").
analysis = f"""The number of siblings has no bearing on the students' performance
most evident by the fact that according to the dataset, students with ZERO
or 7(!) siblings perform the same in the exam. This variable will not be a
significant contributor to the model."""

save_analysis(analysis, f'{column}-analysis.txt')

## Assess distribution of numerical data

In [None]:
columns = ['MathScore', 'ReadingScore', 'WritingScore', 'MeanScore']

# One describe() summary per score column, keyed by column name
summaries_dict = {score: df[score].describe() for score in columns}

# Combine into a single table: one column per score, one row per statistic
summaries_df = pd.DataFrame(summaries_dict)

# Show the table as the cell output
summaries_df

In [None]:
columns = ['MathScore', 'ReadingScore', 'WritingScore', 'MeanScore']

# --- Histograms: one panel per score in a single wide figure ---
plt.figure(figsize=(14, 3))

for position, score in enumerate(columns, start=1):
    hist_ax = plt.subplot(1, 4, position)
    sns.histplot(df[score], fill=True, ax=hist_ax)
    hist_ax.set_title(score)
    hist_ax.set_xlabel('Score')
    hist_ax.set_ylabel('Count')

    # Annotate the panel with the standard deviation; transAxes places the
    # text in axes-relative coordinates (0-1) rather than data coordinates
    spread = df[score].std()
    hist_ax.text(x=0.5, y=0.9, s=f'SD: {spread:.2f}',
                 horizontalalignment='center', verticalalignment='center',
                 transform=hist_ax.transAxes, fontsize=12)

# The histogram figure is still the current figure at this point
save_plot(plt, 'Numerical-distribution.png')

# --- Box plots: one horizontal box per score ---
box_fig, box_axes = plt.subplots(1, 4, figsize=(12, 3))
for box_ax, score in zip(box_axes, columns):
    sns.boxplot(x=df[score], ax=box_ax)
    box_ax.set_title(f"Boxplot ({score})")

plt.tight_layout()
save_plot(plt, 'Numerical-boxplots.png')
plt.show()

### Create a subplot of KDE diagrams to visualize density/concentration of data

In [None]:
# One KDE (density) panel per score column, side by side
plt.figure(figsize=(20, 5))

for position, score in enumerate(columns, start=1):
    kde_ax = plt.subplot(1, 4, position)
    sns.kdeplot(df[score], fill=True, ax=kde_ax)
    kde_ax.set_title(score)
    kde_ax.set_xlabel('Value')
    kde_ax.set_ylabel('Density')

plt.tight_layout()
save_plot(plt, 'Numerical-kde.png')
plt.show()


### Visually the data seems to follow a standard 'bell curve' and appears normally distributed, which is very helpful for model fitting.
A standard deviation of 15 indicates the students' performance is reasonably close to the mean grade. To further analyse the normal distribution I will plot Q-Q diagrams below:

In [None]:
# One Q-Q panel per score column, compared against a normal distribution
plt.figure(figsize=(20, 5))

for position, score in enumerate(columns, start=1):
    qq_ax = plt.subplot(1, 4, position)
    data = df[score]
    # No axes is passed, so the plot is drawn on the current axes, i.e. the
    # subplot just created
    pg.qqplot(data, dist='norm')
    # Set the title and axis labels after plotting so they are the final ones
    qq_ax.set_title(score)
    qq_ax.set_xlabel('Theoretical Quantiles')
    qq_ax.set_ylabel('Ordered Quantiles')

plt.tight_layout()
save_plot(plt, 'Numerical-qq.png')
plt.show()

#### The Q-Q plots are very closely aligned to the diagonal, indicating a very good normal distribution, especially around the mean. The values deviate slightly at the ends, indicating some possible outliers skewing the normality of the data. This will hopefully not influence any linear regression models to be applied, but might need to be dealt with if accuracy becomes an issue.

The R2 value of 0.994+ is also a very good indicator of normality.

Testing with the Shapiro-Wilk test, we get a disappointing result:

In [None]:
# Run the normality (Shapiro-Wilk) test on each score column and tag every
# result row with the name of the column it belongs to
shapiro_results = [
    pg.normality(data=df[score], alpha=0.05).assign(Variable=score)
    for score in columns
]

# Stack the per-column results into one table with a fresh 0..n-1 index
shapiro_results_df = pd.concat(shapiro_results, ignore_index=True)

shapiro_results_df

The Shapiro-Wilk check seems to indicate that the data is not normal, despite the R2 values and visual confirmation of the Q-Q plots. However, as the warning suggests, the test is known to become more sensitive to even small deviations on very large datasets. For this reason, I will press on with the data preparation, under the assumption that data normality won't be an issue.