In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import json

# INTERPRETABILITY AND ALGORITHMIC FAIRNESS - Students Exam Scores
The goal of this project is to apply most of the techniques presented during this course. <br>
(The nbstripout pre-commit hook has been installed to strip all notebook outputs before committing.)

# Load data from Kaggle

In [None]:
# Before running this cell, create a Kaggle API token, download the resulting kaggle.json
# and place it at ~/.kaggle/kaggle.json (i.e. Users/<User>/.kaggle/kaggle.json).
# The cell downloads the dataset archive, extracts it into the working directory
# and then deletes the archive.
!kaggle datasets download -d desalegngeb/students-exam-scores
!unzip students-exam-scores.zip
os.remove("students-exam-scores.zip")

# Import data

In [None]:
# Load the expanded dataset; the first CSV column is used as the row index.
df = pd.read_csv("Expanded_data_with_more_features.csv", index_col=0)

In [None]:
# The data now lives in memory, so delete both extracted CSV files from disk.
for extracted_csv in ("Original_data_with_more_rows.csv",
                      "Expanded_data_with_more_features.csv"):
    os.remove(extracted_csv)

In [None]:
# Preview the first rows of the raw data
df.head()

# General info

In [None]:
# Shape of the raw data as (number of rows, number of columns)
df.shape

In [None]:
# Feature information: column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing (NaN) entries per column
df.isna().sum()

In [None]:
# Summary statistics of the raw data (describe() with its default arguments)
df.describe()

# Data Cleaning

In [None]:
# Clean on a copy so the raw frame `df` is left untouched
dfC = df.copy()

In [None]:
# Report fully duplicated rows (this cell only counts them; nothing is dropped).
# The mask is computed on dfC itself so the check always follows the frame being
# cleaned, and the row count is printed rather than the (rows, columns) shape tuple.
duplicate_rows_data = dfC[dfC.duplicated()]
print("number of duplicate rows: ", len(duplicate_rows_data))

In [None]:
# Report how many distinct values (NaN included) each column of dfC holds
for column, column_values in dfC.items():
    print(f"{column}: {len(column_values.unique())} distinct values")

In [None]:
# Readable labels for the weekly study-hours buckets
study_mapping = {'< 5': 'Less than 5 hours',
                 '5 - 10': 'Between 5-10 hours',
                 '> 10': 'More than 10 hours'}

# IsFirstChild: 'no'/'yes' -> 0/1
value_mapping = {'no': 0, 'yes': 1}

# TestPrep: 'none'/'completed' -> 0/1
test_mapping = {'none': 0, 'completed': 1}

# TransportMeans: 'private'/'school_bus' -> 0/1
bus_mapping = {'private': 0, 'school_bus': 1}

# Recode each column with its mapping. Series.map turns any value that is not a
# key of the mapping (including existing NaN) into NaN.
for column_name, mapping in (('WklyStudyHours', study_mapping),
                             ('IsFirstChild', value_mapping),
                             ('TestPrep', test_mapping),
                             ('TransportMeans', bus_mapping)):
    dfC[column_name] = dfC[column_name].map(mapping)

# Rename the recoded 'TransportMeans' column to 'School_Bus' (1 = school bus)
dfC = dfC.rename(columns={'TransportMeans': 'School_Bus'})

In [None]:
# Treatment of missing values
# Every column below is imputed with its most frequent value (mode), computed on
# the cleaned frame dfC. 'ParentEduc' was previously filled from the raw frame
# `df`; it now reads from dfC like every other column so the cell has a single
# source of truth.
columns_to_impute = [
    # numeric count
    'NrSiblings',
    # categorical columns
    'EthnicGroup', 'WklyStudyHours', 'ParentEduc', 'ParentMaritalStatus',
    # 0/1-encoded columns, plus 'PracticeSport' which is not recoded in this notebook
    'IsFirstChild', 'PracticeSport', 'TestPrep', 'School_Bus',
]

for column in columns_to_impute:
    # mode() returns the most frequent values in sorted order; [0] takes the first
    dfC[column] = dfC[column].fillna(dfC[column].mode()[0])

# Preprocessing

In [None]:
# Preprocess on a copy so the cleaned frame `dfC` is left untouched
dfP = dfC.copy()

In [None]:
# Collapse the three test scores into one average "Grade" and drop the originals
score_columns = ["WritingScore", "ReadingScore", "MathScore"]
score_total = dfP["WritingScore"] + dfP["ReadingScore"] + dfP["MathScore"]
dfP["Grade"] = score_total / 3
dfP = dfP.drop(columns=score_columns)

In [None]:
# Binary target: 1 if the average grade is strictly above the 75th percentile, else 0
grade_cutoff = dfP["Grade"].quantile(0.75)
dfP["Grade"] = dfP["Grade"].gt(grade_cutoff).astype(int)

# EDA

In [None]:
# 2x2 grid of bar charts, one per demographic column
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
columns = ['Gender', 'EthnicGroup', 'ParentEduc', 'LunchType']

# axs.flat walks the grid row by row, matching the order of `columns`
for ax, col in zip(axs.flat, columns):
    category_counts = dfP[col].value_counts()
    category_counts.plot(kind='bar', ax=ax)
    ax.set_title(f"Distribution of {col}")

# Avoid overlapping titles and tick labels, then render the figure
plt.tight_layout()
plt.show()

Takeaways: <br>
(1) Data set is balanced regarding gender <br>
(2) Data set is unbalanced regarding ethnicity. More than one third of the students belong to ethnic group C while only about 10% belong to ethnic group A <br>
(3) Data set is unbalanced regarding parent education. More than 30% of the students have parents who went to some college while only about 7% of students have parents with a Master's degree. (Assumption: only the highest degree is counted) <br>
(4) Data set is unbalanced regarding lunch type. About two thirds of the students have standard lunches while only one third of the students receive free/reduced lunches (Context: low-income children are eligible to receive reduced-price or free meals at school)


In [None]:
# Single-axes figure for the class balance of the binary target
fig, axs = plt.subplots(figsize=(12, 4))

# One bar per Grade value (0 / 1)
bar_plot = sns.countplot(data=dfP, x='Grade', ax=axs)
axs.set_title('Count of Each Grade')

# Write the count above every bar. Bar heights can come back as floats, so they
# are formatted without decimals to show "7660" rather than "7660.0".
for p in bar_plot.patches:
    bar_plot.annotate(f'{p.get_height():.0f}',
                      (p.get_x() + p.get_width() / 2., p.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 5),
                      textcoords='offset points')

# Display the plot
plt.show()

Takeaways: <br>
By construction, about one fourth of the students have good grades (Grade == 1) while about three fourths of the class do not (Grade == 0)

In [None]:
# Keep only the students in the top-grade class (Grade == 1)
grade_1_data = dfP[dfP['Grade'] == 1]

# Categorical variables to profile, one facet each
cat_vars = ['Gender', 'EthnicGroup', 'ParentEduc', 'TestPrep', 'LunchType', 'ParentMaritalStatus',
            'PracticeSport', 'IsFirstChild', 'NrSiblings', 'School_Bus', 'WklyStudyHours']

# Long format: a 'variable' column (feature name) and a 'value' column (its category)
melted_data = grade_1_data[cat_vars].melt()

# One count plot per variable; axes are independent because the categories differ
g = sns.FacetGrid(data=melted_data, col_wrap=3, col='variable', sharex=False, sharey=False, height=4)
g = g.map(sns.countplot, 'value', palette='Set2')

# set_axis_labels takes (x label, y label): categories sit on x, counts on y
g.set_axis_labels('Category', 'Count')
g.set_titles('{col_name}')
g.set_xticklabels(rotation=90)

# Write the count above every bar. Bar heights can come back as floats, so they
# are formatted without decimals.
for ax in g.axes.flat:
    for p in ax.patches:
        ax.annotate(f'{p.get_height():.0f}',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center',
                    va='center',
                    xytext=(0, 5),
                    textcoords='offset points')

# Adjust the spacing between subplots
plt.tight_layout()

# Display the plots
plt.show()

In [None]:
# Pair plot of dfP: pairwise relationships coloured by the binary Grade target,
# with kernel density estimates on the diagonal
sns.pairplot(data=dfP, hue='Grade', diag_kind='kde')
plt.show()

In [None]:
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two variables.

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series)
        Categorical observations, cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        Value in [0, 1]; 0 means no association, 1 a perfect one.
        ``nan`` is returned when the statistic is undefined: fewer than two
        paired observations, a variable with a single category, or a variable
        with as many categories as observations (the corrected denominator is
        then zero).

    Notes
    -----
    ``scipy.stats.chi2_contingency`` is called with its defaults, which apply
    Yates' continuity correction to 2x2 tables.
    """
    contingency = pd.crosstab(x, y)
    n = int(contingency.to_numpy().sum())

    # With fewer than two observations both chi2 and the (n - 1) terms are undefined
    if n < 2:
        return np.nan

    chi2 = stats.chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    r, k = contingency.shape

    # Bias correction of phi^2 and of the effective table dimensions
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # Degenerate table: avoid a 0/0 division and its RuntimeWarning
    if denominator <= 0:
        return np.nan

    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V between all columns of dfP, computed row by row
feature_names = dfP.columns
pairwise_values = [
    [cramers_v(dfP[row_feature], dfP[col_feature]) for col_feature in feature_names]
    for row_feature in feature_names
]
correlation_matrix = pd.DataFrame(pairwise_values, index=feature_names, columns=feature_names)

# Make sure every column is numeric before plotting
correlation_matrix = correlation_matrix.apply(pd.to_numeric)

# Heatmap of the association matrix (no per-cell annotations)
sns.heatmap(correlation_matrix, annot=False)
plt.show()
