# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yaml

# Load the run configuration first: it supplies the dataset location
# ('data_path') and the list of feature columns ('features') used below.
# The encoding is stated explicitly so the YAML is decoded the same way on
# every platform instead of falling back to the locale's default encoding.
with open('config.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Read the capstone project data from the path named in the config.
data = pd.read_csv(config['data_path'])

# Histogram of the 'average_grade' column with a KDE curve overlaid, to show
# how average grades are distributed across the dataset.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['average_grade'], kde=True, ax=ax)
ax.set_title('Distribution of Average Grades')
plt.show()

# Annotated heatmap of the pairwise correlations between the columns listed
# under config['features'], to spot features that move together.
# NOTE(review): DataFrame.corr() operates on numeric data -- confirm that every
# configured feature is numeric, since recent pandas versions raise otherwise.
fig, ax = plt.subplots(figsize=(12, 8))
feature_corr = data[config['features']].corr()
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Box plots of two measures, each grouped by 'dropout_risk':
#   1. number of absences -- how does it relate to dropout risk?
#   2. homework submission rate -- are students who submit homework on time
#      less likely to drop out?
# Every (column, title) pair is drawn on its own 10x6 figure, in this order.
boxplot_specs = [
    ('number_of_absences', 'Absences and Dropout Risk'),
    ('homework_submission_rate', 'Homework Submission Rate vs Dropout Risk'),
]
for column, title in boxplot_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='dropout_risk', y=column, data=data, ax=ax)
    ax.set_title(title)
    plt.show()
