In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn "whitegrid" theme to every plot in this notebook.
# set_theme() is the preferred interface; sns.set() is only a legacy alias for it.
sns.set_theme(style="whitegrid")

1. Data Loading

In [None]:
# Load the raw CSV tables into DataFrames.
# RAW_DATA_DIR is the single place to edit if the raw files live somewhere else;
# being a relative path, it is resolved against the current working directory.
RAW_DATA_DIR = '../data/raw'

student_info = pd.read_csv(f'{RAW_DATA_DIR}/studentInfo.csv')
student_assessment = pd.read_csv(f'{RAW_DATA_DIR}/studentAssessment.csv')
student_registration = pd.read_csv(f'{RAW_DATA_DIR}/studentRegistration.csv')
assessments = pd.read_csv(f'{RAW_DATA_DIR}/assessments.csv')
courses = pd.read_csv(f'{RAW_DATA_DIR}/courses.csv')
student_vle = pd.read_csv(f'{RAW_DATA_DIR}/studentVle.csv')
vle = pd.read_csv(f'{RAW_DATA_DIR}/vle.csv')

2. Basic Information about the Datasets

In [None]:
# Show the structure (columns, dtypes, non-null counts) and the first rows of the
# three student-level tables.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after each report.
for title, frame in [("Student Info:", student_info),
                     ("Student Assessment:", student_assessment),
                     ("Student Registration:", student_registration)]:
    print(title)
    frame.info()
    display(frame.head())

3. Missing Value Analysis

In [None]:
# For every loaded table, report which columns contain nulls and how many.
print("Missing values in each dataset:")
for name, df in [("student_info", student_info),
                 ("student_assessment", student_assessment),
                 ("student_registration", student_registration),
                 ("assessments", assessments),
                 ("courses", courses),
                 ("student_vle", student_vle),
                 ("vle", vle)]:
    missing_values = df.isnull().sum()
    columns_with_nulls = missing_values[missing_values > 0]
    if columns_with_nulls.empty:
        # A clean table would otherwise print an uninformative
        # "Series([], dtype: int64)"; say so explicitly instead.
        print(f"{name}:\nno missing values\n")
    else:
        print(f"{name}:\n{columns_with_nulls}\n")

4. Summary Statistics

In [None]:
# Descriptive statistics for student_info (describe() with its default arguments),
# rendered as a rich table beneath a plain-text caption.
print("Summary statistics of student_info:")
student_info_summary = student_info.describe()
display(student_info_summary)

5. Data Distribution Analysis

In [None]:
# Count of student_info rows for each final_result value
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='final_result', data=student_info, ax=ax)
ax.set_title("Distribution of Final Results")
ax.set_xlabel("Final Result")
ax.set_ylabel("Count")
plt.show()

6. Demographic Analysis

In [None]:
# Grouped count plots: one figure per demographic column, bars split by final_result.
# Each entry is (column, figure title, x-axis label, figure size).
demographic_plots = [
    ('gender', "Gender Distribution by Final Result", "Gender", (8, 6)),
    ('age_band', "Age Band Distribution by Final Result", "Age Band", (10, 6)),
]

for column, title, x_label, size in demographic_plots:
    fig, ax = plt.subplots(figsize=size)
    sns.countplot(data=student_info, x=column, hue='final_result', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Count")
    ax.legend(title="Final Result")
    plt.show()

7. Course and Academic Data Analysis

In [None]:
# Histogram of studied_credits (20 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(x='studied_credits', data=student_info, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Studied Credits")
ax.set_xlabel("Studied Credits")
ax.set_ylabel("Frequency")
plt.show()

# Registration rows per code_module; value_counts() sorts from most to least frequent,
# which is also the left-to-right bar order
student_registration_counts = student_registration['code_module'].value_counts()
module_codes = student_registration_counts.index
registrations = student_registration_counts.values

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=module_codes, y=registrations, ax=ax)
ax.set_title("Number of Students Registered per Course")
ax.set_xlabel("Course Module")
ax.set_ylabel("Number of Registrations")
plt.show()

8. Assessment Analysis

In [None]:
# Histogram of the score column in student_assessment (30 bins) with a KDE curve
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(x='score', data=student_assessment, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Assessment Scores")
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")
plt.show()

9. Virtual Learning Environment (VLE) Analysis

In [None]:
# Total sum_click per id_student, broadcast back onto every row of student_vle
# as a new 'total_clicks' column (the frame is modified in place).
clicks_grouped_by_student = student_vle.groupby('id_student')['sum_click']
student_vle['total_clicks'] = clicks_grouped_by_student.transform('sum')

# Keep a single row per student so each student contributes once to the histogram
one_row_per_student = student_vle.drop_duplicates('id_student')

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(x='total_clicks', data=one_row_per_student, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Total Clicks per Student in VLE")
ax.set_xlabel("Total Clicks")
ax.set_ylabel("Frequency")
plt.show()