# Lab 5: High School Students
*Author: Jeremy Gutierrez*
*Date: 11/17/2025*

This notebook loads the UCI Student Performance CSVs (`student-mat.csv` and `student-por.csv`) from local files, performs EDA and visualization,
and runs the requested data mining models (Decision Tree and Naive Bayes).

In [47]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
accuracy_score, classification_report)
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder

In [48]:
# Folder that receives every generated figure and result file;
# it is created on first run and reused afterwards.
OUT_DIR = "lab5_outputs"
os.makedirs(name=OUT_DIR, exist_ok=True)

# Input CSVs, given as paths relative to the working directory
MAT_URL, POR_URL = "student-mat.csv", "student-por.csv"

In [49]:
def load_data(url):
    """Load a semicolon-separated CSV into a pandas DataFrame.

    Parameters
    ----------
    url : str or path-like
        URL or local path handed straight to ``pd.read_csv`` with ``sep=';'``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    RuntimeError
        If reading fails for any reason. The original exception is attached
        as ``__cause__`` so the underlying error stays visible in tracebacks.
    """
    try:
        return pd.read_csv(url, sep=';')
    except Exception as e:
        # Chain explicitly so the root cause (missing file, parse error, ...)
        # is reported as the direct cause of the RuntimeError.
        raise RuntimeError(f"Failed to read {url}: {e}") from e

def save_fig(fig, name):
    """Write ``fig`` to ``OUT_DIR/name`` at 150 dpi and report the path.

    ``tight_layout`` is applied to the figure before it is saved. The figure
    is not closed here; callers close it themselves.
    """
    fig.tight_layout()
    destination = os.path.join(OUT_DIR, name)
    fig.savefig(destination, dpi=150)
    print(f'Saved figure: {destination}')

In [50]:
# Loading data
print('\n1) Loading datasets...')
math_df, por_df = (load_data(source) for source in (MAT_URL, POR_URL))

# Rows and columns of each frame
print('\n1a) Shapes:')
for shape_label, frame in (('math_df shape:', math_df), ('por_df shape: ', por_df)):
    print(shape_label, frame.shape)

# Object-dtype columns are reported as categorical, numeric dtypes as numeric
def get_feature_types(df):
    """Split the column names of ``df`` by dtype.

    Returns
    -------
    tuple[list, list]
        ``(categorical, numeric)``: names of the object-dtype columns and
        names of the numeric (``np.number``) columns, each in the frame's
        column order.
    """
    categorical = df.select_dtypes(include=['object']).columns
    numeric = df.select_dtypes(include=[np.number]).columns
    return categorical.tolist(), numeric.tolist()

math_cat, math_num = get_feature_types(math_df)
por_cat, por_num = get_feature_types(por_df)

# Same three-line report for each dataset
for csv_name, cat_cols, num_cols in (
    ('student-mat.csv', math_cat, math_num),
    ('student-por.csv', por_cat, por_num),
):
    print(f'\n1b) Feature types for {csv_name}:')
    print(f'Categorical ({len(cat_cols)}):', cat_cols)
    print(f'Numeric ({len(num_cols)}):', num_cols)


1) Loading datasets...

1a) Shapes:
math_df shape: (395, 33)
por_df shape:  (649, 33)

1b) Feature types for student-mat.csv:
Categorical (17): ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
Numeric (16): ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

1b) Feature types for student-por.csv:
Categorical (17): ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
Numeric (16): ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


In [51]:
print('\n2) Correlation analysis...')

# A correlation matrix needs numbers everywhere, so categorical (object)
# columns are label-encoded first.
def encode_for_corr(df):
    """Return a copy of ``df`` with object-dtype columns turned into category codes.

    Non-object columns are copied unchanged and the input frame is not modified.
    """
    enc = df.copy()
    object_columns = enc.columns[enc.dtypes == object]
    for name in object_columns:
        enc[name] = enc[name].astype('category').cat.codes
    return enc

# Label-encode both frames, then derive their correlation matrices
math_enc, por_enc = encode_for_corr(math_df), encode_for_corr(por_df)
math_corr, por_corr = math_enc.corr(), por_enc.corr()

# One heatmap per dataset, saved through save_fig and closed afterwards
for dataset_tag, corr_matrix in (('mat', math_corr), ('por', por_corr)):
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
    ax.set_title(f'Correlation matrix — student-{dataset_tag}')
    sns.despine()
    save_fig(fig, f'heatmap_student_{dataset_tag}_corr.png')
    plt.close(fig)

# Top 3 features by absolute correlation with a grade column (the column itself excluded)
def top3_correlated(corr_df, target):
    """Return the three largest absolute correlations with ``target``.

    The ``target`` column of ``corr_df`` is taken in absolute value and sorted
    in descending order; the entry labelled ``target`` is then removed and the
    first three remaining entries are returned as a Series.
    """
    ranked = corr_df[target].abs().sort_values(ascending=False)
    is_other_feature = ranked.index != target
    return ranked.loc[is_other_feature].head(3)

# Report the strongest correlates of each grade column, dataset by dataset
for dataset_label, corr_matrix in (('student-mat', math_corr), ('student-por', por_corr)):
    print(f'\nTop 3 features correlated with G1, G2, G3 in {dataset_label}:')
    for grade in ['G1', 'G2', 'G3']:
        print(grade, '\n', top3_correlated(corr_matrix, grade), '\n')


2) Correlation analysis...
Saved figure: lab5_outputs\heatmap_student_mat_corr.png
Saved figure: lab5_outputs\heatmap_student_por_corr.png

Top 3 features correlated with G1, G2, G3 in student-mat:
G1 
 G2          0.852118
G3          0.801468
failures    0.354718
Name: G1, dtype: float64 

G2 
 G3          0.904868
G1          0.852118
failures    0.355896
Name: G2, dtype: float64 

G3 
 G2          0.904868
G1          0.801468
failures    0.360415
Name: G3, dtype: float64 


Top 3 features correlated with G1, G2, G3 in student-por:
G1 
 G2          0.864982
G3          0.826387
failures    0.384210
Name: G1, dtype: float64 

G2 
 G3          0.918548
G1          0.864982
failures    0.385782
Name: G2, dtype: float64 

G3 
 G2          0.918548
G1          0.826387
failures    0.393316
Name: G3, dtype: float64 



In [52]:
# Plotting the data in bar graphs
print('\n3) Creating bar plots...')

# Four attributes to visualize for each dataset
math_attrs = ['sex', 'age', 'studytime', 'failures']
por_attrs = ['sex', 'age', 'studytime', 'activities']


def bar_plots(df, attrs, prefix):
    """Save one bar chart per attribute in ``attrs``.

    Object-dtype attributes get a count plot of their values; every other
    attribute gets a bar chart of the mean ``G3`` per attribute value.
    ``prefix`` is used in both the plot title and the output file name.
    Each figure is saved through ``save_fig`` and then closed.
    """
    for attr in attrs:
        is_categorical = df[attr].dtype == object
        fig, ax = plt.subplots(figsize=(6, 4))
        if is_categorical:
            # categorical: how many rows fall in each category
            sns.countplot(x=attr, data=df, ax=ax)
            title = f'Count of {attr} ({prefix})'
            filename = f'{prefix}_bar_count_{attr}.png'
        else:
            # numeric: average final grade for each attribute value
            df.groupby(attr)['G3'].mean().plot(kind='bar', ax=ax)
            ax.set_ylabel('Average G3')
            title = f'Average G3 by {attr} ({prefix})'
            filename = f'{prefix}_bar_avgG3_{attr}.png'
        ax.set_title(title)
        save_fig(fig, filename)
        plt.close(fig)

# Render the bar charts for the math dataset, then the Portuguese dataset
for frame, frame_attrs, frame_tag in ((math_df, math_attrs, 'mat'), (por_df, por_attrs, 'por')):
    bar_plots(frame, frame_attrs, frame_tag)


3) Creating bar plots...
Saved figure: lab5_outputs\mat_bar_count_sex.png
Saved figure: lab5_outputs\mat_bar_avgG3_age.png
Saved figure: lab5_outputs\mat_bar_avgG3_studytime.png
Saved figure: lab5_outputs\mat_bar_avgG3_failures.png
Saved figure: lab5_outputs\por_bar_count_sex.png
Saved figure: lab5_outputs\por_bar_avgG3_age.png
Saved figure: lab5_outputs\por_bar_avgG3_studytime.png
Saved figure: lab5_outputs\por_bar_count_activities.png


In [53]:
# Plotting the data in line graphs
print('\n4) Creating line plots...')

# Attributes whose average G3 is traced as a simple line, per dataset
math_line_attrs = ['age', 'studytime', 'goout', 'absences']
por_line_attrs = ['age', 'studytime', 'goout', 'absences']

def line_plots(df, attrs, prefix):
    """Save a line chart of mean ``G3`` against each non-object attribute.

    Object-dtype attributes are skipped. ``prefix`` is used in both the plot
    title and the output file name. Each figure is saved through ``save_fig``
    and then closed.
    """
    # Lazy filter: each attribute's dtype is checked right before it is plotted
    plottable = (name for name in attrs if df[name].dtype != object)
    for attr in plottable:
        fig, ax = plt.subplots(figsize=(6, 4))
        mean_g3 = df.groupby(attr)['G3'].mean()
        mean_g3.plot(kind='line', marker='o', ax=ax)
        ax.set_ylabel('Average G3')
        ax.set_title(f'Average G3 vs {attr} ({prefix})')
        save_fig(fig, f'{prefix}_line_avgG3_{attr}.png')
        plt.close(fig)

# Render the line charts for the math dataset, then the Portuguese dataset
for frame, frame_attrs, frame_tag in ((math_df, math_line_attrs, 'mat'), (por_df, por_line_attrs, 'por')):
    line_plots(frame, frame_attrs, frame_tag)


4) Creating line plots...
Saved figure: lab5_outputs\mat_line_avgG3_age.png
Saved figure: lab5_outputs\mat_line_avgG3_studytime.png
Saved figure: lab5_outputs\mat_line_avgG3_goout.png
Saved figure: lab5_outputs\mat_line_avgG3_absences.png
Saved figure: lab5_outputs\por_line_avgG3_age.png
Saved figure: lab5_outputs\por_line_avgG3_studytime.png
Saved figure: lab5_outputs\por_line_avgG3_goout.png
Saved figure: lab5_outputs\por_line_avgG3_absences.png


In [37]:
print('\nDATA MINING TASKS:')

# Utility: prepare dataset for modeling: encode categorical features

def prepare_for_modeling(df, features, target=None, drop_G1G2G3=False):
    """Build a model-ready feature matrix (and optionally a target) from ``df``.

    Object-dtype feature columns are replaced by their integer category codes
    (pandas assigns ``-1`` to missing values); all other columns are copied
    unchanged. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    features : iterable
        Column labels to use as model inputs.
    target : column label, optional
        When given (anything other than ``None``), that column is returned
        alongside the features.
    drop_G1G2G3 : bool, default False
        When True, the grade columns ``'G1'``, ``'G2'`` and ``'G3'`` are
        removed from ``features`` before selection. Previously this flag was
        accepted but ignored.

    Returns
    -------
    pandas.DataFrame or tuple
        ``X`` alone when ``target`` is ``None``, otherwise ``(X, y)``.
    """
    grade_columns = ('G1', 'G2', 'G3')
    selected = [
        name for name in features
        if not (drop_G1G2G3 and name in grade_columns)
    ]
    X = df[selected].copy()
    for col in X.columns:
        if X[col].dtype == object:
            X[col] = X[col].astype('category').cat.codes
    # Compare against None so falsy-but-valid labels (e.g. a column named 0)
    # are still honoured as a target.
    if target is not None:
        y = df[target].copy()
        return X, y
    return X

# Decision Tree for student-mat.csv
print('\nDecision Tree (student-mat)')
# Binary classification target derived from the final grade:
# 1 = 'pass' (G3 >= 10), 0 = 'fail' (G3 < 10).
# (You can change threshold per your mining goals)
math_df['pass10'] = (math_df['G3'] >= 10).astype(int)

# A small feature set keeps the tree readable
features_dt = ['studytime', 'failures', 'absences', 'G1', 'G2', 'schoolsup']
X_dt, y_dt = prepare_for_modeling(math_df, features_dt, target='pass10')

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_dt, y_dt, test_size=0.2, random_state=42, stratify=y_dt)

clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf.fit(X_train, y_train)

# Both tree exporters get the same plain list of names. A pandas Index is not
# accepted for `feature_names` by every scikit-learn release.
feature_names_dt = list(X_dt.columns)

# Extract textual rules
rules_text = export_text(clf, feature_names=feature_names_dt)
print('\nExtracted decision rules (text):\n')
print(rules_text)

# Visualize the tree
fig, ax = plt.subplots(figsize=(14,8))
plot_tree(clf, feature_names=feature_names_dt, class_names=['fail','pass'], filled=True, rounded=True, proportion=False, ax=ax)
ax.set_title('Decision Tree (student-mat) — max_depth=4')
save_fig(fig, 'decision_tree_student_mat.png')
plt.close(fig)

# Evaluate on the held-out 20%
y_pred = clf.predict(X_test)
print('\nDecision Tree classification report:')
print(classification_report(y_test, y_pred))


DATA MINING TASKS:

Decision Tree (student-mat)

Extracted decision rules (text):

|--- G2 <= 9.50
|   |--- G2 <= 8.50
|   |   |--- G2 <= 7.50
|   |   |   |--- class: 0
|   |   |--- G2 >  7.50
|   |   |   |--- absences <= 7.00
|   |   |   |   |--- class: 0
|   |   |   |--- absences >  7.00
|   |   |   |   |--- class: 0
|   |--- G2 >  8.50
|   |   |--- G1 <= 7.50
|   |   |   |--- class: 0
|   |   |--- G1 >  7.50
|   |   |   |--- absences <= 10.50
|   |   |   |   |--- class: 0
|   |   |   |--- absences >  10.50
|   |   |   |   |--- class: 0
|--- G2 >  9.50
|   |--- G1 <= 10.50
|   |   |--- absences <= 1.00
|   |   |   |--- G1 <= 8.50
|   |   |   |   |--- class: 1
|   |   |   |--- G1 >  8.50
|   |   |   |   |--- class: 0
|   |   |--- absences >  1.00
|   |   |   |--- schoolsup <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- schoolsup >  0.50
|   |   |   |   |--- class: 1
|   |--- G1 >  10.50
|   |   |--- class: 1

Saved figure: lab5_outputs\decision_tree_student_mat.png

Decision 

In [38]:
# Naive Bayes for student-por.csv
print('\nNaive Bayes (student-por)')
# Binary target: 1 when G3 >= 12, otherwise 0; then report the class balance in percent
por_df['passed_12'] = por_df['G3'].ge(12).astype(int)
pass_dist = por_df['passed_12'].value_counts(normalize=True).mul(100)
print('\nClass distribution (passed vs failed) for threshold G3>=12:')
print(pass_dist)


Naive Bayes (student-por)

Class distribution (passed vs failed) for threshold G3>=12:
passed_12
1    53.620955
0    46.379045
Name: proportion, dtype: float64


In [39]:
# GaussianNB: predict pass/fail (passed_12) from studytime, absences and G1
features_nb = ['studytime', 'absences', 'G1']
X_nb, y_nb = prepare_for_modeling(por_df, features_nb, target='passed_12')

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_nb,
    y_nb,
    test_size=0.2,
    random_state=42,
    stratify=y_nb,
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print('\nGaussianNB classification report (studytime, absences, G1 -> passed_12):')
print(classification_report(y_test, y_pred))

# True vs predicted labels per test sample; predictions are lifted by 0.05
# so the two marker sets do not sit exactly on top of each other
fig, ax = plt.subplots(figsize=(6, 4))
sample_positions = range(len(y_test))
ax.scatter(sample_positions, y_test, marker='o', label='True')
ax.scatter(sample_positions, y_pred + 0.05, marker='x', label='Predicted')
ax.set_title('True vs Predicted (GaussianNB) — por')
ax.set_xlabel('Test sample index')
ax.set_yticks([0, 1])
ax.legend()
save_fig(fig, 'nb_por_true_vs_pred.png')
plt.close(fig)

# Confusion matrix on the held-out set
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(4, 4))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['fail', 'pass'])
disp.plot(ax=ax)
ax.set_title('Confusion Matrix (GaussianNB)')
save_fig(fig, 'nb_por_confusion_matrix.png')
plt.close(fig)


GaussianNB classification report (studytime, absences, G1 -> passed_12):
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        60
           1       0.87      0.77      0.82        70

    accuracy                           0.82       130
   macro avg       0.82      0.82      0.82       130
weighted avg       0.82      0.82      0.82       130

Saved figure: lab5_outputs\nb_por_true_vs_pred.png
Saved figure: lab5_outputs\nb_por_confusion_matrix.png


In [41]:
# Multinomial Naive Bayes: predict a three-level G3 band from a few features
# Bands: low (0-9), medium (10-14), high (15-20) — you may adjust
por_df['G3_level'] = pd.cut(
    por_df['G3'],
    bins=[-1, 9, 14, 20],
    labels=['low', 'medium', 'high'],
)

features_multi = ['studytime', 'failures', 'schoolsup', 'famsup', 'activities']
X_multi = prepare_for_modeling(por_df, features_multi)

# Encode the band labels as integers (LabelEncoder orders classes alphabetically)
le = LabelEncoder()
y_multi = le.fit_transform(por_df['G3_level'].astype(str))

# MultinomialNB needs non-negative features: if any value is below zero
# (e.g. a -1 category code), shift the whole matrix up by that minimum
min_val = X_multi.min().min()
X_multi_nonneg = X_multi - min_val if min_val < 0 else X_multi.copy()

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_multi_nonneg,
    y_multi,
    test_size=0.2,
    random_state=42,
    stratify=y_multi,
)

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
ypred_m = mnb.predict(X_test)

print('\nMultinomialNB classification report (G3 level prediction):')
print(classification_report(y_test, ypred_m, target_names=le.classes_, zero_division=0))

# Confusion matrix on the held-out set
cm_m = confusion_matrix(y_test, ypred_m)
fig, ax = plt.subplots(figsize=(5, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm_m, display_labels=le.classes_)
disp.plot(ax=ax)
ax.set_title('Confusion Matrix (MultinomialNB)')
save_fig(fig, 'mnb_por_confusion_matrix.png')
plt.close(fig)


MultinomialNB classification report (G3 level prediction):
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        26
         low       0.86      0.30      0.44        20
      medium       0.67      0.99      0.80        84

    accuracy                           0.68       130
   macro avg       0.51      0.43      0.42       130
weighted avg       0.57      0.68      0.59       130

Saved figure: lab5_outputs\mnb_por_confusion_matrix.png


In [45]:
# Save results to a CSV file
print('\nAll steps completed. Generated figures are saved in the folder:', OUT_DIR)
print('Please run this script (or the same steps in a notebook) to reproduce the results and upload outputs to GitHub as required by the assignment.')


def _held_out_results(model_name, X_name, y_name):
    """Return ``(y_true, y_pred)`` for one fitted model's own hold-out set.

    The names ``X_train``/``X_test``/``y_test``/``y_pred`` are reused by every
    modeling cell, so by the time this cell runs they no longer belong to the
    Decision Tree (and ``y_test`` belongs to the MultinomialNB split). The
    test set is therefore rebuilt from the model's own feature/target
    variables with the same split settings used when it was trained
    (test_size=0.2, random_state=42, stratified), which reproduces that split.

    If any of the three global names is missing (cell not run), two empty
    lists are returned so the CSV is still written, as before.
    """
    namespace = globals()
    if not all(key in namespace for key in (model_name, X_name, y_name)):
        return [], []
    X, y = namespace[X_name], namespace[y_name]
    _, X_holdout, _, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    return np.asarray(y_holdout), namespace[model_name].predict(X_holdout)


dt_true, dt_pred = _held_out_results('clf', 'X_dt', 'y_dt')
nb_true, nb_pred = _held_out_results('gnb', 'X_nb', 'y_nb')

# The two test sets differ in length (math vs. Portuguese data), so the columns
# are aligned positionally and the shorter ones are padded with missing values.
# The nullable Int64 dtype keeps labels as integers despite the padding.
results_df = pd.concat(
    [
        pd.Series(np.asarray(values), name=column, dtype='Int64')
        for column, values in (
            ('DT_y_test', dt_true),
            ('DT_y_pred', dt_pred),
            ('NB_y_test', nb_true),
            ('NB_y_pred', nb_pred),
        )
    ],
    axis=1,
)

results_path = os.path.join(OUT_DIR, 'model_results.csv')
results_df.to_csv(results_path, index=False)
print(f"Saved model results to {results_path}")


All steps completed. Generated figures are saved in the folder: lab5_outputs
Please run this script (or the same steps in a notebook) to reproduce the results and upload outputs to GitHub as required by the assignment.
Saved model results to lab5_outputs\model_results.csv
