<a href="https://colab.research.google.com/github/BrunTitoWars/TopAvanIA/blob/main/final_ai_topics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing database

In [202]:
import pandas as pd
from rich.jupyter import display

In [203]:
database_mat = pd.read_csv('student-mat.csv', sep=';')
print(f'{database_mat.shape[0]} lines x {database_mat.shape[1]} columns of mat students')

database_por = pd.read_csv('student-por.csv', sep=';')
print(f'{database_por.shape[0]} lines x {database_por.shape[1]} columns of por students')

395 lines x 33 columns of mat students
649 lines x 33 columns of por students


In [204]:
database_mat = database_mat[['G1', 'G2', 'G3']]
database_mat.head()

Unnamed: 0,G1,G2,G3
0,5,6,6
1,5,5,6
2,7,8,10
3,15,14,15
4,6,10,10


In [205]:
database_por = database_por[['G1', 'G2', 'G3']]
database_por.head()

Unnamed: 0,G1,G2,G3
0,0,11,11
1,9,11,11
2,12,13,12
3,14,14,14
4,11,13,13


# Results analysis

### Paper's table 2: Results of the descriptive analysis

In [206]:
from scipy.stats import pearsonr
from IPython.display import display

variable_labels = {
    "G1": "First year",
    "G2": "Second year",
    "G3": "Third year"
}

variables = list(variable_labels.keys())

stats_mat = {
    "Variable": [variable_labels[var] for var in variables],
    "Mean": [database_mat[var].mean() for var in variables],
    "Standard Deviation": [database_mat[var].std() for var in variables]
}

stats_por = {
    "Variable": [variable_labels[var] for var in variables],
    "Mean": [database_por[var].mean() for var in variables],
    "Standard Deviation": [database_por[var].std() for var in variables]
}

stats_table_mat = pd.DataFrame(stats_mat)

stats_table_por = pd.DataFrame(stats_por)

In [207]:
display(stats_table_mat)

Unnamed: 0,Variable,Mean,Standard Deviation
0,First year,10.908861,3.319195
1,Second year,10.713924,3.761505
2,Third year,10.41519,4.581443


In [208]:
display(stats_table_por)

Unnamed: 0,Variable,Mean,Standard Deviation
0,First year,11.399076,2.745265
1,Second year,11.570108,2.913639
2,Third year,11.906009,3.230656


### Paper's table 3: Results of the descriptive correlation analysis

In [209]:
correlation_data_mat = {variable_labels[var]: [] for var in variables}

for var1 in variables:
    for var2 in variables:
        if var1 == var2:
            correlation_data_mat[variable_labels[var2]].append("1")
        else:
            r, p = pearsonr(database_mat[var1], database_mat[var2])
            formatted = f"{r:.3f}" + ("**" if p < 0.01 else "")
            correlation_data_mat[variable_labels[var2]].append(formatted)

correlation_table_mat = pd.DataFrame(correlation_data_mat, index=[variable_labels[v] for v in variables])

display(correlation_table_mat)

Unnamed: 0,First year,Second year,Third year
First year,1,0.852**,0.801**
Second year,0.852**,1,0.905**
Third year,0.801**,0.905**,1


In [210]:
correlation_data_por = {variable_labels[var]: [] for var in variables}

for var1 in variables:
    for var2 in variables:
        if var1 == var2:
            correlation_data_por[variable_labels[var2]].append("1")
        else:
            r, p = pearsonr(database_por[var1], database_por[var2])
            formatted = f"{r:.3f}" + ("**" if p < 0.01 else "")
            correlation_data_por[variable_labels[var2]].append(formatted)

correlation_table_por = pd.DataFrame(correlation_data_por, index=[variable_labels[v] for v in variables])

display(correlation_table_por)

Unnamed: 0,First year,Second year,Third year
First year,1,0.865**,0.826**
Second year,0.865**,1,0.919**
Third year,0.826**,0.919**,1


## Training Model

In [211]:
import statsmodels.api as sm

def train_model(y_column, x_columns, data):
    X = sm.add_constant(data[x_columns])
    y = data[y_column]
    model = sm.OLS(y, X).fit()
    return model

rl_mat_model = train_model("G3", ["G1", "G2"], database_mat)

rl_por_model = train_model("G3", ["G1", "G2"], database_por)

### Paper's table 4: Multiple regression adjustment model

In [212]:
def generate_model_line(model_name, model):
    r_squared = model.rsquared
    adj_r_squared = model.rsquared_adj
    r = r_squared**0.5
    std_error = model.mse_resid**0.5
    f_stat = model.fvalue
    f_pvalue = model.f_pvalue

    return {
        "Model": model_name,
        "R": round(r, 3),
        "R Square": round(r_squared, 3),
        "Adjusted R Square": round(adj_r_squared, 3),
        "Standard Error": round(std_error, 3),
        "F Statistic": round(f_stat, 3),
        "Sig. F Change": f_pvalue if f_pvalue >= 0.001 else "< 0.001"
    }

lines = [
    generate_model_line("Third year - Mat", rl_mat_model),
    generate_model_line("Third year - Por", rl_por_model)
]

regression_table = pd.DataFrame(lines)
display(regression_table)

Unnamed: 0,Model,R,R Square,Adjusted R Square,Standard Error,F Statistic,Sig. F Change
0,Third year - Mat,0.907,0.822,0.821,1.937,906.134,< 0.001
1,Third year - Por,0.921,0.848,0.847,1.262,1798.671,< 0.001


In [213]:
intercept_rl_mat = rl_mat_model.params["const"]
beta1_rl_mat = rl_mat_model.params["G1"]
beta2_rl_mat = rl_mat_model.params["G2"]

intercept_rl_por = rl_por_model.params["const"]
beta1_rl_por = rl_por_model.params["G1"]
beta2_rl_por = rl_por_model.params["G2"]

## The regression equation

In [214]:
equation_mat = f"Third year = {intercept_rl_mat:.1f} + [{beta1_rl_mat:.1f} * (Fist year)] + [{beta2_rl_mat:.1f} * (Second year)]"
print("Regression Equation - Mat:")
print(equation_mat)

Regression Equation - Mat:
Third year = -1.8 + [0.2 * (Fist year)] + [1.0 * (Second year)]


In [215]:
equation_por = f"Third year = {intercept_rl_por:.1f} + [{beta1_rl_por:.1f} * (First year)] + [{beta2_rl_por:.1f} * (Second year)]"
print("Regression Equation - Por:")
print(equation_por)

Regression Equation - Por:
Third year = -0.2 + [0.1 * (First year)] + [0.9 * (Second year)]


## Comparing predicted and actual values

In [216]:
database_mat["original_predicted_g3"] = (
    intercept_g3_mat +
    beta1_g3_mat * database_mat["G1"] +
    beta2_g3_mat * database_mat["G2"]
).round(2)

database_mat["original_absolute_error"] = (
    database_mat["G3"] - database_mat["original_predicted_g3"]
).abs().round(2)

database_mat[["G3", "original_predicted_g3", "original_absolute_error"]].head()

Unnamed: 0,G3,original_predicted_g3,original_absolute_error
0,6,4.86,1.14
1,6,3.87,2.13
2,10,7.14,2.86
3,15,14.29,0.71
4,10,8.96,1.04


In [217]:
database_por["original_predicted_g3"] = (
    intercept_g3_por +
    beta1_g3_por * database_por["G1"] +
    beta2_g3_por * database_por["G2"]
).round(2)

database_por["original_absolute_error"] = (
    database_por["G3"] - database_por["original_predicted_g3"]
).abs().round(2)

database_por[["G3", "original_predicted_g3", "original_absolute_error"]].head()

Unnamed: 0,G3,original_predicted_g3,original_absolute_error
0,11,9.7,1.3
1,11,11.04,0.04
2,12,13.28,1.28
3,14,14.47,0.47
4,13,13.13,0.13


In [218]:
variable_labels = {
    "G3": "G3 - Mat",
    "original_predicted_g3": "Original predicted G3 - Mat",
    "original_absolute_error": "Original absolute error - Mat"
}

variables = list(variable_labels.keys())

stats_mat = {
    "Variable": [variable_labels[var] for var in variables],
    "Mean": [database_mat[var].mean() for var in variables],
    "Standard Deviation": [database_mat[var].std() for var in variables]
}

stats_table_mat = pd.DataFrame(stats_mat)

display(stats_table_mat)

Unnamed: 0,Variable,Mean,Standard Deviation
0,G3 - Mat,10.41519,4.581443
1,Original predicted G3 - Mat,10.414861,4.155055
2,Original absolute error - Mat,1.137848,1.5601


In [219]:
variable_labels = {
    "G3": "G3 - Por",
    "original_predicted_g3": "Original predicted G3 - Por",
    "original_absolute_error": "Original absolute error - Por"
}

variables = list(variable_labels.keys())

stats_por = {
    "Variable": [variable_labels[var] for var in variables],
    "Mean": [database_por[var].mean() for var in variables],
    "Standard Deviation": [database_por[var].std() for var in variables]
}

stats_table_por = pd.DataFrame(stats_por)

display(stats_table_por)

Unnamed: 0,Variable,Mean,Standard Deviation
0,G3 - Por,11.906009,3.230656
1,Original predicted G3 - Por,11.90621,2.974618
2,Original absolute error - Por,0.785285,0.985733


# Our change purpose

## XGBoost Regressor

In [220]:
from xgboost import XGBRegressor
import sklearn

In [221]:
xgb_model_mat = XGBRegressor()

xgb_model_por = XGBRegressor()

In [222]:
from sklearn.model_selection import train_test_split

X_mat = sm.add_constant(database_mat[["G1", "G2"]])
y_mat = database_mat['G3']

xgb_model_mat.fit(X_mat, y_mat)

X_mat_train, X_mat_test, y_mat_train, y_mat_test = train_test_split(X_mat, y_mat, test_size=0.2, random_state=42)

y_mat_pred = xgb_model_mat.predict(X_mat_test)

In [223]:
X_por = sm.add_constant(database_por[["G1", "G2"]])
y_por = database_por['G3']

xgb_model_por.fit(X_por, y_por)

X_por_train, X_por_test, y_por_train, y_por_test = train_test_split(X_por, y_por, test_size=0.2, random_state=42)

y_por_pred = xgb_model_por.predict(X_por_test)

In [224]:
results_mat = X_mat_test.copy()
results_mat["G3"] = y_mat_test
results_mat["original_predicted_g3"] = (
    intercept_g3_mat +
    beta1_g3_mat * results_mat["G1"] +
    beta2_g3_mat * results_mat["G2"]
).round(2)
results_mat["original_absolute_error"] = (
    results_mat["G3"] - results_mat["original_predicted_g3"]
).abs().round(2)
results_mat["xgboost_predicted_g3"] = y_mat_pred.round(2)
results_mat["xgboost_absolute_error"] = (results_mat["G3"] - results_mat["xgboost_predicted_g3"]).abs().round(2)

results_mat[["G3", "xgboost_predicted_g3", "xgboost_absolute_error"]].head()

Unnamed: 0,G3,xgboost_predicted_g3,xgboost_absolute_error
78,10,7.09,2.91
371,12,12.0,0.0
248,5,5.0,0.0
55,10,9.36,0.64
390,9,8.69,0.31


In [225]:
results_por = X_por_test.copy()
results_por["G3"] = y_por_test
results_por["original_predicted_g3"] = (
    intercept_g3_por +
    beta1_g3_por * results_por["G1"] +
    beta2_g3_por * results_por["G2"]
).round(2)
results_por["original_absolute_error"] = (
    results_por["G3"] - results_por["original_predicted_g3"]
).abs().round(2)
results_por["xgboost_predicted_g3"] = y_por_pred.round(2)
results_por["xgboost_absolute_error"] = (results_por["G3"] - results_por["xgboost_predicted_g3"]).abs().round(2)

results_por[["G3", "xgboost_predicted_g3", "xgboost_absolute_error"]].head()

Unnamed: 0,G3,xgboost_predicted_g3,xgboost_absolute_error
636,19,17.629999,1.37
220,12,11.39,0.61
594,18,17.799999,0.2
429,11,11.19,0.19
72,11,11.01,0.01


In [226]:
variable_labels = {
    "G3": "G3 - Mat",
    "original_predicted_g3": "Original predicted G3 - Mat",
    "original_absolute_error": "Original absolute error - Mat",
    "xgboost_predicted_g3": "XGBoost predicted G3 - Mat",
    "xgboost_absolute_error": "XGBoost absolute error - Mat"
}

variables = list(variable_labels.keys())

stats_mat = {
    "Variable": [variable_labels[var] for var in variables],
    "Mean": [results_mat[var].mean() for var in variables],
    "Standard Deviation": [results_mat[var].std() for var in variables]
}

stats_table_mat = pd.DataFrame(stats_mat)

display(stats_table_mat)

Unnamed: 0,Variable,Mean,Standard Deviation
0,G3 - Mat,10.772152,4.557185
1,Original predicted G3 - Mat,10.64481,4.283469
2,Original absolute error - Mat,1.233671,1.63983
3,XGBoost predicted G3 - Mat,10.637848,4.367468
4,XGBoost absolute error - Mat,1.06038,1.456577


In [227]:
variable_labels = {
    "G3": "G3 - Por",
    "original_predicted_g3": "Original predicted G3 - Por",
    "original_absolute_error": "Original absolute error - Por",
    "xgboost_predicted_g3": "XGBoost predicted G3 - Por",
    "xgboost_absolute_error": "XGBoost absolute error - Por"
}

variables = list(variable_labels.keys())

stats_por = {
    "Variable": [variable_labels[var] for var in variables],
    "Mean": [results_por[var].mean() for var in variables],
    "Standard Deviation": [results_por[var].std() for var in variables]
}

stats_table_por = pd.DataFrame(stats_por)

display(stats_table_por)

Unnamed: 0,Variable,Mean,Standard Deviation
0,G3 - Por,12.353846,3.134854
1,Original predicted G3 - Por,12.395308,3.104232
2,Original absolute error - Por,0.718231,0.898882
3,XGBoost predicted G3 - Por,12.374847,2.984588
4,XGBoost absolute error - Por,0.655923,0.904929


In [234]:
def generate_model_line(model_name, model):
    r_squared = model.rsquared
    adj_r_squared = model.rsquared_adj
    r = r_squared**0.5
    std_error = model.mse_resid**0.5
    f_stat = model.fvalue
    f_pvalue = model.f_pvalue

    return {
        "Model": model_name,
        "R": round(r, 3),
        "R Square": round(r_squared, 3),
        "Adjusted R Square": round(adj_r_squared, 3),
        "Standard Error": round(std_error, 3),
        "F Statistic": round(f_stat, 3),
        "Sig. F Change": f_pvalue if f_pvalue >= 0.001 else "< 0.001"
    }

def generate_model_line_XGB(model_name, model, X_test, y_test):
    y_pred = model.predict(X_test)

    r_squared = r2_score(y_test, y_pred)

    n = X_test.shape[0]
    p = X_test.shape[1]
    adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

    mse_resid = mean_squared_error(y_test, y_pred)
    std_error = np.sqrt(mse_resid)

    return {
        "Model": model_name,
        "R": round(np.sqrt(r_squared), 3),
        "R Square": round(r_squared, 3),
        "Adjusted R Square": round(adj_r_squared, 3),
        "Standard Error": round(std_error, 3),
    }

lines = [
    generate_model_line_XGB("XGBoost - Matemática", xgb_model_mat, X_mat_test, y_mat_test),
    generate_model_line_XGB("XGBoost - Português", xgb_model_por, X_por_test, y_por_test),
    generate_model_line("Linear - Matemática", rl_mat_model),
    generate_model_line("Linear - Português", rl_por_model)
]

regression_table = pd.DataFrame(lines)
display(regression_table)


Unnamed: 0,Model,R,R Square,Adjusted R Square,Standard Error,F Statistic,Sig. F Change
0,XGBoost - Matemática,0.918,0.843,0.837,1.794,,
1,XGBoost - Português,0.934,0.872,0.869,1.115,,
2,Linear - Matemática,0.907,0.822,0.821,1.937,906.134,< 0.001
3,Linear - Português,0.921,0.848,0.847,1.262,1798.671,< 0.001
