<a href="https://colab.research.google.com/github/BrunTitoWars/TopAvanIA/blob/main/ai_topics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the datasets

In [None]:
import pandas as pd
from rich.jupyter import display

In [None]:
# Read the "mat" student records (semicolon-separated) and report their size.
database_mat = pd.read_csv('student-mat.csv', sep=';')
print(f'{len(database_mat.index)} lines x {len(database_mat.columns)} columns of mat students')

# Same for the "por" student records.
database_por = pd.read_csv('student-por.csv', sep=';')
print(f'{len(database_por.index)} lines x {len(database_por.columns)} columns of por students')

395 lines x 33 columns of mat students
649 lines x 33 columns of por students


In [None]:
# Keep only the three grade columns that the analysis below works with.
database_mat = database_mat.loc[:, ['G1', 'G2', 'G3']]
database_mat.head()

Unnamed: 0,G1,G2,G3
0,5,6,6
1,5,5,6
2,7,8,10
3,15,14,15
4,6,10,10


In [None]:
# Keep only the three grade columns that the analysis below works with.
database_por = database_por.loc[:, ['G1', 'G2', 'G3']]
database_por.head()

Unnamed: 0,G1,G2,G3
0,0,11,11
1,9,11,11
2,12,13,12
3,14,14,14
4,11,13,13


# Results analysis

### Paper's table 2: Results of the descriptive analysis

In [None]:
from scipy.stats import pearsonr
from IPython.display import display

# Display label for each grade column.
variable_labels = {
    "G1": "First year",
    "G2": "Second year",
    "G3": "Third year"
}

variables = list(variable_labels)


def _describe_columns(data, labels):
    """Return label, mean and standard deviation lists for every column named in `labels`."""
    return {
        "Variable": [labels[name] for name in labels],
        "Mean": [data[name].mean() for name in labels],
        "Standard Deviation": [data[name].std() for name in labels],
    }


# The same summary is computed for both datasets.
stats_mat = _describe_columns(database_mat, variable_labels)
stats_por = _describe_columns(database_por, variable_labels)

stats_table_mat = pd.DataFrame(stats_mat)
stats_table_por = pd.DataFrame(stats_por)

In [None]:
# Render the per-variable mean / standard deviation table built from database_mat.
display(stats_table_mat)

Unnamed: 0,Variable,Mean,Standard Deviation
0,First year,10.908861,3.319195
1,Second year,10.713924,3.761505
2,Third year,10.41519,4.581443


In [None]:
# Render the per-variable mean / standard deviation table built from database_por.
display(stats_table_por)

Unnamed: 0,Variable,Mean,Standard Deviation
0,First year,11.399076,2.745265
1,Second year,11.570108,2.913639
2,Third year,11.906009,3.230656


### Paper's table 3: Results of the descriptive correlation analysis

In [None]:
# Build the correlation matrix one column at a time. Each cell holds the
# Pearson coefficient (3 decimals), starred when its p-value is below 0.01;
# the diagonal is written as a plain "1".
mat_row_labels = [variable_labels[v] for v in variables]
correlation_data_mat = {}

for col_var in variables:
    column_cells = []
    for row_var in variables:
        if row_var == col_var:
            column_cells.append("1")
            continue
        coef, p_value = pearsonr(database_mat[row_var], database_mat[col_var])
        stars = "**" if p_value < 0.01 else ""
        column_cells.append(f"{coef:.3f}{stars}")
    correlation_data_mat[variable_labels[col_var]] = column_cells

correlation_table_mat = pd.DataFrame(correlation_data_mat, index=mat_row_labels)

display(correlation_table_mat)

Unnamed: 0,First year,Second year,Third year
First year,1,0.852**,0.801**
Second year,0.852**,1,0.905**
Third year,0.801**,0.905**,1


In [None]:
# Build the correlation matrix one column at a time. Each cell holds the
# Pearson coefficient (3 decimals), starred when its p-value is below 0.01;
# the diagonal is written as a plain "1".
por_row_labels = [variable_labels[v] for v in variables]
correlation_data_por = {}

for col_var in variables:
    column_cells = []
    for row_var in variables:
        if row_var == col_var:
            column_cells.append("1")
            continue
        coef, p_value = pearsonr(database_por[row_var], database_por[col_var])
        stars = "**" if p_value < 0.01 else ""
        column_cells.append(f"{coef:.3f}{stars}")
    correlation_data_por[variable_labels[col_var]] = column_cells

correlation_table_por = pd.DataFrame(correlation_data_por, index=por_row_labels)

display(correlation_table_por)

Unnamed: 0,First year,Second year,Third year
First year,1,0.865**,0.826**
Second year,0.865**,1,0.919**
Third year,0.826**,0.919**,1


In [None]:
import statsmodels.api as sm

def train_model(y_column, x_columns, data):
    """Fit an ordinary least squares regression on columns of `data`.

    Parameters
    ----------
    y_column : column label of the dependent variable.
    x_columns : list of column labels used as predictors.
    data : table holding all of the columns above.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # add_constant contributes the intercept term to the design matrix.
    design_matrix = sm.add_constant(data[x_columns])
    return sm.OLS(data[y_column], design_matrix).fit()

# Regress G3 on G1 and G2, separately for each dataset.
g3_mat_model = train_model(y_column="G3", x_columns=["G1", "G2"], data=database_mat)
g3_por_model = train_model(y_column="G3", x_columns=["G1", "G2"], data=database_por)

### Paper's table 4: Multiple regression adjustment model

In [None]:
def generate_model_line(model_name, model):
    """Summarise a fitted regression model as one row of the model-fit table.

    Parameters
    ----------
    model_name : str
        Label stored under the "Model" key.
    model : object
        Fitted results object exposing ``rsquared``, ``rsquared_adj``,
        ``mse_resid``, ``fvalue`` and ``f_pvalue``.

    Returns
    -------
    dict
        Keys "Model", "R", "R Square", "Adjusted R Square", "Standard Error",
        "F Statistic" and "Sig. F Change". Numeric entries are rounded to
        3 decimals; "Sig. F Change" is the string "< 0.001" when the F-test
        p-value is below 0.001.
    """
    r_squared = model.rsquared
    f_pvalue = model.f_pvalue

    # Test with `<` rather than falling through from `>=`: a NaN p-value
    # compares False either way, and it must not be reported as "< 0.001".
    if f_pvalue < 0.001:
        significance = "< 0.001"
    else:
        # Same 3-decimal precision as every other statistic in the row.
        significance = round(f_pvalue, 3)

    return {
        "Model": model_name,
        # R is the square root of R Square.
        "R": round(r_squared ** 0.5, 3),
        "R Square": round(r_squared, 3),
        "Adjusted R Square": round(model.rsquared_adj, 3),
        # Standard error is the square root of the residual mean square.
        "Standard Error": round(model.mse_resid ** 0.5, 3),
        "F Statistic": round(model.fvalue, 3),
        "Sig. F Change": significance
    }

# One summary row per fitted model, in table order.
lines = [
    generate_model_line(label, fitted)
    for label, fitted in (
        ("Third year - Mat", g3_mat_model),
        ("Third year - Por", g3_por_model),
    )
]

regression_table = pd.DataFrame(lines)
display(regression_table)

Unnamed: 0,Model,R,R Square,Adjusted R Square,Standard Error,F Statistic,Sig. F Change
0,Third year - Mat,0.907,0.822,0.821,1.937,906.134,< 0.001
1,Third year - Por,0.921,0.848,0.847,1.262,1798.671,< 0.001


In [None]:
# Pull the intercept and the two slopes out of each fitted model.
intercept_g3_mat, beta1_g3_mat, beta2_g3_mat = (
    g3_mat_model.params[term] for term in ("const", "G1", "G2")
)

intercept_g3_por, beta1_g3_por, beta2_g3_por = (
    g3_por_model.params[term] for term in ("const", "G1", "G2")
)

## The regression equation

In [None]:
# Human-readable regression equation with coefficients shown to one decimal.
# The first predictor is labelled "First year" (previously misspelled "Fist year").
equation_mat = f"Third year = {intercept_g3_mat:.1f} + [{beta1_g3_mat:.1f} * (First year)] + [{beta2_g3_mat:.1f} * (Second year)]"
print("Regression Equation - Mat:")
print(equation_mat)

Regression Equation - Mat:
Third year = -1.8 + [0.2 * (Fist year)] + [1.0 * (Second year)]


In [None]:
# Human-readable regression equation with coefficients shown to one decimal.
print("Regression Equation - Por:")
equation_por = (
    f"Third year = {intercept_g3_por:.1f}"
    f" + [{beta1_g3_por:.1f} * (First year)]"
    f" + [{beta2_g3_por:.1f} * (Second year)]"
)
print(equation_por)

Regression Equation - Por:
Third year = -0.2 + [0.1 * (First year)] + [0.9 * (Second year)]


## Comparing predicted and actual values

In [None]:
# Apply the fitted equation to every row, stored to 2 decimals.
predicted_g3_mat = intercept_g3_mat + beta1_g3_mat * database_mat["G1"] + beta2_g3_mat * database_mat["G2"]
database_mat["original_predicted_g3"] = predicted_g3_mat.round(2)

# Absolute gap between the actual grade and the stored (rounded) prediction.
residual_mat = database_mat["G3"] - database_mat["original_predicted_g3"]
database_mat["original_absolute_error"] = residual_mat.abs().round(2)

database_mat[["G3", "original_predicted_g3", "original_absolute_error"]].head()

Unnamed: 0,G3,original_predicted_g3,original_absolute_error
0,6,4.86,1.14
1,6,3.87,2.13
2,10,7.14,2.86
3,15,14.29,0.71
4,10,8.96,1.04


In [None]:
# Apply the fitted equation to every row, stored to 2 decimals.
predicted_g3_por = intercept_g3_por + beta1_g3_por * database_por["G1"] + beta2_g3_por * database_por["G2"]
database_por["original_predicted_g3"] = predicted_g3_por.round(2)

# Absolute gap between the actual grade and the stored (rounded) prediction.
residual_por = database_por["G3"] - database_por["original_predicted_g3"]
database_por["original_absolute_error"] = residual_por.abs().round(2)

database_por[["G3", "original_predicted_g3", "original_absolute_error"]].head()

Unnamed: 0,G3,original_predicted_g3,original_absolute_error
0,11,9.7,1.3
1,11,11.04,0.04
2,12,13.28,1.28
3,14,14.47,0.47
4,13,13.13,0.13


In [None]:
# NOTE: rebinds `variable_labels` / `variables`, replacing the grade labels
# defined earlier in the notebook.
variable_labels = {
    "G3": "G3 - Mat",
    "original_predicted_g3": "Original predicted G3 - Mat",
    "original_absolute_error": "Original absolute error - Mat"
}

variables = list(variable_labels)

# Accumulate one label / mean / standard deviation entry per column.
stats_mat = {"Variable": [], "Mean": [], "Standard Deviation": []}
for column_name in variables:
    column_values = database_mat[column_name]
    stats_mat["Variable"].append(variable_labels[column_name])
    stats_mat["Mean"].append(column_values.mean())
    stats_mat["Standard Deviation"].append(column_values.std())

stats_table_mat = pd.DataFrame(stats_mat)

display(stats_table_mat)

Unnamed: 0,Variable,Mean,Standard Deviation
0,G3 - Mat,10.41519,4.581443
1,Original predicted G3 - Mat,10.414861,4.155055
2,Original absolute error - Mat,1.137848,1.5601


In [None]:
# NOTE: rebinds `variable_labels` / `variables` with the labels for the
# "por" prediction summary.
variable_labels = {
    "G3": "G3 - Por",
    "original_predicted_g3": "Original predicted G3 - Por",
    "original_absolute_error": "Original absolute error - Por"
}

variables = [*variable_labels]

# Look each column up once, then derive both statistics from it.
por_summary_columns = [database_por[name] for name in variables]
stats_por = {
    "Variable": list(variable_labels.values()),
    "Mean": [column.mean() for column in por_summary_columns],
    "Standard Deviation": [column.std() for column in por_summary_columns],
}

stats_table_por = pd.DataFrame(stats_por)

display(stats_table_por)

Unnamed: 0,Variable,Mean,Standard Deviation
0,G3 - Por,11.906009,3.230656
1,Original predicted G3 - Por,11.90621,2.974618
2,Original absolute error - Por,0.785285,0.985733


# Our proposed change