# Artigo 1

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from tabulate import tabulate
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, train_test_split
import joblib
import pickle





# Carregar os datasets
portugal_data = pd.read_csv('student-por.csv', sep=';')
mathematics_data = pd.read_csv('student-mat.csv', sep=';')

portugal_data['Course'] = 'P'
mathematics_data['Course'] = 'M'

combined_data = pd.concat([portugal_data, mathematics_data], ignore_index=True)

# Calcular a média das colunas G1, G2 e G3
combined_data['result'] = (combined_data[['G1', 'G2', 'G3']].mean(axis=1).round(2) > 10).astype(int)

# Filtrar os alunos que passaram (média maior que 10)
passed_students = combined_data[combined_data['result'] > 10]

# Filtrar os alunos que não passaram (média menor ou igual a 10)
failed_students = combined_data[combined_data['result'] <= 10]


def converter_nota(nota):
    if nota > 10:
        return 1
    else:
        return 0

# Verificar a nova distribuição dos alunos que passaram após o SMOTE
print("Distribuição após oversampling dos alunos que passaram:")

combined_data[['G1', 'G2', 'G3']] = combined_data[['G1', 'G2', 'G3']].applymap(converter_nota)


combined_data['higher'].head(100)

Distribuição após oversampling dos alunos que passaram:


  combined_data[['G1', 'G2', 'G3']] = combined_data[['G1', 'G2', 'G3']].applymap(converter_nota)


0     yes
1     yes
2     yes
3     yes
4     yes
     ... 
95    yes
96    yes
97    yes
98    yes
99    yes
Name: higher, Length: 100, dtype: object

In [46]:
label_encoder = LabelEncoder()

for col in combined_data:
    combined_data[col] = label_encoder.fit_transform(combined_data[col])

X = combined_data.drop(columns=['G3','result','Course'])
y = combined_data['G3']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


dt = DecisionTreeClassifier(random_state=42)
ada_boost = AdaBoostClassifier(estimator=dt, n_estimators=50, random_state=42)  # Use "estimator" em vez de "base_estimator"
ada_boost.fit(X_train, y_train)
y_pred_ada = ada_boost.predict(X_test)



print("Resultados do AdaBoost com J48:")
print(classification_report(y_test, y_pred_ada))
print(f"Acurácia: {accuracy_score(y_test, y_pred_ada)*100:.2f}%")
print(f"Precisão: {precision_score(y_test, y_pred_ada)*100:.2f}%")
print(f"f1-score: {f1_score(y_test, y_pred_ada)*100:.2f}%")
combined_data['higher'].head(100)



Resultados do AdaBoost com J48:
              precision    recall  f1-score   support

           0       0.75      0.80      0.78        41
           1       0.87      0.83      0.85        64

    accuracy                           0.82       105
   macro avg       0.81      0.82      0.81       105
weighted avg       0.82      0.82      0.82       105

Acurácia: 81.90%
Precisão: 86.89%
f1-score: 84.80%


0     1
1     1
2     1
3     1
4     1
     ..
95    1
96    1
97    1
98    1
99    1
Name: higher, Length: 100, dtype: int64

###Balancemento da Classe 'result'

In [47]:
combined_data.groupby('result').size()

oversample = SMOTE(sampling_strategy = 1)

X_over, y_over = oversample.fit_resample(X_train, y_train)

print(pd.Series(y_over).value_counts())


ada_boost = AdaBoostClassifier(estimator=dt, n_estimators=50, random_state=42)
ada_boost.fit(X_over, y_over)
y_pred_ada = ada_boost.predict(X_test)

print("Resultados do AdaBoost com J48:")
print(classification_report(y_test, y_pred_ada))
print(f"Acurácia: {accuracy_score(y_test, y_pred_ada)*100:.2f}%")
print(f"Precisão: {precision_score(y_test, y_pred_ada)*100:.2f}%")
print(f"f1-score: {f1_score(y_test, y_pred_ada)*100:.2f}%")


G3
1    597
0    597
Name: count, dtype: int64




Resultados do AdaBoost com J48:
              precision    recall  f1-score   support

           0       0.75      0.80      0.78        41
           1       0.87      0.83      0.85        64

    accuracy                           0.82       105
   macro avg       0.81      0.82      0.81       105
weighted avg       0.82      0.82      0.82       105

Acurácia: 81.90%
Precisão: 86.89%
f1-score: 84.80%


###Feature selection

In [48]:
selector = SelectKBest(score_func=mutual_info_classif, k='all')
X1 = combined_data.drop(columns=['G3','result','Course'])

y = combined_data['G3']
X_new = selector.fit_transform(X1, y)

feature_scores = pd.DataFrame({
    'Feature': X1.columns,
    'Score': selector.scores_
})

feature_scores = feature_scores.sort_values(by='Score', ascending=False)

new_features = feature_scores.head(12)['Feature'].values

new_features

array(['G2', 'G1', 'failures', 'higher', 'Mjob', 'age', 'Walc',
       'studytime', 'Medu', 'goout', 'sex', 'Fedu'], dtype=object)

### Treinando modelo com os novos features

In [49]:
#Separando dados de teste com X_over(dados de treino depois do balanceamento)
#e new_features(12 melhores atributos do dataset depois do método de ganho de informação)
#new_features = ['G2', 'G1', 'failures', 'Medu', 'paid', 'Fedu', 'Fjob', 'Mjob',
#'guardian', 'age', 'schoolsup', 'higher']
#a cada execução do feature selection, conjuntos de features diferentes são mostradas, acima estão as
#que resultaram o melhor modelo.
newTest = X_over[new_features]

X_train, X_test, y_train, y_test = train_test_split(newTest, y_over, test_size=0.1, random_state=42)

ada_boost.fit(X_train, y_train)
y_pred_ada = ada_boost.predict(X_test)

print("Resultados do AdaBoost com J48:")
print(classification_report(y_test, y_pred_ada))
print(f"Acurácia: {accuracy_score(y_test, y_pred_ada)*100:.2f}%")
print(f"Precisão: {precision_score(y_test, y_pred_ada)*100:.2f}%")
print(f"f1-score: {f1_score(y_test, y_pred_ada)*100:.2f}%")



Resultados do AdaBoost com J48:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        51
           1       0.95      0.88      0.92        69

    accuracy                           0.91       120
   macro avg       0.91      0.91      0.91       120
weighted avg       0.91      0.91      0.91       120

Acurácia: 90.83%
Precisão: 95.31%
f1-score: 91.73%


### Pegando os melhores hiperparametros com gridSearch

In [50]:
# newTest = X_over[new_features]
# X_train, X_test, y_train, y_test = train_test_split(newTest, y_over, test_size=0.1, random_state=42)

# ada_boost = AdaBoostClassifier(random_state=42)

# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'learning_rate': [0.1, 0.5, 1.0],
#     'algorithm': ['SAMME', 'SAMME.R']
# }

# grid_search = GridSearchCV(estimator=ada_boost, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# grid_search.fit(X_train, y_train)

# best_model = grid_search.best_estimator_

# y_pred_ada = best_model.predict(X_test)

# print("Resultados do AdaBoost com os melhores hiperparâmetros:")
# print(classification_report(y_test, y_pred_ada))
# print(f"Acurácia: {accuracy_score(y_test, y_pred_ada)*100:.2f}%")
# print(f"Precisão: {precision_score(y_test, y_pred_ada)*100:.2f}%")
# print(f"f1-score: {f1_score(y_test, y_pred_ada)*100:.2f}%")

# print("Melhores hiperparâmetros:", grid_search.best_params_)


### Modelo final

In [68]:
new_features = ['G2', 'G1', 'failures', 'Medu', 'paid', 'Fedu', 'Fjob', 'Mjob', 'guardian', 'age', 'schoolsup', 'higher']

newTest = X_over[new_features]

X_train, X_test, y_train, y_test = train_test_split(newTest, y_over, test_size=0.1, random_state=42)

#Melhores hiperparametros encontrados para o adaBoost
ada_boost = AdaBoostClassifier(algorithm='SAMME.R', learning_rate=0.5, n_estimators=50, random_state=42)

ada_boost.fit(X_train, y_train)

y_pred_ada = ada_boost.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_ada)
print(f"Acurácia do modelo AdaBoost: {accuracy * 100:.2f}%")

joblib.dump(ada_boost, 'modelo_adaBoost.pkl')

print(ada_boost)
results_df = pd.DataFrame({'Real': y_test, 'Previsto': y_pred_ada})
results_df.head(10)



Acurácia do modelo AdaBoost: 96.67%
AdaBoostClassifier(learning_rate=0.5, random_state=42)
school         1
sex            1
age            2
address        1
famsize        0
Pstatus        1
Medu           1
Fedu           2
Mjob           2
Fjob           2
reason         0
guardian       0
traveltime     0
studytime      0
failures       1
schoolsup      0
famsup         1
paid           0
activities     1
nursery        1
higher         0
internet       1
romantic       1
famrel         4
freetime       2
goout          4
Dalc           4
Walc           4
health         0
absences      12
G1             0
G2             0
G3             0
Course         1
result         0
Name: 500, dtype: int64
