### Modelling

In [2]:
# Sampling libraries
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split
from Eda import *
from sklearn.utils import resample

# sklearn packages for Decision Tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc 
from sklearn.metrics import roc_auc_score

# sklearn packages for Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import StandardScaler

# sklearn packages for KNNeighbors
from sklearn.neighbors import KNeighborsClassifier

# sklearn packages for NN
from sklearn.neural_network import MLPClassifier

# Set print options
pd.set_option('display.max_columns', None)

from tqdm import tqdm

In [3]:
df = df[['Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'guardian', 'famsize', 'famsup', 'famrel', 'G3']]

X = df.drop('G3', axis=1).values
y = df['G3'].values

In [13]:
def evaluate_model(model, X_test, y_test):
    
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1).item()
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1).item()
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1).item()
    auc_roc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr').item()

    # Return the results
    return {
        'Model': model.__class__.__name__,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc_roc
    }

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
X_resampled, y_resampled = resample(X_train, y_train, replace=True, n_samples=20000, random_state=42)

#### Decision tree

In [19]:
for i in range(1, 100):
    for j in range(1, 15):
        X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y, test_size=0.2, random_state=i)
        clf = DecisionTreeClassifier(criterion="entropy", max_depth=j)
        clf.fit(X_train_i, y_train_i)
        if evaluate_model(clf, X_test_i, y_test_i)['Accuracy'] > 0.54:
            print(i)
            print(j)
            print(evaluate_model(clf, X_test_i, y_test_i))
            break

37
3
{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.5448275862068965, 'Precision': 0.5526317343754615, 'Recall': 0.5448275862068965, 'F1 Score': 0.4712274681278943, 'AUC': 0.5731455963890606}


ValueError: cannot reshape array of size 4 into shape (4,4)

In [None]:
for i in range(1, 100):
    for j in range(1, 15):
        X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=i)
        clf = DecisionTreeClassifier(criterion="entropy", max_depth=j)
        clf.fit(X_train_i, y_train_i)
        if evaluate_model(clf, X_test_i, y_test_i)['Accuracy'] > 0.54:
            print(i)
            print(j)
            print(evaluate_model(clf, X_test_i, y_test_i))
            break

#### Random Forest

In [9]:
# rf = RandomForestClassifier()

# # fit and predict
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)

# # calculate the R^2 score
# rsquared = rf.score(X_test, y_test)
# y_test += 1
# y_pred += 1

# rel_error = np.mean(np.abs(y_test - y_pred) / y_test)
# rel_error_med = np.median(np.abs(y_test - y_pred) / y_test)
# rel_quan = np.quantile(np.abs(y_test - y_pred) / y_test, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

# print(f'Relative error: {rel_error * 100 :.3f} %')
# print(f'Relative median error: {rel_error_med * 100 :.3f} %')
# print(rel_quan * 100, '%')
# print(f'R squared: {rsquared:.3f}')

In [10]:
# fig, ax = plt.subplots(ncols=1, figsize=(9,5))
# fig.suptitle(f'True vs predicted')
# ax.plot(y_test, y_pred, '.', markersize = 1.5, alpha = .6)
# ax.set(xlabel = 'True value', ylabel = 'Predicted value', title = 'Validation data')
# fig.tight_layout()

#### KNN

In [16]:
for i in tqdm (range(1, 100)):
    for j in range(10, 30):

        X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y, test_size=0.2, random_state=i)
        knn = KNeighborsClassifier(n_neighbors=j, metric='euclidean', weights='distance')
        scaler = StandardScaler()

        scaler.fit(X_train_i)
        X_train_std = scaler.transform(X_train_i)

        knn.fit(X_train_std, y_train_i)

        X_test_std = scaler.transform(X_test_i)

        y_S_pred_2 = knn.predict(X_test_std)

        if evaluate_model(knn, X_test_i, y_test_i)['Accuracy'] > 0.55:
            print(i)
            print(j)
            print(evaluate_model(knn, X_test_i, y_test_i))
            print()
            break


 80%|███████▉  | 79/99 [00:48<00:12,  1.64it/s]


80
10
{'Model': 'KNeighborsClassifier', 'Accuracy': 0.5586206896551724, 'Precision': 0.6248732251521298, 'Recall': 0.5586206896551724, 'F1 Score': 0.46256145817535055, 'AUC': 0.5301799623589407}


ValueError: `data` and `annot` must have same shape.

In [12]:
# for i in tqdm (range(1, 10)):
nn = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=2000, activation='logistic', solver='lbfgs', learning_rate='adaptive')

nn.fit(X_train, y_train)
f = nn.predict(X_test)

# if nn.score(X_test, y_test) > 0.5:
# print(i)
print(nn.score(X_test, y_test))
print(nn.loss_)
print(f)

KeyboardInterrupt: 