In [3]:
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 27 15:05:08 2024

@author: JMGC2008
"""
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score,make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

def abrirArchivo(fileR):
    #https://archive.ics.uci.edu/dataset/20/census+income
    head=["age","workclass","fnlwgt","edu","edu-num","mar-sta","occ","rela","race","sex","cap-gain","cap-loss","hpw","country","outcome"]
    f=pd.read_csv(fileR,sep=',')
    #agregar encabezado
    f.columns=head
	#substitute missing values with NaN
    f.replace(' ?',np.nan,inplace=True)
    #replace NaN with the mode
    #https://github.com/pandas-dev/pandas/issues/9750
    f = f.fillna(f.mode().iloc[0])
    pd.set_option('display.max_columns', None)
    X,y=process(f)
    
    print(y)
    #nn2(X,y)
    nnCV(X,y)
    #gridsearch(X, y)
    
    
def process(f):
    #all columns except the last one
    X=f.iloc[:,:-1]
    y=f["outcome"]
    ordList=["sex"]
    oheList=["workclass","edu","mar-sta","occ","rela","race","country"]
    X=preproc(X,ordList)
    X=oneHot(X,oheList)
    #print (X)
    #scaling
    x=X[["age","fnlwgt","edu-num","cap-gain","cap-loss","hpw"]].values.astype(int)
    min_max_scaler=preprocessing.MinMaxScaler()
    x_scaled=min_max_scaler.fit_transform(x)
    X[["age","fnlwgt","edu-num","cap-gain","cap-loss","hpw"]]=x_scaled
    #print(X)
    #the output should be encoded in the form of 0 and 1
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    return X,y_encoded
	
def preproc(f,indices):
    #indice debería ser una lista []
    pp=preprocessing.OrdinalEncoder()
    #double [], inside for the list and outside for the index
    #f[["country"]]=pp.fit_transform(f[["country"]])
    f[indices]=pp.fit_transform(f[indices])
    return f

def oneHot(f, indices):
    f = pd.concat([f, pd.get_dummies(f[indices])], axis=1)
    #f=f.drop(indices,1) original
    f=f.drop(indices,axis=1)  #cambio version
    return f	

def compute_test_loss(mlp, X_test, y_test):
    test_loss_curve = []
    for epoch in range(mlp.max_iter):
        # Make predictions on the test set
        y_pred_proba = mlp.predict_proba(X_test)
        # Compute the negative log likelihood loss (cross-entropy loss) for binary classification
        positive_probs = y_pred_proba[:, 1][y_test == 1]
        test_loss = -np.mean(np.log(positive_probs))  # Use binary classification index 1 for positive class
        test_loss_curve.append(test_loss)
        # Perform one epoch of training (optional)
        mlp.partial_fit(X_test, y_test)
    return test_loss_curve


	
def nn2(X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size = 0.3)
    #mlp=MLPClassifier(activation='relu', batch_size=10, hidden_layer_sizes=50,learning_rate="constant",max_iter=1000,solver="adam")
    #https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    mlp=MLPClassifier(activation='tanh', hidden_layer_sizes=(100,),learning_rate_init=0.001,momentum= 0.99,max_iter=1000,solver="sgd")
    
    print ("============train================")
    print (mlp.fit(X_train,y_train))
    print (mlp.score(X_train,y_train))
    
    train_loss_curve = mlp.loss_curve_
    
    predictions=mlp.predict(X_train)
    print(confusion_matrix(y_train,predictions))
    
    
    print ("============test================")
    predictions=mlp.predict(X_test)
    print(confusion_matrix(y_test,predictions))
    print (mlp.score(X_test,y_test))
    
    #mlp.fit(X_test,y_test)
    #val_scores_curve = mlp.validation_scores_ #plot the accuracy scores, early_stopping=True
    #test_loss_curve = mlp.loss_curve_
    test_loss_curve = compute_test_loss(mlp, X_test, y_test)
    
    
    plt.plot(train_loss_curve,color='blue', linestyle='-',label='Training Loss')
    plt.plot(test_loss_curve,color='orange', linestyle='--',label='Test Loss')
    #plt.plot(val_scores_curve,color='red', linestyle='-',label='Validation Scores')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()


originalclass = []
predictedclass = []

def classification_report_with_accuracy_score(y_true, y_pred):
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    #return accuracy_score(y_true, y_pred) # return accuracy score
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_true, y_pred)
    
    return accuracy

    

def nnCV(X,y):
    mlp=MLPClassifier(activation='relu',batch_size=10,hidden_layer_sizes=50,learning_rate="constant",max_iter=250,solver="adam")
    nested=cross_val_score(mlp, X,y,cv=5,scoring=make_scorer(classification_report_with_accuracy_score))
    print(nested) #entre 5 a 10 el valor de cv

def gridsearch(X, y):
    # Define the parameter grid to search
    param_grid = {
        'hidden_layer_sizes': [(50,),(100,),(50,50),(100,50)],
        'activation': ['tanh', 'relu','logistic'],
    }
    mlp=MLPClassifier(max_iter=1000)
    grid_search = GridSearchCV(estimator=mlp,param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)

    #Best Parameters: {'activation': 'tanh', 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001, 'momentum': 0.99, 'solver': 'sgd'}
    #Best Score: 0.8545147420147419
    
    #Best Parameters: {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.001, 'momentum': 0.99, 'solver': 'sgd'}


      
def main():
    f="adult.data"
    abrirArchivo(f)

if __name__=="__main__":
    main()


[0 0 0 ... 0 0 1]
[0.83445946 0.84305897 0.84060197 0.84459459 0.83937346]
