Librerías necesarias

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from scipy.spatial.distance import cdist
import statsmodels.formula.api as smf
from sklearn.neighbors import NearestNeighbors
import statsmodels.api as sm

from sklearn.metrics import root_mean_squared_error

Funciones auxiliares

In [2]:
class mean_KNN:
    """OLS regression on the mean response of the K_0 nearest neighbours.

    Every observation is reduced to a single feature: the average of the
    training responses over its ``K_0`` nearest training points. A linear
    model with an intercept and that feature is then fitted with
    ``statsmodels`` OLS.

    Parameters
    ----------
    K_0 : int
        Number of neighbours whose responses are averaged.

    Attributes (set by ``fit``)
    ---------------------------
    X, y : training inputs and responses with a fresh 0..n-1 index.
    nbrs : fitted ``NearestNeighbors`` index over ``X``.
    model : fitted statsmodels OLS results object.
    """

    def __init__(self, K_0):
        self.K_0 = K_0

    def _design_matrix(self, neighbor_idx):
        """Build the (n, 2) matrix [1, mean neighbour response].

        ``neighbor_idx`` is an (n, K_0) array of positions into the training
        set, as returned by ``NearestNeighbors.kneighbors``.
        """
        # Index positionally on the raw values so the lookup never depends on
        # the Series' index labels.
        neighbor_means = self.y.to_numpy()[neighbor_idx].mean(axis=1)
        return np.column_stack([np.ones(len(neighbor_means)), neighbor_means])

    def fit(self, X, y):
        """Fit the neighbour index and the OLS model.

        Parameters
        ----------
        X : pandas object with the predictor columns (must support
            ``reset_index``).
        y : pandas Series with the response, aligned row-by-row with ``X``.

        Returns
        -------
        self, so calls can be chained.
        """
        self.X = X.reset_index(drop=True)
        self.y = y.reset_index(drop=True)

        self.nbrs = NearestNeighbors(n_neighbors=self.K_0, algorithm='auto').fit(self.X)
        # Calling kneighbors() without a query returns, for every training
        # point, its K_0 neighbours *excluding the point itself*. This avoids
        # assuming that "self" is always the first neighbour returned, which
        # does not hold when several rows share the same coordinates and
        # would leak a point's own response into its feature.
        _, neighbor_idx = self.nbrs.kneighbors()
        self.model = sm.OLS(self.y, self._design_matrix(neighbor_idx)).fit()
        return self

    def predict(self, X):
        """Predict the response for new points.

        The feature of each row of ``X`` is the mean training response over
        its K_0 nearest training points; the fitted OLS model maps it to a
        prediction. Returns one prediction per row of ``X``.
        """
        _, neighbor_idx = self.nbrs.kneighbors(X)
        return self.model.predict(self._design_matrix(neighbor_idx))

Importamos datos

In [3]:
# Tab-separated file with the anomalies; the three data columns are renamed
# to s_x, s_y (presumably the spatial coordinates) and X (the value modelled
# in the cells below).
path = 'https://raw.githubusercontent.com/AlejandroVillazonG/MAT467/main/tareas/T1/anomalies.txt'
data = pd.read_csv(path, sep='\t').set_axis(['s_x', 's_y', 'X'], axis=1)

# Shift the index read from the file down by one so that it starts at 0.
data.index = data.index - 1

data.head()

Unnamed: 0,s_x,s_y,X
0,-85.25,31.57,-0.458687
1,-87.42,32.23,-0.925328
2,-85.87,32.98,-0.437082
3,-88.13,33.13,-0.602672
4,-86.5,31.32,-0.351995


Seleccionamos el mejor modelo mediante validación cruzada $K$-fold con $K=10$, comparando el RMSE medio de cada modelo sobre los pliegues de prueba

In [4]:
# --- Model hyper-parameters -------------------------------------------------
# Model 1: OLS with linear and squared terms in both coordinates plus their
# interaction. In the formula mini-language `s_x * s_y` stands for
# `s_x + s_y + s_x:s_y`; the repeated main effects are only counted once.
formula_1 = 'X ~ s_x + s_y + np.power(s_x, 2) + np.power(s_y, 2) + s_x * s_y'
# Model 2: number of neighbours averaged by mean_KNN.
K_0 = 10

# --- Cross-validation scheme ------------------------------------------------
# Shuffled K-fold with a fixed seed so the folds are the same on every run.
K = 10
kf = KFold(
    n_splits=K,
    shuffle=True,
    random_state=42,
)

In [5]:
# Test RMSE of each model on every fold.
rmses_1 = []  # model 1: OLS on formula_1
rmses_2 = []  # model 2: mean_KNN with K_0 neighbours

for train_index, test_index in kf.split(data):
    train_data, test_data = data.iloc[train_index], data.iloc[test_index]
    y_test = test_data['X']

    # Model 1: the formula picks the columns it needs from the data frame.
    model_1 = smf.ols(formula=formula_1, data=train_data).fit()
    y_pred_1 = model_1.predict(test_data)
    # scikit-learn metrics expect (y_true, y_pred), in that order.
    rmses_1.append(root_mean_squared_error(y_test, y_pred_1))

    # Model 2: trained on the coordinates only (every column except X).
    model_2 = mean_KNN(K_0=K_0)
    model_2.fit(train_data.drop(columns='X'), train_data['X'])
    y_pred_2 = model_2.predict(test_data.drop(columns='X'))
    rmses_2.append(root_mean_squared_error(y_test, y_pred_2))

In [6]:
# Average the per-fold RMSEs of each model and report them side by side.
mean_rmse_1, mean_rmse_2 = (np.mean(fold_rmses) for fold_rmses in (rmses_1, rmses_2))

for label, mean_rmse in (
    ("mean RMSE model1:", mean_rmse_1),
    ("mean RMSE model2:", mean_rmse_2),
):
    print(label, mean_rmse)

mean RMSE model1: 0.8408861201818741
mean RMSE model2: 0.4974716615816866
