In [4]:

#Se importa numpy para facilitar las operaciones con matrices
import random
import math
import numpy as np

#Descenso estocástico de gradiente para regresión lineal.
#training_data_X es una matriz que representa el conjunto de x_i's
#training_data_y es una matriz que representa el conjunto de y_i's. Esta matriz tiene únicamente una 
#Pre: training_data_X  y training_data_y deben tener el mismo orden, es decir, para todo elemento X_i 
# de la matriz training_data_X, y_i en la matriz training_data_y debe ser su respectiva pareja.
# ejemplo, la fila 0 de training_data_X tiene como respectivo valor y el que está dado por 
# la fila 0 de training_data_y
def sgd(training_data_X, training_data_y,learning_rate= 0.001, max_iterations = 10000):
    
    #Para evitar que la embarre y ponga un learning rate demasiado alto que lleve a que el método no converja 
    if(learning_rate > 0.0001):
        learning_rate = 0.0001
    
    #Se inicializa W en ceros, de dimensión (columnas X, 1).
    #Se hace así para que no se tenga que transponer W.
    W = np.zeros(shape = (1, len(training_data_X[0])))
    
    iterations = 0
    no_improv_it = 0
    last_error = 0
    new_error = 0
    
    #Se estimará el gradiente a partir, únicamente, de uno de los datos
    while iterations < max_iterations and no_improv_it < 100 :
        
        #Seleccionar un dato al azar.
        i = math.floor(random.random()*len(training_data_X))
    
        #Calcular error del dato seleccionado al azar.
        e = np.dot(training_data_X[i], W.transpose())[0] - training_data_y[i]
        #Estimar gradiente con el error anterior
        estimated_grad = np.multiply(e, training_data_X[i])
        
        W = W - np.multiply(learning_rate, estimated_grad)
        
        new_error = residual_sum_of_squares(W,training_data_X, training_data_y)
    
        #Si no hubo una mejora en el error cuadrático, reducir tasa de aprendizaje (puede que se esté "saltando" el punto óptimo)
        if last_error < new_error:
            learning_rate*=0.9
            no_improv_it = 0
        elif last_error == new_error:
            no_improv_it += 1
        else:
            no_improv_it = 0
        
        last_error = new_error
        iterations += 1        
    
    print("----------------------------------------------")
    print("SDG finished, best residual_sum_of_squares %s" % last_error)
    print("W found: %s" % W)
    print(" R^2 found for the model: %s" % rcuadrado(W, training_data_X, training_data_y))

def sgd_large_dataset(training_data_X, training_data_y,learning_rate= 0.001, max_iterations = 10000):
    
    #Para evitar que la embarre y ponga un learning rate demasiado alto que lleve a que el método no converja 
    if(learning_rate > 0.0001):
        learning_rate = 0.0001
    
    #Se inicializa W en ceros, de dimensión (columnas X, 1).
    #Se hace así para que no se tenga que transponer W.
    W = np.zeros(shape = (1, len(training_data_X[0])))
    
    iterations = 0
    no_improv_it = 0
    last_error = 0
    new_error = 0
    
    #Se estimará el gradiente a partir, únicamente, de uno de los datos
    while iterations < max_iterations and no_improv_it < 100 :
        
        #Seleccionar un dato al azar.
        i = math.floor(random.random()*len(training_data_X))
    
        #Calcular error del dato seleccionado al azar.
        e = np.dot(training_data_X[i], W.transpose())[0] - training_data_y[i]
        #Estimar gradiente con el error anterior
        estimated_grad = np.multiply(e, training_data_X[i])
        
        W = W - np.multiply(learning_rate, estimated_grad)
            
        #Si no hubo una mejora en el error cuadrático, reducir tasa de aprendizaje (puede que se esté "saltando" el punto óptimo)
        if last_error < e:
            learning_rate*=0.9
            no_improv_it = 0
        elif last_error == e:
            no_improv_it += 1
        else:
            no_improv_it = 0
        
        last_error = e
        iterations += 1        
    
    print("----------------------------------------------")
    print("SDG finished, best residual_sum_of_squares %s" % last_error)
    print("W found: %s" % W)
    print(" R^2 found for the model: %s" % rcuadrado(W, training_data_X, training_data_y))
           
    
def rcuadrado(W, test_data_X, test_data_y):
    
    residual_sum = residual_sum_of_squares(W, test_data_X, test_data_y)
    
    mean = np.mean(test_data_y)
        
    total_sum_squares = 0
    for i in range(0, len(test_data_y)):
        total_sum_squares +=  math.pow(test_data_y[i] - mean,2)
    
    print(residual_sum)
    print(total_sum_squares)

    
    return 1 - residual_sum/total_sum_squares

    
        
#Calcula el error de la función, se usa sólo una vez por iteración.
def residual_sum_of_squares(W, training_data_X, training_data_y):
    
    sum = 0
    for i in range(0, len(training_data_X)):
        sum+= math.pow(training_data_y[i] - np.dot(training_data_X[i] , W.transpose())[0],2)
    
    return 0.5 * sum

**Pruebas rápidas del método:**

In [2]:
data_matrix = np.loadtxt(open("./WineQuality/winequality-white.csv", "rb"), delimiter=";", skiprows=1)
print("Filas de la matriz: " + str(len(data_matrix)))
print("Columnas de la matriz: " + str(len(data_matrix[0])))

X = np.resize(data_matrix, (len(data_matrix), len(data_matrix[0])-1))
y = data_matrix[:,11]

print(X)
print(y)

Filas de la matriz: 4898
Columnas de la matriz: 12
[[ 7.       0.27     0.36    ...  3.       0.45     8.8    ]
 [ 6.       6.3      0.3     ...  0.994    3.3      0.49   ]
 [ 9.5      6.       8.1     ... 97.       0.9951   3.26   ]
 ...
 [ 0.13     0.28     0.9     ...  0.52    11.2      6.     ]
 [ 6.7      0.48     0.49    ...  3.13     0.4     13.     ]
 [ 6.       6.7      0.48    ...  0.98926  3.13     0.4    ]]
[6. 6. 6. ... 6. 7. 6.]


In [5]:
sgd(X, y)

----------------------------------------------
SDG finished, best residual_sum_of_squares 12385.973466999734
W found: [[0.04022056 0.01843098 0.02762465 0.04211826 0.01571573 0.02872201
  0.02644516 0.02270781 0.03372551 0.02253315 0.02959164]]
12385.973466999734
3840.989791751859
 R^2 found for the model: -2.2246827350589093


**Pruebas con un set de datos que sé que sí sirve para regresión lineal**

In [272]:
import mglearn
X, y = mglearn.datasets.make_wave(n_samples=5000)
sgd(X, y)

----------------------------------------------
SDG finished, best residual_sum_of_squares 2473.2461019723005
W found: [[0.00847107]]
2473.2461019723005
5060.6524237669555
 R^2 found for the model: 0.5112791998207791


In [8]:
data_matrix = np.loadtxt(open("./szeged-weather/weatherHistory.csv", "rb"), delimiter=",", skiprows=1)
print("Filas de la matriz: " + str(len(data_matrix)))
print("Columnas de la matriz: " + str(len(data_matrix[0])))

X = np.resize(data_matrix, (len(data_matrix), len(data_matrix[0])-1))
y = data_matrix[:,len(data_matrix[0])-1]

from sklearn.preprocessing import normalize

X = normalize(X)

print(X)
print(y)

Filas de la matriz: 96453
Columnas de la matriz: 8
[[9.05579391e-03 1.34989541e-02 2.39965260e-01 ... 0.00000000e+00
  9.70501732e-01 7.06405038e-03]
 [8.48918419e-04 8.92371172e-03 1.36061592e-02 ... 1.50957726e-02
  0.00000000e+00 9.68749454e-01]
 [3.52695969e-02 4.19656694e-03 4.57610142e-02 ... 9.95464717e-01
  7.29856187e-02 0.00000000e+00]
 ...
 [9.00047384e-02 3.31159719e-03 9.73277692e-02 ... 9.73999173e-01
  1.00779045e-01 0.00000000e+00]
 [9.87377024e-01 1.45763744e-02 4.47340179e-04 ... 2.24520036e-02
  1.54624105e-01 1.47174919e-02]
 [0.00000000e+00 9.90067869e-01 1.56932152e-02 ... 1.67556919e-02
  2.40355007e-02 1.36604153e-01]]
[0.89 0.86 0.89 ... 0.56 0.6  0.61]


In [9]:
from sklearn.linear_model import LinearRegression

sgd_large_dataset(X, y)
print(LinearRegression().fit(X, y).score(X,y))

----------------------------------------------
SDG finished, best residual_sum_of_squares -0.2996184730085636
W found: [[0.0003061  0.00015195 0.00029239 0.00027839 0.00043315 0.00023312
  0.00018452]]
27865.096466011073
3685.391540427899
 R^2 found for the model: -6.560959577927436
0.0003938931136273638


In [19]:
data_matrix = np.loadtxt(open("./bostonhoustingmlnd/housing.csv", "rb"), delimiter=",", skiprows=1)
print("Filas de la matriz: " + str(len(data_matrix)))
print("Columnas de la matriz: " + str(len(data_matrix[0])))

X = np.resize(data_matrix, (len(data_matrix), len(data_matrix[0])-1))
y = data_matrix[:,len(data_matrix[0])-1]

from sklearn.preprocessing import normalize

X = normalize(X)

print(X)
print(y)

Filas de la matriz: 489
Columnas de la matriz: 4
[[3.78272947e-01 2.86509395e-01 8.80239708e-01]
 [1.00000000e+00 1.27400794e-05 1.81349206e-05]
 [3.92416225e-05 9.99999999e-01 1.58399471e-05]
 ...
 [8.82481429e-05 9.99999996e-01 2.41852336e-05]
 [9.94521694e-05 8.51243145e-05 9.99999991e-01]
 [1.71027880e-01 7.60950132e-01 6.25862893e-01]]
[ 504000.  453600.  728700.  701400.  760200.  602700.  480900.  569100.
  346500.  396900.  315000.  396900.  455700.  428400.  382200.  417900.
  485100.  367500.  424200.  382200.  285600.  411600.  319200.  304500.
  327600.  291900.  348600.  310800.  386400.  441000.  266700.  304500.
  277200.  275100.  283500.  396900.  420000.  441000.  518700.  646800.
  732900.  558600.  531300.  518700.  445200.  405300.  420000.  348600.
  302400.  407400.  413700.  430500.  525000.  491400.  396900.  743400.
  518700.  663600.  489300.  411600.  392700.  336000.  466200.  525000.
  693000.  493500.  407400.  462000.  365400.  438900.  508200.  455700.


In [20]:

sgd(X, y)
print(LinearRegression().fit(X, y).score(X,y))

----------------------------------------------
SDG finished, best residual_sum_of_squares 35410831518667.43
W found: [[ 85860.20149609  95549.2304364  110783.67523526]]
35410831518667.43
13340654818159.514
 R^2 found for the model: -1.6543548275055913
0.004591194163290235


In [3]:
data_matrix = np.loadtxt(open("./BostonHousing_preprocessed.csv", "r", encoding="utf-8"), delimiter=",", skiprows=1)
print("Filas de la matriz: " + str(len(data_matrix)))
print("Columnas de la matriz: " + str(len(data_matrix[0])))
X = np.resize(data_matrix, (len(data_matrix), len(data_matrix[0])-1))
y = data_matrix[:,len(data_matrix[0])-1]
print(X)
print(y)


Filas de la matriz: 505
Columnas de la matriz: 14
[[2.73100e-02 0.00000e+00 7.07000e+00 ... 1.78000e+01 3.96900e+02
  9.14000e+00]
 [2.16000e+01 2.72900e-02 0.00000e+00 ... 2.42000e+02 1.78000e+01
  3.92830e+02]
 [4.03000e+00 3.47000e+01 3.23700e-02 ... 3.00000e+00 2.22000e+02
  1.87000e+01]
 ...
 [1.81000e+01 0.00000e+00 5.84000e+02 ... 2.13200e+01 1.91000e+01
  1.55757e+01]
 [0.00000e+00 1.81000e+01 0.00000e+00 ... 3.68740e+02 1.81300e+01
  1.91000e+01]
 [1.30751e+01 0.00000e+00 1.81000e+01 ... 2.02000e+01 3.96900e+02
  1.47600e+01]]
[21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4 18.2
 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8 18.4
 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6 25.3
 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4 24.7
 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9 24.2
 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9 23.9
 26.6 22.5 22.2 23.6 28.7 22.6 2

In [5]:
from sklearn.linear_model import LinearRegression

sgd(X, y)
print(LinearRegression().fit(X, y).score(X,y))

----------------------------------------------
SDG finished, best residual_sum_of_squares 1.0979158658993548e+116
W found: [[-6.44744552e+52  1.06029957e+53  1.73609848e+53 -3.56394706e+52
   3.65127475e+52 -9.47775179e+52  2.53179660e+53 -5.70792112e+51
  -8.37659693e+52  1.81109360e+52  1.69085492e+53 -9.62280881e+52
   3.88578166e+51]]
1.0979158658993548e+116
42714.138495049454
 R^2 found for the model: -2.570380451490559e+111
0.005382807398750189
