### Training

In [12]:
import pandas as pd

In [13]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [14]:
# Read the data from the CSV file
data = pd.read_csv("../data/ml/final.csv")

In [15]:
data.columns

Index(['user_id', 'month', 'duration', 'costo_extra_llamadas', 'mb_used',
       'costo_extra_datos', 'sms_count', 'costo_extra_sms', 'usd_monthly_pay',
       'age', 'costo_total'],
      dtype='object')

In [16]:
# Crear los lags
data['lag_costo_extra_llamadas_1'] = data['costo_extra_llamadas'].shift(1)
data['lag_costo_extra_llamadas_2'] = data['costo_extra_llamadas'].shift(2)
data['lag_costo_extra_llamadas_3'] = data['costo_extra_llamadas'].shift(3)


data['lag_costo_extra_datos_1'] = data['costo_extra_datos'].shift(1)
data['lag_costo_extra_datos_2'] = data['costo_extra_datos'].shift(2)
data['lag_costo_extra_datos_3'] = data['costo_extra_datos'].shift(3)


data['lag_costo_extra_sms_1'] = data['costo_extra_sms'].shift(1)
data['lag_costo_extra_sms_2'] = data['costo_extra_sms'].shift(2)
data['lag_costo_extra_sms_3'] = data['costo_extra_sms'].shift(3)


# Eliminar valores nulos creados por los shifts
data.dropna(subset=['lag_costo_extra_llamadas_1', 'lag_costo_extra_llamadas_2', 'lag_costo_extra_llamadas_3',
                    'lag_costo_extra_datos_1', 'lag_costo_extra_datos_2', 'lag_costo_extra_datos_3',
                    'lag_costo_extra_sms_1', 'lag_costo_extra_sms_2', 'lag_costo_extra_sms_3',], inplace=True)

In [17]:
# Actualizar X e y con los nuevos lags
X = data[['month','duration', 'mb_used',
           'sms_count', 'usd_monthly_pay', 'age', 
          'lag_costo_extra_llamadas_1', 'lag_costo_extra_llamadas_2',
          'lag_costo_extra_datos_1', 'lag_costo_extra_datos_2',
          'lag_costo_extra_sms_1', 'lag_costo_extra_sms_2',]]
y = data['costo_total']

In [18]:
# Escalar las características numéricas
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Regresión Lineal

In [19]:
model_ridge = Ridge(alpha=2.0)
model_ridge.fit(X_train, y_train)

# Predecimos
y_pred = model_ridge.predict(X_test)


In [20]:
regresion_mse = mean_squared_error(y_test, y_pred)
regresion_mae = mean_absolute_error(y_test, y_pred)
regresion_r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {regresion_mse}')
print(f'Mean Absolute Error (MAE): {regresion_mae}')
print(f'R-squared (R²): {regresion_r2}')


Mean Squared Error (MSE): 638.8203373305718
Mean Absolute Error (MAE): 19.282674229951404
R-squared (R²): 0.6426311432461458


In [23]:
# Coeficientes
print(model_ridge.coef_)

[-0.08487015 -0.00882139  0.00407523 -0.05655453  0.41915638  0.03192869
  0.31291931  0.31116883  0.13613461  0.10097539  2.96029513  0.46866786]


Se puede ver que lo que más suma al costo final es el sobre paso de mensajes sms.
Por otra parte, vemos que algunos valores no aportan mucho al costo total.  Algunos restan el costo total

## Gradient Descent

In [21]:
import numpy as np


# Convertir X e y a matrices y vectores de NumPy si no lo son
X = X.values
y = y.values

# Normalizar las características (X)
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Definir la función de costo (Mean Squared Error)
def cost_function(X, y, theta):
    m = len(y)
    predictions = X.dot(theta)
    cost = (1 / (2 * m)) * np.sum((predictions - y) ** 2)
    return cost

# Implementar Gradient Descent
def gradient_descent(X, y, theta, learning_rate, iterations):
    m = len(y)
    cost_history = []

    for i in range(iterations):
        gradients = (1 / m) * X.T.dot(X.dot(theta) - y)
        theta = theta - learning_rate * gradients
        cost_history.append(cost_function(X, y, theta))
    
    return theta, cost_history

# Agregar un término de sesgo a X
X_b = np.c_[np.ones((len(X), 1)), X]

# Inicializar parámetros
theta_initial = np.zeros(X_b.shape[1])
learning_rate = 0.01
iterations = 1000

# Ejecutar Gradient Descent
theta_optimal, cost_history = gradient_descent(X_b, y, theta_initial, learning_rate, iterations)

print(f"Optimal parameters: {theta_optimal}")


Optimal parameters: [60.46056029 -0.23253764 -1.42326266 30.68426363 -1.06121708  9.5280586
  0.50378922  1.03440452  0.52810554  7.24845595  4.0622146   0.50518361
  0.10842979]


## Stochastic Gradient Descent

In [22]:
def stochastic_gradient_descent(X, y, theta, learning_rate, iterations):
    m = len(y)
    cost_history = []

    for i in range(iterations):
        for j in range(m):
            random_index = np.random.randint(m)
            xi = X[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = xi.T.dot(xi.dot(theta) - yi)
            theta = theta - learning_rate * gradients
        cost_history.append(cost_function(X, y, theta))
    
    return theta, cost_history

# Ejecutar Stochastic Gradient Descent
theta_optimal_sgd, cost_history_sgd = stochastic_gradient_descent(X_b, y, theta_initial, learning_rate, iterations)

print(f"Optimal parameters (SGD): {theta_optimal_sgd}")


Optimal parameters (SGD): [60.12573562  1.14712319  0.98121607 35.81743476 -4.00042607  2.28104565
 -4.44631066  3.25021609 -1.49754111 12.40785707  3.87583293 -4.62178717
  3.12033621]
