### Tarea Regresión

In [1]:
import pandas as pd
import numpy as np

### Dataset

In [2]:
data = "https://raw.githubusercontent.com/bigdatadatafan/datasets-clase/main/car_fuel_efficiency.csv"
df = pd.read_csv(data)
df = df[["engine_displacement","horsepower","vehicle_weight", "model_year", "fuel_efficiency_mpg"]]
df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


### Pregunta 1: Missing Values

In [3]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

### Pregunta 2: Horsepower Median

In [4]:
df["horsepower"].median()

np.float64(149.0)

### FillNa

In [5]:
df["horsepower"] = df.horsepower.fillna(df["horsepower"].mean())

### Split

In [6]:
n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

### Pregunta 3: Llenando NAs

In [7]:
def linear_regression(X, y):
    full_ones = np.ones(X.shape[0])
    X = np.column_stack([full_ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    # Obtenemos el peso por por el producto de la inversa, traspuestas y el producto vectorial
    # de la característica
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def set_X(df, fillna_value):
    df = df.fillna(fillna_value)
    X = df.values
    return X

def rmse_raw(y, y_pred):
    err = y_pred - y
    mse = (err ** 2).mean()
    return np.sqrt(mse)

In [8]:
# Usando el 0

X_train = set_X(df_train, fillna_value=0)
w_0, w = linear_regression(X_train, y_train)

X_val = set_X(df_val, fillna_value=0)
y_pred = w_0 + X_val.dot(w)

rmse_raw(y_val, y_pred)

np.float64(0.46360196121428665)

In [9]:
# Usando la mediana

mean = df_train.horsepower.mean()

X_train = set_X(df_train, fillna_value=mean)
w_0, w = linear_regression(X_train, y_train)

X_val = set_X(df_val, fillna_value=mean)
y_pred = w_0 + X_val.dot(w)

rmse_raw(y_val, y_pred)

np.float64(0.46360196121428665)

### Pregunta 4: Mejor Regularización

In [10]:
def linear_regression_reg(X, y, r=0.0):
    full_ones = np.ones(X.shape[0])
    X = np.column_stack([full_ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [12]:
for r in [0, 0.01, 1, 10, 100]:
    w_0, w = linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    rmse_val = rmse_raw(y_val, y_pred)
    print('%06s %0.5f' % (r, rmse_val))

     0 0.46360
  0.01 0.46337
     1 0.46949
    10 0.47035
   100 0.47044


### Pregunta 5: RMSE Standard Deviation

In [14]:
rmses = []

for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    n = len(df)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    
    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values

    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']

    X_train = set_X(df_train, fillna_value=0)
    w_0, w = linear_regression(X_train, y_train)

    X_val = set_X(df_val, fillna_value=0)
    y_pred = w_0 + X_val.dot(w)

    result = rmse_raw(y_val, y_pred)
    print(s, result)
    
    rmses.append(result)

0 0.46104263821922264
1 0.47035678215893817
2 0.4697366372512365
3 0.4634261215128222
4 0.4574530559204262
5 0.4748644264532335
6 0.46676493323090085
7 0.4578916430716268
8 0.4618071911531348
9 0.4554222556998976


In [15]:
np.std(rmses)

np.float64(0.006053774732736977)

### Pregunta 6: Evaluación en dataset test

In [16]:
n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [19]:
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)

X_full_train = set_X(df_full_train, fillna_value=0)
y_full_train = np.concatenate([y_train, y_val])
w_0, w = linear_regression_reg(X_full_train, y_full_train, r=0.001)

X_test = set_X(df_test, fillna_value=0)
y_pred = w_0 + X_test.dot(w)

result = rmse_raw(y_test, y_pred)
print(result)

0.4576223738013142
