# Regresión Lineal Múltiple


## Cómo importar las librerías


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importar el data set


In [3]:
dataset = pd.read_csv('50_Startups.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values


## Codificar datos categóricos

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
encoded = x.copy()
labelEncoderX = LabelEncoder()
encoded[:, 3] = labelEncoderX.fit_transform(encoded[:, 3])
oneHotEncoder = make_column_transformer((OneHotEncoder(), [3]), remainder = "passthrough")
encoded = oneHotEncoder.fit_transform(encoded)
# Evitar la trampa de las variables ficticias eliminamos una de las columnas dummy
encoded = encoded[:, 1:]
print(encoded[:5,:])

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]]


## Conjunto de entrenamiento y testing

Dividir el data set en conjunto de entrenamiento y conjunto de testing

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(encoded, y, test_size = 0.2, random_state = 0)

## Ajustar el modelo de Regresión lineal múltiple con el conjunto de entrenamiento


In [6]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(x_train, y_train)
# Predicción de los resultados en el conjunto de testing
y_pred = regression.predict(x_test)
# Redondeamos los valores de las predicciones
y_pred = [round(valor, 2) for valor in y_pred]
print(x_train[:5,:])
print(" valores reales de test")
print(y_test[:5])
print(" predicciones de test")
print(y_pred[:5])

[[1.0 0.0 55493.95 103057.49 214634.81]
 [0.0 1.0 46014.02 85047.44 205517.64]
 [1.0 0.0 75328.87 144135.98 134050.07]
 [0.0 0.0 46426.07 157693.92 210797.67]
 [1.0 0.0 91749.16 114175.79 294919.57]]
 valores reales de test
[103282.38 144259.4  146121.95  77798.83 191050.39]
 predicciones de test
[103015.2, 132582.28, 132447.74, 71976.1, 178537.48]


## Construir el modelo óptimo de RLM utilizando la Eliminación hacia atrás

#Se ha añadido el modificador .tolist() al X_opt para adaptarse a Python 3.7

In [7]:
import statsmodels.api as sm
X = np.append(arr = np.ones((50,1)).astype(int), values = encoded, axis = 1)
print(X[:5,:])
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regression_OLS = sm.OLS(endog = y, exog = X_opt.tolist()).fit()
print("Todas las variables")
print(regression_OLS.summary())

X_opt = X[:, [0, 1, 3, 4, 5]]
regression_OLS = sm.OLS(endog = y, exog = X_opt.tolist()).fit()
print("Eliminamos la variable 2")
print(regression_OLS.summary())

X_opt = X[:, [0, 3, 4, 5]]
regression_OLS = sm.OLS(endog = y, exog = X_opt.tolist()).fit()
print("Eliminamos la variable 1 y 2")
print(regression_OLS.summary())

X_opt = X[:, [0, 3, 5]]
regression_OLS = sm.OLS(endog = y, exog = X_opt.tolist()).fit()
print("Eliminamos la variable 1, 2 y 4")
regression_OLS.summary()

X_opt = X[:, [0, 3]]
regression_OLS = sm.OLS(endog = y, exog = X_opt.tolist()).fit()
print("Eliminamos la variable 1, 2, 4 y 5")
regression_OLS.summary()

[[1 0.0 1.0 165349.2 136897.8 471784.1]
 [1 0.0 0.0 162597.7 151377.59 443898.53]
 [1 1.0 0.0 153441.51 101145.55 407934.54]
 [1 0.0 1.0 144372.41 118671.85 383199.62]
 [1 1.0 0.0 142107.34 91391.77 366168.42]]
Todas las variables
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     169.9
Date:                Sat, 27 Jan 2024   Prob (F-statistic):           1.34e-27
Time:                        13:47:56   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      44   BIC:                             1074.
Df Model:                           5                                         
Covariance Type:            nonrobust                     

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sat, 27 Jan 2024",Prob (F-statistic):,3.5000000000000004e-32
Time:,13:47:56,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


## Mejora utilizando una función backward elimination con p-valores solamente:

In [16]:
import statsmodels.api as sm
def backwardElimination(x, sl):    
    numVars = len(x[0])    
    for i in range(0, numVars):        
        regressor_OLS = sm.OLS(y, x.tolist()).fit()        
        maxVar = max(regressor_OLS.pvalues).astype(float)        
        if maxVar > sl:            
            for j in range(0, numVars - i):                
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):                    
                    x = np.delete(x, j, 1)    
    regressor_OLS.summary()    
    return x

def backwardElimination2(x, SL):    
    numVars = len(x[0])    
    temp = np.zeros((50,6)).astype(int)    
    for i in range(0, numVars):        
        regressor_OLS = sm.OLS(y, x.tolist()).fit()        
        maxVar = max(regressor_OLS.pvalues).astype(float)        
        adjR_before = regressor_OLS.rsquared_adj.astype(float)        
        if maxVar > SL:            
            for j in range(0, numVars - i):                
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):                    
                    temp[:,j] = x[:, j]                    
                    x = np.delete(x, j, 1)                    
                    tmp_regressor = sm.OLS(y, x.tolist()).fit()                    
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)                    
                    if (adjR_before >= adjR_after):                        
                        x_rollback = np.hstack((x, temp[:,[0,j]]))                        
                        x_rollback = np.delete(x_rollback, j, 1)     
                        regressor_OLS.summary()                        
                        return x_rollback                    
                    else:                        
                        continue    
    regressor_OLS.summary()    
    return x 
 
 
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)
print("backward elimination:")
print(X_Modeled[:5,:])
 
X_Modeled2 = backwardElimination2(X_opt, SL)
print("backward elimination 2:")
print(X_Modeled2[:5,:])

backward elimination
[[1 165349.2]
 [1 162597.7]
 [1 153441.51]
 [1 144372.41]
 [1 142107.34]]
backward elimination 2
[[1 165349.2 471784]
 [1 162597.7 443898]
 [1 153441.51 407934]
 [1 144372.41 383199]
 [1 142107.34 366168]]
