In [99]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd


# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Gráficos
# ==============================================================================
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Asunciones y Preprocesamiento
# ==============================================================================
from scipy import stats
import math
from scipy.stats import levene
import researchpy as rp
from sklearn.preprocessing import StandardScaler
import itertools

# ANOVA
# ==============================================================================
import statsmodels.api as sm
from statsmodels.formula.api import ols


#Linear Regresion con Sklearn
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics




# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')


# Tamaño gráficas
# ==============================================================================
plt.rcParams["figure.figsize"] = (10,8)

In [100]:
df = pd.read_csv("/mnt/c/Users/USUARIO/Desktop/Adalab/-project-da-promo-A-module-3-team-2/df_estandarizado.csv", index_col = 0)
df.head(2)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,instant
0,0,0,1,0,1.497785,-1.469753,1.112127,-0.827613,-0.680818,1.252343,-0.387833,-0.755455,-1.927745,-1.820972,-1.72968
1,0,0,1,0,-1.495052,-1.469753,1.112127,-0.722069,-0.741507,0.480996,0.748899,-1.046996,-1.91748,-1.916078,-1.724934


# ESTE MODELO ES CON LOS DATOS SIN OUTLIERS Y ESTANDARIZADO 

In [101]:
# iniciamos nuestro anova. Recordemos que la variable respuesta es el precio de las casas (median_house_value)
lm = ols('casual ~ season + yr + mnth + holiday + weekday + workingday + weathersit + temp + atemp + hum + windspeed + instant', data=df).fit()
sm.stats.anova_lm(lm)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
season,1.0,31.807019,31.807019,100.281423,3.497654e-22
yr,1.0,45.808317,45.808317,144.424828,1.936657e-30
mnth,1.0,6.443962,6.443962,20.316575,7.663137e-06
holiday,1.0,2.547091,2.547091,8.030489,0.00472894
weekday,1.0,3.630472,3.630472,11.446183,0.0007553438
workingday,1.0,201.644507,201.644507,635.746416,6.307503e-101
weathersit,1.0,30.912397,30.912397,97.460853,1.222809e-21
temp,1.0,171.618919,171.618919,541.081501,1.301596e-89
atemp,1.0,1.041063,1.041063,3.282271,0.07045021
hum,1.0,0.92584,0.92584,2.918996,0.0879751


- `F`: es un test que se utiliza para evaluar la capacidad explicativa que tiene la variable predictora sobre la variación de la variable respuesta. Es decir, pretende determinar si de entre todos los valores de la variable predictora, al menos una tiene capacidad de explicar una parte significativa de la variación de la variable respuesta.


- `PR(>F)`: se refiere al *p-valor* que, recordemos nos ayudaba a determinar si se rechaza la hipótesis nula, siendo ésta que la variable no tiene efecto sobre la variable respuesta.    

    Recordemos que:     
    - Si el valor p es menor que o igual a 0.05, debemos rechazar la hipótesis nula y concluimos que esa variable si influye en el precio de las casas. 

    - Si el valor p es mayor a 0.0,  no contamos con suficientes evidencias para rechazar la hipótesis de que las medias de población son todas iguales, por lo tanto, esa variable no tiene efecto sobre el precio de las casas.

Por lo tanto, de todos estos resultados podemos ver como todas las variables son significativas, es decir, su *p-valor* es menor de 0.05, y por lo tanto, afectan al precio de una casa.

In [102]:
lm.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.688
Model:,OLS,Adj. R-squared:,0.683
Method:,Least Squares,F-statistic:,132.0
Date:,"Mon, 13 Jun 2022",Prob (F-statistic):,2.1e-172
Time:,18:50:57,Log-Likelihood:,-610.14
No. Observations:,730,AIC:,1246.0
Df Residuals:,717,BIC:,1306.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.2328,0.910,-1.355,0.176,-3.019,0.553
season,0.0894,0.035,2.527,0.012,0.020,0.159
yr,1.4254,0.871,1.637,0.102,-0.284,3.135
mnth,0.0610,0.073,0.830,0.407,-0.083,0.205
holiday,-0.4008,0.130,-3.088,0.002,-0.656,-0.146
weekday,0.0772,0.021,3.658,0.000,0.036,0.119
workingday,-0.5610,0.022,-25.938,0.000,-0.603,-0.518
weathersit,-0.0921,0.028,-3.334,0.001,-0.146,-0.038
temp,0.3268,0.166,1.969,0.049,0.001,0.653

0,1,2,3
Omnibus:,81.7,Durbin-Watson:,1.098
Prob(Omnibus):,0.0,Jarque-Bera (JB):,182.683
Skew:,0.635,Prob(JB):,2.14e-40
Kurtosis:,5.096,Cond. No.,499.0


In [103]:
df.head(2)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,instant
0,0,0,1,0,1.497785,-1.469753,1.112127,-0.827613,-0.680818,1.252343,-0.387833,-0.755455,-1.927745,-1.820972,-1.72968
1,0,0,1,0,-1.495052,-1.469753,1.112127,-0.722069,-0.741507,0.480996,0.748899,-1.046996,-1.91748,-1.916078,-1.724934


In [104]:
# lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X
X = df.drop(["registered", "cnt"], axis =1)
y = df['casual']

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

In [106]:
# iniciamos el método de Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [107]:
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

In [108]:
def metricas(y_test, y_train, y_test_pred, y_train_pred):
    
    
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    return df

In [109]:
results = metricas(y_test, y_train, y_pred_test, y_pred_train)
results

Unnamed: 0,MAE,MSE,RMSE,R2,set
0,7.377963e-15,6.891083e-29,8.301255e-15,1.0,test
1,7.053452e-15,6.536828000000001e-29,8.085065e-15,1.0,train


     Da r2 de 1, así que lo eliminamos

# SEGUNDO MODELO QUITANDO LOS OUTLIERS PERO SIN ESTANDARIZAR LAS VARIABLES

In [110]:
df_2= pd.read_csv("/mnt/c/Users/USUARIO/Desktop/Adalab/-project-da-promo-A-module-3-team-2/df_sin_outliers.csv", index_col= 0)
df_2.head(2)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,0,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,0,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801


In [113]:
df_2.drop(["temp", 'casual', 'registered', 'cnt'], axis=1)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,atemp,hum,windspeed
0,0,0,1,0,6,0,2,18.18125,80.5833,10.749882
1,0,0,1,0,0,0,2,17.68695,69.6087,16.652113
2,0,0,1,0,1,1,1,9.47025,43.7273,16.636703
3,0,0,1,0,2,1,1,10.60610,59.0435,10.739832
4,0,0,1,0,3,1,1,11.46350,43.6957,12.522300
...,...,...,...,...,...,...,...,...,...,...
725,0,1,12,0,4,1,2,11.33210,65.2917,23.458911
726,0,1,12,0,5,1,2,12.75230,59.0000,10.416557
727,0,1,12,0,6,0,2,12.12000,75.2917,8.333661
728,0,1,12,0,0,0,1,11.58500,48.3333,23.500518


In [114]:
# iniciamos nuestro anova. Recordemos que la variable respuesta es el precio de las casas (median_house_value)
lm = ols('casual ~ season + yr + mnth + holiday + weekday + workingday + weathersit + atemp + hum + windspeed', data=df_2).fit()
sm.stats.anova_lm(lm)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
season,1.0,14968670.0,14968670.0,98.935254,6.301120000000001e-22
yr,1.0,21557810.0,21557810.0,142.486081,4.299907e-30
mnth,1.0,3032587.0,3032587.0,20.043847,8.799257e-06
holiday,1.0,1198684.0,1198684.0,7.922688,0.005015383
weekday,1.0,1708533.0,1708533.0,11.29253,0.0008193017
workingday,1.0,94895730.0,94895730.0,627.212208,5.203676999999999e-100
weathersit,1.0,14547650.0,14547650.0,96.152546,2.171829e-21
atemp,1.0,80953210.0,80953210.0,535.059295,6.401274e-89
hum,1.0,701578.7,701578.7,4.637076,0.03162012
windspeed,1.0,1197143.0,1197143.0,7.912499,0.005043393


In [115]:
lm.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.683
Model:,OLS,Adj. R-squared:,0.679
Method:,Least Squares,F-statistic:,155.2
Date:,"Mon, 13 Jun 2022",Prob (F-statistic):,4.4999999999999995e-172
Time:,18:54:50,Log-Likelihood:,-5383.6
No. Observations:,730,AIC:,10790.0
Df Residuals:,719,BIC:,10840.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,654.1067,104.068,6.285,0.000,449.794,858.420
season,63.5716,24.397,2.606,0.009,15.673,111.470
yr,285.7507,29.185,9.791,0.000,228.453,343.048
mnth,-15.4078,7.620,-2.022,0.044,-30.368,-0.447
holiday,-264.5295,89.583,-2.953,0.003,-440.404,-88.655
weekday,27.1512,7.252,3.744,0.000,12.914,41.388
workingday,-825.8318,32.111,-25.718,0.000,-888.874,-762.789
weathersit,-107.5395,36.021,-2.985,0.003,-178.258,-36.821
atemp,45.0561,1.969,22.885,0.000,41.191,48.921

0,1,2,3
Omnibus:,82.068,Durbin-Watson:,1.098
Prob(Omnibus):,0.0,Jarque-Bera (JB):,190.915
Skew:,0.625,Prob(JB):,3.4899999999999994e-42
Kurtosis:,5.172,Cond. No.,516.0


In [116]:
# lo primero que hacemos es definir cual es nuestra variable Y y nuestras variables X
X = df_2.drop(columns= ["registered", "cnt", "temp"], axis =1)
y = df_2[['casual']]

In [117]:
X.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,atemp,hum,windspeed,casual
0,0,0,1,0,6,0,2,18.18125,80.5833,10.749882,331
1,0,0,1,0,0,0,2,17.68695,69.6087,16.652113,131
2,0,0,1,0,1,1,1,9.47025,43.7273,16.636703,120
3,0,0,1,0,2,1,1,10.6061,59.0435,10.739832,108
4,0,0,1,0,3,1,1,11.4635,43.6957,12.5223,82


In [118]:
y.head()

Unnamed: 0,casual
0,331
1,131
2,120
3,108
4,82


In [119]:
X_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

In [120]:
X_train.shape

(584, 11)

In [121]:
x_test.shape

(146, 11)

In [122]:
# iniciamos el método de Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [123]:
y_pred_test = pd.DataFrame(lr.predict(x_test))
y_pred_test.head()

Unnamed: 0,0
0,2795.0
1,2355.0
2,885.0
3,227.0
4,1100.0


In [124]:
y_pred_test['reales'] = y_test

In [125]:
y_pred_test

Unnamed: 0,0,reales
0,2795.0,
1,2355.0,
2,885.0,120.0
3,227.0,
4,1100.0,
...,...,...
141,163.0,
142,244.0,
143,1298.0,
144,773.0,


In [126]:
y_pred_test = lr.predict(x_test)
y_pred_train = lr.predict(X_train)

In [127]:
def metricas(y_test, y_train, y_test_pred, y_train_pred): 
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), 
                          metrics.mean_absolute_error(y_train, y_train_pred)], 
                  'MSE': [metrics.mean_squared_error(y_test, y_test_pred), 
                          metrics.mean_squared_error(y_train, y_train_pred)], 
                  'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), 
                           np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))], 
                  'R2': [metrics.r2_score(y_test, y_test_pred), 
                         metrics.r2_score(y_train, y_train_pred)], "set": ["test", "train"]} 
    df = pd.DataFrame(resultados) 
    return df 

In [128]:
results = metricas(y_test, y_train, y_pred_test, y_pred_train)
results

Unnamed: 0,MAE,MSE,RMSE,R2,set
0,4.149375e-13,2.637121e-25,5.135291e-13,1.0,test
1,3.691914e-13,2.255727e-25,4.749449e-13,1.0,train
