In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

# Names for the 14 columns of the headerless housing file, in file order.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# Fields are separated by runs of whitespace and the file has no header row.
boston = pd.read_csv(
    'housing.csv',
    sep=r"\s+",
    header=None,
    names=column_names,
)
# Preview the first five rows.
boston.head()

In [None]:
# Bind a second name to the same DataFrame object (this is an alias, not a copy).
dataset=boston

In [None]:
# Relabel the 'MEDV' column as 'Price'; the frame is modified in place.
dataset.rename({'MEDV': 'Price'}, axis='columns', inplace=True)
# Preview the first five rows with the new column label.
dataset.head(5)

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
dataset.info()

In [None]:
# Descriptive statistics for each column.
dataset.describe()

In [None]:
# Count the missing entries in each column.
dataset.isna().sum()

In [None]:
## Exploratory data analysis
## For a regression problem, inspecting the correlations is very important. The Pearson correlation is computed with .corr()
# If two independent variables are highly correlated, one of them can be dropped to avoid multicollinearity
# (original note: "<95%"; presumably a ~0.95 correlation threshold - confirm with the author)
dataset.corr()

In [None]:
import seaborn as sns

In [None]:
# Pairwise plots across the columns of the frame.
sns.pairplot(dataset)

In [None]:
# CRIM on the horizontal axis against Price on the vertical axis.
plt.scatter(x=dataset['CRIM'], y=dataset['Price'])
plt.xlabel('crime rate')
plt.ylabel('Price')

In [None]:
# RM on the horizontal axis against Price on the vertical axis.
plt.scatter(dataset['RM'], dataset['Price'])
# The x axis shows the RM column, so it must not be labelled 'crime rate'
# (that label belongs to the CRIM plot).
plt.xlabel('average number of rooms (RM)')
plt.ylabel('Price')

In [None]:
# Scatter of RM against Price with a fitted regression line.
sns.regplot(x='RM', y='Price', data=dataset)

In [None]:
# Scatter of LSTAT against Price with a fitted regression line.
sns.regplot(x='LSTAT', y='Price', data=dataset)

In [None]:
# Split the frame into predictors and target.
# Predictors: every column except the last one.
X = dataset[dataset.columns[:-1]]
# Target: the 'Price' column.
y = dataset.loc[:, 'Price']

In [None]:
# Display the target series.
y

In [None]:
## Train test Split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Display the training features.
X_train

In [None]:
# Display the test features.
X_test

### Standardize the dataset
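La estandarización del dataset se hace porque, cuando una regresión se ajusta con descenso de gradiente, encontrar el mínimo global es más fácil
si todas las características están en la misma escala. (Nota: `LinearRegression` de scikit-learn se ajusta por mínimos cuadrados, no por descenso de gradiente; aquí el escalado sirve sobre todo para que los coeficientes sean comparables entre sí.)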



In [None]:
from sklearn.preprocessing import StandardScaler
# Create the (still unfitted) scaler that later cells fit and apply.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training split, then replace that split with its scaled version.
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# Scale the test split with the statistics already learned from the training
# split. Using fit_transform here would refit the scaler on the test data:
# that leaks test-set statistics into the evaluation and leaves `scaler`
# fitted to X_test, so every later scaler.transform(...) call would be
# inconsistent with the scaling the model was trained on.
X_test = scaler.transform(X_test)

In [None]:
# Display the training features after scaling.
X_train

In [None]:
# Display the test features after scaling.
X_test

## Model training

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Create the linear regression estimator with its default settings.
regression= LinearRegression()

In [None]:
# Fit the model on the scaled training features and the training targets.
regression.fit(X_train, y_train)

In [None]:
# Show the fitted coefficients (the intercept is shown in the next cell)
regression.coef_

In [None]:
# Show the fitted intercept term.
regression.intercept_

In [None]:
# Settings (hyperparameters) the estimator was configured with
regression.get_params()

In [None]:
## Predict on the scaled test features and display the predictions
reg_pred= regression.predict(X_test)
reg_pred

In [None]:
# Scatter of actual test targets (x) against model predictions (y).
# A roughly linear pattern indicates that the model is working well.
plt.scatter(y_test, reg_pred)
# Both axes are prices, so label them to make the figure readable on its own.
plt.xlabel('Actual price')
plt.ylabel('Predicted price')

In [None]:
## Errors = residuals
# Computed here as prediction minus actual; note this is the opposite sign of
# the more common (actual - predicted) definition.
residual = reg_pred - y_test
residual

In [None]:
## Plot the residuals as a kernel density estimate; the author observes an
## approximately normal distribution, which is a good sign
sns.displot(residual, kind='kde')

In [None]:
# Scatter of predictions (x) against residuals (y).
plt.scatter(reg_pred, residual)
# Label the axes so the figure can be read without the code.
plt.xlabel('Predicted price')
plt.ylabel('Residual')

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


In [None]:
# Mean absolute error between the test targets and the predictions.
mean_absolute_error(y_test, reg_pred)

In [None]:
# Mean squared error between the test targets and the predictions.
mean_squared_error(y_test, reg_pred)

In [None]:
# Root mean squared error: the square root of the MSE.
np.sqrt(mean_squared_error(y_test, reg_pred))

## R squared and adjusted R squared
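El R cuadrado se interpreta como la proporción de la variabilidad de la variable dependiente que puede ser explicada por las variables independientes (predictores) del modelo de regresión. Normalmente varía entre 0 y 1, aunque sobre datos de prueba puede ser negativo si el modelo predice peor que la media.
#### Fórmula: 
$$R^2 = 1 - \frac{SSR}{SST}$$
donde $SSR$ es la suma de cuadrados de los residuos y $SST$ es la suma total de cuadrados.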

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R-squared of the predictions against the test targets; kept in a variable and displayed.
r_square=r2_score(y_test, reg_pred)
r_square

### Predicción con datos nuevos

In [None]:
# Turn the feature DataFrame into a NumPy array.
# NOTE(review): this rebinds `boston`, which earlier referred to the loaded DataFrame.
boston = X.to_numpy()

In [None]:
# To get a prediction for one sample, it must be reshaped into a two-dimensional
# single-row array and standardized with the fitted scaler.
# Only the transformed sample is evaluated here: a bare reshape expression in the
# middle of a cell is computed and thrown away, so it is not repeated.
scaler.transform(boston[0].reshape(1,-1))

In [None]:
# Show the first sample reshaped into a single-row 2-D array.
boston[0].reshape(1,-1)

In [None]:
# Standardize the first sample with the scaler and predict its price with the fitted model.
regression.predict(scaler.transform(boston[0].reshape(1,-1)))

### Pickling the model file for deployment

In [None]:
import pickle

In [None]:
# Serialize the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): predictions also require the fitted `scaler`, which is not saved here.
with open('regModel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# Reload the model from disk, closing the file handle once loading finishes.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('regModel.pkl', 'rb') as model_file:
    pickle_model = pickle.load(model_file)

In [None]:
# Check the reloaded model by predicting on the same standardized first sample.
pickle_model.predict(scaler.transform(boston[0].reshape(1,-1)))