# Obtención y preparación de los datos

### sklearn.datasets
https://scikit-learn.org/stable/datasets/index.html

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. Use the bundled loader when it still exists
# (older installs); otherwise rebuild an equivalent Bunch from the original
# source, following the recipe published in scikit-learn's deprecation notice.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    boston_feature_names = [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
    ]
    # The raw file starts with a 22-line text header; after that every sample
    # is wrapped over two physical lines (11 values, then 3 values).
    raw_df = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        # Even rows carry the first 11 features, odd rows the last 2 features.
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        # The third value of each odd row is the target (MEDV).
        target=raw_df.values[1::2, 2],
        feature_names=np.array(boston_feature_names),
        DESCR=(
            "Boston house-prices dataset, rebuilt from " + boston_url + "\n"
            "Features (in order): " + ", ".join(boston_feature_names) + "\n"
            "Target: MEDV"
        ),
    )

In [None]:
# Keep the dataset's bundled text description so it can be displayed below.
dataset_description = boston.DESCR

In [None]:
# print() renders the text's line breaks; a bare expression would show the
# escaped string repr instead.
print(dataset_description)

In [None]:
import pandas as pd
# Build a DataFrame from the feature matrix, naming each column after its
# feature, and preview the first rows.
df = pd.DataFrame(boston.data, columns = boston.feature_names)
df.head()

In [None]:
# (rows, columns) of the feature-only frame, before the target column is added.
df.shape

In [None]:
# Append the target as a new MEDV column (this mutates `df` in place).
# Indexing `boston.target` with `df.index` pairs each row with its target by
# position — assumes `df` still has its default 0..n-1 index.
df['MEDV'] = boston.target[df.index]
df.head()

In [None]:
# Shape again: one more column than before, now that MEDV has been added.
df.shape

In [None]:
# Per-column summary statistics of the features and the target.
df.describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Correlation matrix of every column in `df` (features and MEDV), drawn as an
# annotated heatmap on an explicitly created 14x12 figure.
sns.set(style="ticks", color_codes=True)
correlation = df.corr()
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation, annot=True, linewidths=0.5, cmap="YlGnBu", ax=ax)

### Selección de características

In [None]:
# Absolute correlation of every column with the target, keeping only the
# columns above the 0.5 threshold. MEDV itself passes the filter because its
# self-correlation is 1.
cor_target = correlation["MEDV"].abs()
relevant_features = cor_target.loc[cor_target.gt(0.5)]
relevant_features

Attribute Information (in order):

- CRIM     per capita crime rate by town
- ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS    proportion of non-retail business acres per town
- CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX      nitric oxides concentration (parts per 10 million)
- RM       average number of rooms per dwelling
- AGE      proportion of owner-occupied units built prior to 1940
- DIS      weighted distances to five Boston employment centres
- RAD      index of accessibility to radial highways
- TAX      full-value property-tax rate per \$10,000
- PTRATIO  pupil-teacher ratio by town
- B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT    % lower status of the population
- MEDV     Median value of owner-occupied homes in \$1000's

In [None]:
# Pairwise plots restricted to three features and the target
# (presumably the columns selected by the correlation filter above).
sns.pairplot(df, vars=["RM", "PTRATIO", "LSTAT","MEDV"])

# Creación del primer modelo

### sklearn.linear_model.LinearRegression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [None]:
# Target variable as a pandas Series, printed for inspection.
# Note: Y is reassigned as a 2-D array in a later cell before fitting.
Y = df["MEDV"]
print(Y)

In [None]:
from sklearn.linear_model import LinearRegression
# Predictor (LSTAT) and target (MEDV) as 2-D column arrays with one column
# each; selecting with a list of one column name keeps the 2-D shape, so no
# explicit reshape is needed.
X = df[["LSTAT"]].to_numpy()
Y = df[["MEDV"]].to_numpy()

In [None]:
# Y after the reshape: a 2-D column array instead of a Series.
print(Y)

In [None]:
# Create the estimator, fit it on the (LSTAT, MEDV) pairs, then predict on the
# very same inputs so the fitted line can be compared with the observations.
regression = LinearRegression()
regression.fit(X, Y)
y_prediccion = regression.predict(X)
print(y_prediccion)

In [None]:
# Observations and fitted regression line on an explicit Axes.
# X holds LSTAT and Y holds MEDV, so the axes are labelled accordingly and a
# legend distinguishes data from model; the figure can then be read on its own.
fig, ax = plt.subplots()
ax.scatter(X, Y, label="Datos observados")
ax.plot(X, y_prediccion, "r-", label="Regresión lineal")
ax.set(xlabel="LSTAT", ylabel="MEDV", title="MEDV vs. LSTAT")
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend();

### Obteniendo el modelo

$y = mx +b$

In [None]:
# Slope m of y = mx + b, read from the fitted model's coefficients.
m = regression.coef_
print(m)

In [None]:
# Intercept b of y = mx + b, read from the fitted model.
b = regression.intercept_
print(b)

In [None]:
# Example input (an LSTAT value) for which MEDV is estimated below.
x_propuesta = 30

In [None]:
# Evaluate the fitted line by hand: y = m * x + b.
y_obtenida = m * x_propuesta + b
print(y_obtenida)

$f(x) = -0.95004935x + 34.55384088$

Alternativa proporcionada utilizando sklearn.linear_model

In [None]:
# Same estimate through the estimator itself. predict() needs a 2-D input,
# hence the nested list wrapping the single value.
y_obtenida = regression.predict([[x_propuesta]])
print(y_obtenida)

<b>Actualización 2022:</b>
<br>
<br>Se modifica el parámetro que recibe la función <b>regression.predict(x_propuesta)</b>: ahora se pasa el mismo valor dentro de dobles corchetes, puesto que la función necesita recibir un arreglo de dos dimensiones.
<br>
<br>Quedando de la siguiente manera: <b>regression.predict([[x_propuesta]])</b>

### Evaluando el modelo

<br><b>Metrics</b> 
<br>https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
<br>
<br><b>R2_score</b>  
<br>https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score

In [None]:
from sklearn.metrics import r2_score
# R^2 score of the predictions against the true targets. Note that
# `y_prediccion` was computed on the same X used for fitting, so this measures
# goodness of fit on the training data, not performance on unseen data.
r2_score(Y,y_prediccion)