# Machine Learning End-to-End

In [82]:
import os
import tarfile
from six.moves import urllib
# Base URL of the repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Raw string: written as a plain literal, the backslash followed by "h" is an
# invalid escape sequence that newer Python versions warn about. The value of
# the constant is unchanged.
HOUSING_PATH = r"DatasetsML\housing"
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the extracted
            files. It is created when it does not exist yet.

    The archive is downloaded again on every call.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter supports
            # extraction filters, reject members that would escape housing_path.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

# Download and extract the dataset using the default URL and target directory.
fetch_housing_data()

In [83]:
import pandas as pd
# The default is a raw string so the backslash is kept literally instead of
# being parsed as an (invalid) escape sequence; the default value is unchanged.
def load_housing_data(housing_path=r"DatasetsML\housing"):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [84]:
# Load the CSV through load_housing_data() (default directory) and preview the first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [85]:
# Column dtypes and non-null counts. In the recorded output total_bedrooms has
# 20433 non-null values out of 20640 rows, i.e. it contains missing values.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.5+ MB


In [86]:
# Impute the missing total_bedrooms values with the column median.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split further down.
median = housing["total_bedrooms"].median()
# Assign the result back to the column. Calling fillna(..., inplace=True) on the
# selected column is chained assignment: pandas warns about it, and with
# Copy-on-Write enabled it no longer updates `housing` at all.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [87]:
# Re-check the non-null counts after the median fill of total_bedrooms.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.5+ MB


In [88]:
# Count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [89]:
from sklearn.preprocessing import OneHotEncoder 
import numpy as np
# One-hot encode ocean_proximity. reshape(-1, 1) turns the column into a
# single-column 2-D array before it is handed to the encoder.
encoder = OneHotEncoder()
cat = np.array(housing["ocean_proximity"]).reshape(-1, 1)
# The encoder output is converted straight to a dense array with .toarray().
cat_1hot = encoder.fit_transform(cat).toarray()



In [90]:
# The encoding step works on a copy of the column: in the recorded output
# `housing` still has ocean_proximity as an object column.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.5+ MB


In [91]:
# Remove the raw categorical column; its one-hot encoding lives in cat_1hot.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell
# re-runnable: a second run no longer raises KeyError for the missing column.
housing = housing.drop(columns=["ocean_proximity"], errors="ignore")


In [92]:
# Wrap the dense one-hot array in a DataFrame (default integer column labels)
# and inspect its structure.
pdCat1hot = pd.DataFrame(cat_1hot)
pdCat1hot.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       20640 non-null  float64
 1   1       20640 non-null  float64
 2   2       20640 non-null  float64
 3   3       20640 non-null  float64
 4   4       20640 non-null  float64
dtypes: float64(5)
memory usage: 806.3 KB


In [93]:
# Join the numeric columns and the one-hot columns side by side.
# ignore_index=True relabels the resulting columns 0..n-1, so the original
# column names are not carried over into X.
X = pd.concat(
    objs=[housing, pdCat1hot],
    axis="columns",
    ignore_index=True,
)


In [94]:
# Preview the combined table; its columns are labelled by position (0-13 in the recorded output).
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.0,0.0,0.0,1.0,0.0


In [109]:

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable.
train_set, test_set = train_test_split(X, random_state=42, test_size=0.2)
print(f"{len(train_set)} train + {len(test_set)} test")
train_set

16512 train + 4128 test


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,0.0,0.0,0.0,0.0,1.0
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,0.0,0.0,0.0,0.0,1.0
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,0.0,0.0,0.0,0.0,1.0
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,0.0,0.0,0.0,0.0,1.0
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,229200.0,1.0,0.0,0.0,0.0,0.0
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,97800.0,0.0,1.0,0.0,0.0,0.0
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,222100.0,1.0,0.0,0.0,0.0,0.0
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,1.0,0.0,0.0,0.0,0.0


In [104]:
# Column 8 of X holds median_house_value (the last column of `housing` before
# the one-hot columns were appended); it is the regression target.
y_train = train_set[8]
y_test = test_set[8]
# Every other column is used as a model input.
x_train = train_set.drop(columns=8)
x_test = test_set.drop(columns=8)

In [97]:
# Preview the test-set features.
# NOTE(review): the recorded output below shows 435.0 in every row of column 4,
# and this cell's execution count (97) precedes the split cell's (104), so the
# output appears to come from an earlier kernel state. Re-run the notebook top
# to bottom to refresh it.
x_test.head()


Unnamed: 0,0,1,2,3,4,5,6,7,9,10,11,12,13
20046,-119.01,36.06,25.0,1505.0,435.0,1392.0,359.0,1.6812,0.0,1.0,0.0,0.0,0.0
3024,-119.46,35.14,30.0,2943.0,435.0,1565.0,584.0,2.5313,0.0,1.0,0.0,0.0,0.0
15663,-122.44,37.8,52.0,3830.0,435.0,1310.0,963.0,3.4801,0.0,0.0,0.0,1.0,0.0
20484,-118.72,34.28,17.0,3051.0,435.0,1705.0,495.0,5.7376,1.0,0.0,0.0,0.0,0.0
9814,-121.93,36.62,34.0,2351.0,435.0,1063.0,428.0,3.725,0.0,0.0,0.0,0.0,1.0


# Training on the Training Set and Evaluating on the Test Set

In [98]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression model on the training features, then predict house
# values for the held-out test features.
lin_reg = LinearRegression().fit(x_train, y_train)
resultado = lin_reg.predict(x_test)


In [99]:
# Show the predictions made for the test set.
resultado

array([ 54055.44889898, 124225.33893718, 255489.37949165, ...,
       439180.98341181, 120797.5524062 , 183386.04993584])

In [100]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the test-set predictions (in squared units of the target).
print(mean_squared_error(y_true=y_test, y_pred=resultado))

4908476721.156625


In [101]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the test-set predictions (same units as the target).
print(mean_absolute_error(y_true=y_test, y_pred=resultado))

50670.73824097177
