## Fetch Data

In [None]:
import os
import random
import tarfile
import urllib
import urllib.request

# Seed Python's built-in RNG. NOTE: this does not seed NumPy, which pandas
# draws from in DataFrame.sample(); pass random_state=random_seed there.
random_seed = 32
random.seed(random_seed)

# Remote location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files; created if it does not exist.

    Raises:
        urllib.error.URLError: If the download fails.
        tarfile.TarError: If the downloaded file cannot be read or extracted.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters exist on this Python: reject members that
            # would land outside housing_path (absolute paths, "..", links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [None]:
fetch_housing_data()

## Load Data

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

## Analyzing the Data

In [None]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Playing with Pandas

In [None]:
type(housing)

pandas.core.frame.DataFrame

In [None]:
housing.shape

(20640, 10)

### sample, head, and tail

In [None]:
# random_state makes the sampled rows repeatable across runs; random.seed()
# alone has no effect here because pandas samples with NumPy's RNG.
housing.sample(5, random_state=random_seed)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
6940,-118.1,33.99,35.0,1326.0,272.0,933.0,267.0,3.4306,162500.0,<1H OCEAN
12196,-117.43,33.55,8.0,446.0,62.0,188.0,68.0,9.4356,465600.0,<1H OCEAN
16343,-121.33,38.04,15.0,2903.0,440.0,1325.0,423.0,4.5179,145600.0,INLAND
14616,-117.17,32.8,20.0,2827.0,554.0,1822.0,536.0,3.4706,157600.0,NEAR OCEAN
17123,-122.13,37.46,37.0,1576.0,334.0,1385.0,323.0,2.5294,159400.0,NEAR BAY


### Selecting only specific columns

In [None]:
housing["population"]

0         322.0
1        2401.0
2         496.0
3         558.0
4         565.0
          ...  
20635     845.0
20636     356.0
20637    1007.0
20638     741.0
20639    1387.0
Name: population, Length: 20640, dtype: float64

In [None]:
type(housing["population"])

pandas.core.series.Series

In [None]:
housing_pop_inc = housing[["population", "median_income"]]
housing_pop_inc

Unnamed: 0,population,median_income
0,322.0,8.3252
1,2401.0,8.3014
2,496.0,7.2574
3,558.0,5.6431
4,565.0,3.8462
...,...,...
20635,845.0,1.5603
20636,356.0,2.5568
20637,1007.0,1.7000
20638,741.0,1.8672


In [None]:
type(housing_pop_inc)

pandas.core.frame.DataFrame

### Selecting elements by criteria

In [None]:
housing["housing_median_age"] < 30

0        False
1         True
2        False
3        False
4        False
         ...  
20635     True
20636     True
20637     True
20638     True
20639     True
Name: housing_median_age, Length: 20640, dtype: bool

In [None]:
housing[housing["housing_median_age"] < 30]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
50,-122.27,37.82,21.0,896.0,453.0,735.0,438.0,0.9218,171900.0,NEAR BAY
59,-122.29,37.82,2.0,158.0,43.0,94.0,57.0,2.5625,60000.0,NEAR BAY
70,-122.29,37.81,26.0,768.0,152.0,392.0,127.0,1.7719,82500.0,NEAR BAY
74,-122.29,37.81,20.0,835.0,161.0,290.0,133.0,2.4830,137500.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


### groupby

In [None]:
housing.groupby("ocean_proximity").mean()

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
ocean_proximity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
<1H OCEAN,-118.847766,34.560577,29.279225,2628.343586,546.539185,1520.290499,517.744965,4.230682,240084.285464
INLAND,-119.73299,36.731829,24.271867,2717.742787,533.881619,1391.046252,477.447565,3.208996,124805.392001
ISLAND,-118.354,33.358,42.4,1574.6,420.4,668.0,276.6,2.74442,380440.0
NEAR BAY,-122.260694,37.801057,37.730131,2493.58952,514.182819,1230.317467,488.616157,4.172885,259212.31179
NEAR OCEAN,-119.332555,34.738439,29.347254,2583.700903,538.615677,1354.008653,501.244545,4.005785,249433.977427


In [None]:
housing.groupby("ocean_proximity").count()

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
ocean_proximity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
<1H OCEAN,9136,9136,9136,9136,9034,9136,9136,9136,9136
INLAND,6551,6551,6551,6551,6496,6551,6551,6551,6551
ISLAND,5,5,5,5,5,5,5,5,5
NEAR BAY,2290,2290,2290,2290,2270,2290,2290,2290,2290
NEAR OCEAN,2658,2658,2658,2658,2628,2658,2658,2658,2658


### sum by columns or rows

In [None]:
housing.sum()

longitude                                                    -2467918.7
latitude                                                      735441.62
housing_median_age                                             591119.0
total_rooms                                                  54402150.0
total_bedrooms                                               10990309.0
population                                                   29421840.0
households                                                   10310499.0
median_income                                                79890.6495
median_house_value                                         4269504061.0
ocean_proximity       NEAR BAYNEAR BAYNEAR BAYNEAR BAYNEAR BAYNEAR B...
dtype: object

In [None]:
# Row-wise totals over the numeric columns only. Without numeric_only=True the
# string column ocean_proximity triggers a FutureWarning on older pandas and a
# TypeError on pandas >= 2.0.
housing.sum(axis=1, numeric_only=True)

  housing.sum(axis=1)


0        454021.9752
1        370188.9414
2        354404.8674
3        343559.2431
4        344902.4462
            ...     
20635     81258.9503
20636     78355.8368
20637     96415.9100
20638     87996.9772
20639     94654.5186
Length: 20640, dtype: float64

### isna

In [None]:
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

### info

In [None]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


### describe

In [None]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### value_counts

In [None]:
housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### columns

In [None]:
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')