In [84]:
DF_PATH = "../data/processed/01_preprocessed_df.pkl"
EXPORT_PATH = "../data/processed/02_preprocessed_df.pkl"


In [37]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


In [38]:
df = pd.read_pickle(DF_PATH)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_house,bedrooms_ratio,people_per_house
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
 10  rooms_per_house     20640 non-null  float64
 11  bedrooms_ratio      20433 non-null  float64
 12  people_per_house    20640 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


-----------

### handiling missing values

In [14]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
rooms_per_house         0
bedrooms_ratio        207
people_per_house        0
dtype: int64

In [36]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

Separating out the numerical attributes to use the `"median"` strategy (as it cannot be calculated on text attributes like `ocean_proximity`):

In [39]:
numeric_df = df.select_dtypes(include=[np.number])

In [40]:
imputer.fit(numeric_df)

In [41]:
imputer.statistics_

array([-1.19569704e+02,  3.56318614e+01,  2.86394864e+01,  2.63576308e+03,
        5.37870553e+02,  1.42547674e+03,  4.99539680e+02,  3.87067100e+00,
        2.06855817e+05,  5.42899974e+00,  2.13038830e-01,  3.07065516e+00])

In [42]:
numeric_df.median().values

array([-1.18490000e+02,  3.42600000e+01,  2.90000000e+01,  2.12700000e+03,
        4.35000000e+02,  1.16600000e+03,  4.09000000e+02,  3.53480000e+00,
        1.79700000e+05,  5.22912879e+00,  2.03162434e-01,  2.81811565e+00])

The imputer has simply computed the median of each attribute and stored the result
in its statistics_ instance variable

Now you can use this “trained” imputer to transform the training set by replacing
missing values by the learned medians

In [43]:
X = imputer.transform(numeric_df)

In [44]:
new_df = pd.DataFrame(X, columns=numeric_df.columns, index=numeric_df.index)
new_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_house,bedrooms_ratio,people_per_house
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,0.172096,2.181467


In [45]:
new_df.shape

(20640, 12)

In [46]:
new_df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
rooms_per_house       0
bedrooms_ratio        0
people_per_house      0
dtype: int64

-----------

### drop outliers

In [47]:
from sklearn.ensemble import IsolationForest

isolation_forest = IsolationForest(max_samples=100, random_state=42)
pred_outliers = isolation_forest.fit_predict(X)

In [48]:
pred_outliers

array([-1, -1, -1, ...,  1,  1,  1])

In [49]:
new_df = new_df.iloc[pred_outliers == 1]

In [50]:
new_df.shape

(17514, 12)

As we see that our data drop from 20640 to 17514 rows.

--------

### handiling catergories

In [51]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17514 entries, 3 to 20639
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           17514 non-null  float64
 1   latitude            17514 non-null  float64
 2   housing_median_age  17514 non-null  float64
 3   total_rooms         17514 non-null  float64
 4   total_bedrooms      17514 non-null  float64
 5   population          17514 non-null  float64
 6   households          17514 non-null  float64
 7   median_income       17514 non-null  float64
 8   median_house_value  17514 non-null  float64
 9   rooms_per_house     17514 non-null  float64
 10  bedrooms_ratio      17514 non-null  float64
 11  people_per_house    17514 non-null  float64
dtypes: float64(12)
memory usage: 1.7 MB


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
 10  rooms_per_house     20640 non-null  float64
 11  bedrooms_ratio      20433 non-null  float64
 12  people_per_house    20640 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


In [53]:
cat = df.select_dtypes(include=['object'])

> One issue with `LabelEncoder()` representation is that ML algorithms will assume that two nearby
values are more similar than two distant values. Obviously this is not the case (for
example, categories 0 and 4 are more similar than categories 0 and 1).

To fix this issue, a common solution is to create one binary attribute per category: one attribute
equal to 1 when the category is “<1H OCEAN” (and 0 otherwise), another attribute
equal to 1 when the category is “INLAND” (and 0 otherwise), and so on. This is
called one-hot encoding, because only one attribute will be equal to 1 (hot), while the
others will be 0 (cold).


In [60]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(cat)

In [61]:
housing_cat_1hot

<20640x5 sparse matrix of type '<class 'numpy.float64'>'
	with 20640 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [62]:
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

Alternatively, you can set `sparse=False` when creating the `OneHotEncoder`:

In [63]:
cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(cat)
housing_cat_1hot



array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [64]:
housing_cat_1hot.shape

(20640, 5)

In [73]:
df_output = pd.DataFrame(housing_cat_1hot,
                         columns=cat_encoder.get_feature_names_out(),
                         index=cat.index)

In [74]:
df_output.head()

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [76]:
df_output.shape

(20640, 5)

In [77]:
data = pd.concat([new_df, df_output], axis=1)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_house,bedrooms_ratio,people_per_house,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,0.184458,2.547945,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,0.172096,2.181467,0.0,0.0,0.0,1.0,0.0
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,4.761658,0.231774,2.139896,0.0,0.0,0.0,1.0,0.0
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,4.931907,0.192899,2.128405,0.0,0.0,0.0,1.0,0.0
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,4.797527,0.221327,1.788253,0.0,0.0,0.0,1.0,0.0


In [78]:
data.shape

(20640, 17)

> we 'll need to drop the NAs from the data which created from the difference in rows in our new_df and df_output

In [79]:
data.isnull().sum()

longitude                     3126
latitude                      3126
housing_median_age            3126
total_rooms                   3126
total_bedrooms                3126
population                    3126
households                    3126
median_income                 3126
median_house_value            3126
rooms_per_house               3126
bedrooms_ratio                3126
people_per_house              3126
ocean_proximity_<1H OCEAN        0
ocean_proximity_INLAND           0
ocean_proximity_ISLAND           0
ocean_proximity_NEAR BAY         0
ocean_proximity_NEAR OCEAN       0
dtype: int64

In [80]:
data1 = data.dropna()

In [81]:
data1.isnull().sum()

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
median_house_value            0
rooms_per_house               0
bedrooms_ratio                0
people_per_house              0
ocean_proximity_<1H OCEAN     0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64

In [82]:
data1.shape

(17514, 17)

In [83]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17514 entries, 3 to 20639
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   17514 non-null  float64
 1   latitude                    17514 non-null  float64
 2   housing_median_age          17514 non-null  float64
 3   total_rooms                 17514 non-null  float64
 4   total_bedrooms              17514 non-null  float64
 5   population                  17514 non-null  float64
 6   households                  17514 non-null  float64
 7   median_income               17514 non-null  float64
 8   median_house_value          17514 non-null  float64
 9   rooms_per_house             17514 non-null  float64
 10  bedrooms_ratio              17514 non-null  float64
 11  people_per_house            17514 non-null  float64
 12  ocean_proximity_<1H OCEAN   17514 non-null  float64
 13  ocean_proximity_INLAND      17514 no

----------

### EXPORT

In [85]:
pd.to_pickle(data1, EXPORT_PATH)