##         Data Pre-processing


In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
housing = pd.read_csv('../datasets/housing/housing.csv')

In [4]:
#stratified suffle split 
housing['housing_category'] = pd.cut(housing['median_income'],bins=[0.,1.5,3.0,4.5,6.,np.inf],labels=[1,2,3,4,5])


#creating a split object 
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split.split(housing,housing['housing_category']):      #it split funcation returns index and not value
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

print("Training Set :",strat_train_set.head())

Training Set :        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
12655    -121.46     38.52                29.0       3873.0           797.0   
15502    -117.23     33.09                 7.0       5320.0           855.0   
2908     -119.04     35.37                44.0       1618.0           310.0   
14053    -117.13     32.75                24.0       1877.0           519.0   
20496    -118.70     34.28                27.0       3536.0           646.0   

       population  households  median_income  median_house_value  \
12655      2237.0       706.0         2.1736             72100.0   
15502      2015.0       768.0         6.3373            279600.0   
2908        667.0       300.0         2.8750             82700.0   
14053       898.0       483.0         2.2264            112500.0   
20496      1837.0       580.0         4.4964            238300.0   

      ocean_proximity housing_category  
12655          INLAND                2  
15502      NEAR OCE

In [5]:
housing_data = strat_train_set.copy()

## Data Preparation 


### Data Cleaning 

In [9]:
#seperating predicated and labels from the data
housing = housing_data.drop("median_house_value",axis=1)
label = housing_data['median_house_value'].copy()



In [10]:
label

12655     72100.0
15502    279600.0
2908      82700.0
14053    112500.0
20496    238300.0
           ...   
15174    268500.0
12661     90400.0
19263    140400.0
19140    258100.0
19773     62700.0
Name: median_house_value, Length: 16512, dtype: float64

In [11]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,housing_category
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND,2
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN,5
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND,2
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN,2
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN,4
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND,2
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN,3
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN,3


In [12]:
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
housing_category        0
dtype: int64

In [13]:
# In Total Bedroom there are missing values

#option 1 - Drop entire column (total_bedroom)
#housing.drop("total_bedrooms",axis=1)

#option 2 - Drop the rows which contain missing valiue
#housing.dropna(subset=["total_bedrooms"])

#option 3 - Fill the missing values with mean of median
median = housing["total_bedrooms"].median() 
housing["total_bedrooms"].fillna(median,inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing["total_bedrooms"].fillna(median,inplace=True)


In [14]:
housing.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
housing_category      0
dtype: int64

In [16]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           16512 non-null  float64 
 1   latitude            16512 non-null  float64 
 2   housing_median_age  16512 non-null  float64 
 3   total_rooms         16512 non-null  float64 
 4   total_bedrooms      16512 non-null  float64 
 5   population          16512 non-null  float64 
 6   households          16512 non-null  float64 
 7   median_income       16512 non-null  float64 
 8   ocean_proximity     16512 non-null  object  
 9   housing_category    16512 non-null  category
dtypes: category(1), float64(8), object(1)
memory usage: 1.3+ MB


In [20]:
#drop the housing_category column
housing.drop("housing_category",axis=1,inplace=True)

In [48]:
#housing.info()
print(list(housing))

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']


### Converting category to Numeric Value

In [23]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

housing_cat = housing[["ocean_proximity"]]
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)


In [24]:
housing_cat_encoded

array([[1.],
       [4.],
       [1.],
       ...,
       [0.],
       [0.],
       [1.]], shape=(16512, 1))

In [25]:
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

## One Hot Encoding 

In [26]:
from sklearn.preprocessing import OneHotEncoder
category_encoder = OneHotEncoder()

housing_cat_1hot = category_encoder.fit_transform(housing_cat)


In [27]:
housing_cat_1hot

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16512 stored elements and shape (16512, 5)>

In [32]:
housing_cat_1hot.toarray()


array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], shape=(16512, 5))

## Custom Transformer

In [33]:
# didn't understood


## Feature Scaling 

In [40]:
import numpy as np

def MinMaxScalar(data, min_val, max_val):
    newarray = []
    for value in data:
        result = (value - min_val) / (max_val - min_val)
        newarray.append(result)
    return np.array(newarray)

test_dataset1 = np.array([10,23,34,23,43,101])
mn = np.min(test_dataset1)
mx = np.max(test_dataset1)

newarray = MinMaxScalar(test_dataset1, min_val=mn, max_val=mx)
print(newarray)

[0.         0.14285714 0.26373626 0.14285714 0.36263736 1.        ]


### Transformation Pipelines

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


#creating a pipeline for numerical data
# 1. Fill null values using median 
# 2. Standard Scalar

num_pipeline = Pipeline([
    ('fill_null',SimpleImputer(strategy='median')),
    ('std_scalar',StandardScaler()),
])


# using the number pipeline on housing dataset

housing_num = housing.drop("ocean_proximity",axis=1)
housing_num_tr = num_pipeline.fit_transform(housing_num)

print(housing_num_tr)




[[-0.94135046  1.34743822  0.02756357 ...  0.73260236  0.55628602
  -0.8936472 ]
 [ 1.17178212 -1.19243966 -1.72201763 ...  0.53361152  0.72131799
   1.292168  ]
 [ 0.26758118 -0.1259716   1.22045984 ... -0.67467519 -0.52440722
  -0.52543365]
 ...
 [-1.5707942   1.31001828  1.53856552 ... -0.86201341 -0.86511838
  -0.36547546]
 [-1.56080303  1.2492109  -1.1653327  ... -0.18974707  0.01061579
   0.16826095]
 [-1.28105026  2.02567448 -0.13148926 ... -0.71232211 -0.79857323
  -0.390569  ]]


In [None]:
#Column Transformer - This transformation class allows handeling of both numerical and categorical value together.

from sklearn.compose import ColumnTransformer

num_attributes = list(housing_num). # These are column names of all the numerical dat columns
cat_attributes  = ['ocean_proximity']


full_pipeline = ColumnTransformer([

    ('num',num_pipeline,num_attributes),
    ('cat',OneHotEncoder(),cat_attributes),
])

#using full pipeline
housing_prepared = full_pipeline.fit_transform(housing)




In [50]:
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]], shape=(16512, 13))