In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.model_selection  import StratifiedShuffleSplit

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from predict_function import predict



house = pd.read_csv('../housing.csv')

X = house.drop('median_house_value', axis=1).copy()
y = house.median_house_value


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
(train_idx, test_idx), = split.split(X, X['ocean_proximity'])

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]


num_features = X.select_dtypes('float').columns
cat_features = X.select_dtypes('object').columns


num_pipline = Pipeline([
    ('impute', SimpleImputer()),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False))
])


final_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features)
])

final_pipeline.fit(X_train)

X_train_tr = final_pipeline.transform(X_train)
X_test_tr = final_pipeline.transform(X_test)



models = [
    ('linear_regression', LinearRegression()),
    ('SGD', SGDRegressor()),
    ('Random_forest', RandomForestRegressor()),
    ('Tree', DecisionTreeRegressor()),
    ('neighbores', KNeighborsRegressor()),
    ('SVM', SVR())
]

predict(models, X_train_tr, X_test_tr, y_train, y_test)

linear_regression
Training error: 49904.43
Training accuracy: 0.65
____________________________________________________________________________________________________
Testing error: 49650.60
Testing accuracy: 0.64

SGD
Training error: 51221.07
Training accuracy: 0.63
____________________________________________________________________________________________________
Testing error: 50912.36
Testing accuracy: 0.62

Random_forest
Training error: 11685.15
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 31574.64
Testing accuracy: 0.81

Tree
Training error: 0.00
Training accuracy: 1.00
____________________________________________________________________________________________________
Testing error: 43331.44
Testing accuracy: 0.64

neighbores
Training error: 33036.79
Training accuracy: 0.82
____________________________________________________________________________________________________
Testing er

## feature engineering

In [9]:
lat_long = X['latitude']  + X['longitude']
lat_long

0       -84.35
1       -84.36
2       -84.39
3       -84.40
4       -84.40
         ...  
20635   -81.61
20636   -81.72
20637   -81.79
20638   -81.89
20639   -81.87
Length: 20640, dtype: float64

In [10]:
lat_long.corr(y)

-0.49151967882957753

In [11]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND


In [12]:
new = X['housing_median_age'] + X['median_income']
new.corr(y)

0.21088126792906592

In [13]:
new = X['housing_median_age'] / X['median_income']
new.corr(y)

-0.320027568914684

In [14]:
(X['total_rooms'] / X['households']).corr(y)

0.15194828974145758

In [15]:
(X['total_rooms'] / X['population']).corr(y)


0.2094819690066896

In [17]:
(X['total_bedrooms'] / X['population']).corr(y)


0.06989602070580718

In [18]:
new = X['housing_median_age'] / X['population']
new.corr(y)

0.01697036683142879

In [19]:
X_train_tr

array([[ 0.62286403, -0.76422645,  1.70268675, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.56806326, -0.68937821,  0.4311512 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66271915, -0.75954843,  1.54374481, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.62286403, -0.77358248,  1.22586092, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.21072692, -1.19928181, -1.31721019, ...,  0.        ,
         0.        ,  0.        ],
       [-0.85675696,  1.05552126, -0.20461658, ...,  0.        ,
         0.        ,  0.        ]])

    every transformers required two function 
    
    fit   ---> parameters estimation

    trasform  ----> transform the data

In [20]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND


In [27]:
class Add_features:

    def __init__(self):
        self.lon = 0
        self.lat = 1
        self.hma = 2
        self.trms = 3
        self.pop = 5
        self.med_inc = 7

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        lat_long = X[:, self.lon] + X[:, self.lat]
        hma_med_inc = X[:, self.hma] / X[:, self.med_inc]
        trms_pop = X[:, self.trms] / X[:, self.pop]

        return np.c_[X, lat_long, hma_med_inc, trms_pop]
        

In [28]:
obj = Add_features()

In [30]:
obj.transform(X_train_tr).shape

(16512, 16)

In [31]:
X_train_tr.shape

(16512, 13)

In [23]:
arr = np.arange(1,21).reshape(4,5)
arr

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20]])

In [25]:
arr[:, 0]

array([ 1,  6, 11, 16])

In [26]:
arr[:, 1]


array([ 2,  7, 12, 17])

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.model_selection  import StratifiedShuffleSplit

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from predict_function import predict

from sklearn.base import BaseEstimator, TransformerMixin


class Add_features:

    def __init__(self):
        self.lon = 0
        self.lat = 1
        self.hma = 2
        self.trms = 3
        self.pop = 5
        self.med_inc = 7

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        lat_long = X[:, self.lon] + X[:, self.lat]
        hma_med_inc = X[:, self.hma] / X[:, self.med_inc]
        trms_pop = X[:, self.trms] / X[:, self.pop]

        return np.c_[X, lat_long, hma_med_inc, trms_pop]
        


house = pd.read_csv('../housing.csv')

X = house.drop('median_house_value', axis=1).copy()
y = house.median_house_value


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
(train_idx, test_idx), = split.split(X, X['ocean_proximity'])

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]


num_features = X.select_dtypes('float').columns
cat_features = X.select_dtypes('object').columns


num_pipline = Pipeline([
    ('impute', SimpleImputer()),
    ('add_feature', Add_features()),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False))
])


final_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features)
])

final_pipeline.fit(X_train)

X_train_tr = final_pipeline.transform(X_train)
X_test_tr = final_pipeline.transform(X_test)



models = [
    ('linear_regression', LinearRegression()),
    ('SGD', SGDRegressor()),
    ('Random_forest', RandomForestRegressor()),
    ('Tree', DecisionTreeRegressor()),
    ('neighbores', KNeighborsRegressor()),
    ('SVM', SVR())
]

predict(models, X_train_tr, X_test_tr, y_train, y_test)

linear_regression
Training error: 49440.07
Training accuracy: 0.65
____________________________________________________________________________________________________
Testing error: 49172.55
Testing accuracy: 0.65

SGD
Training error: 49575.57
Training accuracy: 0.65
____________________________________________________________________________________________________
Testing error: 49309.86
Testing accuracy: 0.64

Random_forest
Training error: 10962.27
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 29896.59
Testing accuracy: 0.83

Tree
Training error: 0.00
Training accuracy: 1.00
____________________________________________________________________________________________________
Testing error: 41730.08
Testing accuracy: 0.65

neighbores
Training error: 30399.42
Training accuracy: 0.84
____________________________________________________________________________________________________
Testing er

In [33]:
final_pipeline

In [34]:
final_pipeline.named_transformers_

{'num_pipeline': Pipeline(steps=[('impute', SimpleImputer()),
                 ('add_feature',
                  <__main__.Add_features object at 0x000001569D21B810>),
                 ('scale', StandardScaler())]),
 'cat_pipeline': Pipeline(steps=[('one_hot_encoder', OneHotEncoder(sparse_output=False))])}

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.model_selection  import StratifiedShuffleSplit

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from predict_function import predict

from sklearn.base import BaseEstimator, TransformerMixin


class Add_features(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.lon = 0
        self.lat = 1
        self.hma = 2
        self.trms = 3
        self.pop = 5
        self.med_inc = 7

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        lat_long = X[:, self.lon] + X[:, self.lat]
        hma_med_inc = X[:, self.hma] / X[:, self.med_inc]
        trms_pop = X[:, self.trms] / X[:, self.pop]

        return np.c_[X, lat_long, hma_med_inc, trms_pop]
        


house = pd.read_csv('../housing.csv')

X = house.drop('median_house_value', axis=1).copy()
y = house.median_house_value


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
(train_idx, test_idx), = split.split(X, X['ocean_proximity'])

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]


num_features = X.select_dtypes('float').columns
cat_features = X.select_dtypes('object').columns


num_pipline = Pipeline([
    ('impute', SimpleImputer()),
    ('add_feature', Add_features()),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False))
])


final_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features)
])

final_pipeline.fit(X_train)

X_train_tr = final_pipeline.transform(X_train)
X_test_tr = final_pipeline.transform(X_test)



models = [
    ('linear_regression', LinearRegression()),
    ('SGD', SGDRegressor()),
    ('Random_forest', RandomForestRegressor(n_jobs=-1)),
    ('Tree', DecisionTreeRegressor()),
    ('neighbores', KNeighborsRegressor()),
]

predict(models, X_train_tr, X_test_tr, y_train, y_test)

linear_regression
Training error: 49440.07
Training accuracy: 0.65
____________________________________________________________________________________________________
Testing error: 49172.55
Testing accuracy: 0.65

SGD
Training error: 49284.61
Training accuracy: 0.65
____________________________________________________________________________________________________
Testing error: 48980.88
Testing accuracy: 0.65

Random_forest
Training error: 11051.35
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 30084.93
Testing accuracy: 0.83

Tree
Training error: 0.00
Training accuracy: 1.00
____________________________________________________________________________________________________
Testing error: 41290.88
Testing accuracy: 0.66

neighbores
Training error: 30399.42
Training accuracy: 0.84
____________________________________________________________________________________________________
Testing er

In [36]:
final_pipeline

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.model_selection  import StratifiedShuffleSplit

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from predict_function import predict

from sklearn.base import BaseEstimator, TransformerMixin


class Add_features(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.lon = 0
        self.lat = 1
        self.hma = 2
        self.trms = 3
        self.pop = 5
        self.med_inc = 7

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        lat_long = X[:, self.lon] + X[:, self.lat]
        hma_med_inc = X[:, self.hma] / X[:, self.med_inc]
        trms_pop = X[:, self.trms] / X[:, self.pop]

        return np.c_[X, lat_long, hma_med_inc, trms_pop]
        


house = pd.read_csv('../housing.csv')

X = house.drop('median_house_value', axis=1).copy()
y = house.median_house_value


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
(train_idx, test_idx), = split.split(X, X['ocean_proximity'])

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]


num_features = X.select_dtypes('float').columns
cat_features = X.select_dtypes('object').columns
pass_features = ['median_income']

num_pipline = Pipeline([
    ('impute', SimpleImputer()),
    ('add_feature', Add_features()),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False))
])


final_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features),
    ('pass', 'passthrough', pass_features)
])

final_pipeline.fit(X_train)

X_train_tr = final_pipeline.transform(X_train)
X_test_tr = final_pipeline.transform(X_test)




In [6]:
X_train_tr

array([[ 0.62286403, -0.76422645,  1.70268675, ...,  0.        ,
         0.        ,  2.4583    ],
       [ 0.56806326, -0.68937821,  0.4311512 , ...,  0.        ,
         0.        ,  4.0347    ],
       [ 0.66271915, -0.75954843,  1.54374481, ...,  0.        ,
         0.        ,  2.375     ],
       ...,
       [ 0.62286403, -0.77358248,  1.22586092, ...,  0.        ,
         0.        ,  2.2396    ],
       [ 1.21072692, -1.19928181, -1.31721019, ...,  0.        ,
         0.        ,  4.3513    ],
       [-0.85675696,  1.05552126, -0.20461658, ...,  0.        ,
         0.        ,  5.7485    ]])

In [7]:
X_train_tr[:5]

array([[ 0.62286403, -0.76422645,  1.70268675, -0.20580797, -0.18713319,
        -0.28931321, -0.08195304, -0.74260439, -0.47034561,  1.49620224,
         0.01508827,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  2.4583    ],
       [ 0.56806326, -0.68937821,  0.4311512 , -0.08288557, -0.11840748,
        -0.25279895, -0.09752599,  0.08309466, -0.40900665, -0.14694896,
         0.15408696,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  4.0347    ],
       [ 0.66271915, -0.75954843,  1.54374481, -1.02210375, -1.04264972,
        -0.82137824, -1.01373439, -0.78623591, -0.35993548,  1.47844531,
        -0.98737654,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  2.375     ],
       [ 0.68762859, -0.82971865,  1.06691898, -0.29230892, -0.25585889,
        -0.32148054, -0.26363743, -0.09059337, -0.4826134 ,  0.25512973,
        -0.07809926,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        

In [2]:
house

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [3]:
house.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0
