In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.model_selection  import StratifiedShuffleSplit

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from predict_function import predict

from sklearn.base import BaseEstimator, TransformerMixin


class Add_features(BaseEstimator, TransformerMixin):
    """Append three engineered columns to a 2-D numeric array.

    Columns are addressed by position. ``transform`` appends, in this order:

    1. ``X[:, lon] + X[:, lat]``
    2. ``X[:, hma] / X[:, med_inc]``
    3. ``X[:, trms] / X[:, pop]``

    Parameters
    ----------
    lon, lat, hma, trms, pop, med_inc : int
        Positional indices of the input columns used above. The defaults
        (0, 1, 2, 3, 5, 7) are the positions that were previously hard-coded.
        NOTE(review): the names suggest longitude, latitude, housing median
        age, total rooms, population and median income -- confirm against the
        column order actually fed to this transformer.

    Notes
    -----
    Denominators are not checked for zeros, so a zero in the ``med_inc`` or
    ``pop`` column produces ``inf``/``nan`` in the derived columns.
    """

    def __init__(self, lon=0, lat=1, hma=2, trms=3, pop=5, med_inc=7):
        # Each argument is stored unmodified under its own name so that
        # BaseEstimator.get_params / set_params (and therefore clone) can
        # round-trip the configuration.
        self.lon = lon
        self.lat = lat
        self.hma = hma
        self.trms = trms
        self.pop = pop
        self.med_inc = med_inc

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the three derived columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Must contain every column position configured on this instance.

        Returns
        -------
        numpy.ndarray
            The input columns followed by the three derived columns.
        """
        # Positional slicing below needs an ndarray; this also lets
        # array-likes such as DataFrames through.
        X = np.asarray(X)

        lat_long = X[:, self.lon] + X[:, self.lat]
        hma_med_inc = X[:, self.hma] / X[:, self.med_inc]
        trms_pop = X[:, self.trms] / X[:, self.pop]

        return np.c_[X, lat_long, hma_med_inc, trms_pop]
        


# --- Load data --------------------------------------------------------------
house = pd.read_csv('../housing.csv')

# drop() already returns a new frame, so no extra copy is needed.
X = house.drop('median_house_value', axis=1)
y = house.median_house_value


# --- Hold-out split ---------------------------------------------------------
# One 80/20 shuffle split, stratified on `ocean_proximity` so every category
# keeps the same proportion in the train and test parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
train_idx, test_idx = next(split.split(X, X['ocean_proximity']))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]


# --- Preprocessing ----------------------------------------------------------
num_features = X.select_dtypes('float').columns
cat_features = X.select_dtypes('object').columns


# Numeric branch: fill missing values, append engineered columns, standardise.
# Add_features addresses columns by position, so it relies on the order of
# `num_features`.
num_pipline = Pipeline([
    ('impute', SimpleImputer()),
    ('add_feature', Add_features()),
    ('scale', StandardScaler())
])

# Categorical branch: dense one-hot encoding.
cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False))
])


final_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features)
])

# Fit on the training part only; the test part is transformed with the
# statistics learned from training data.
X_train_tr = final_pipeline.fit_transform(X_train)
X_test_tr = final_pipeline.transform(X_test)


# --- Baseline model ---------------------------------------------------------
# Seeded so the reported numbers can be reproduced on a re-run.
models = [
    ('Random_forest', RandomForestRegressor(random_state=123))
]

# `predict` comes from the local `predict_function` module.
# NOTE(review): judging by the recorded output it fits each model and prints
# train/test error and score -- confirm against that module.
predict(models, X_train_tr, X_test_tr, y_train, y_test)

Random_forest
Training error: 10982.43
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 29800.95
Testing accuracy: 0.83



In [2]:
from sklearn.model_selection import KFold

In [4]:
# Five consecutive, unshuffled folds (the KFold defaults, written out).
folds = KFold(n_splits=5, shuffle=False)

In [5]:
# split() is lazy: it hands back a generator, so nothing is shown until it is
# iterated (see the loop in the next cell).
folds.split(X_train_tr)

<generator object _BaseKFold.split at 0x0000022DDE3F29E0>

In [8]:
# Print the (train_indices, test_indices) tuple generated for each fold,
# numbering the folds from 1.
for split_no, index_pair in enumerate(folds.split(X_train_tr), start=1):
    print(f"Split no: {split_no}: {index_pair}")
    print()


Split no: 1: (array([ 3303,  3304,  3305, ..., 16509, 16510, 16511]), array([   0,    1,    2, ..., 3300, 3301, 3302]))

Split no: 2: (array([    0,     1,     2, ..., 16509, 16510, 16511]), array([3303, 3304, 3305, ..., 6603, 6604, 6605]))

Split no: 3: (array([    0,     1,     2, ..., 16509, 16510, 16511]), array([6606, 6607, 6608, ..., 9905, 9906, 9907]))

Split no: 4: (array([    0,     1,     2, ..., 16509, 16510, 16511]), array([ 9908,  9909,  9910, ..., 13207, 13208, 13209]))

Split no: 5: (array([    0,     1,     2, ..., 13207, 13208, 13209]), array([13210, 13211, 13212, ..., 16509, 16510, 16511]))



In [9]:
# Inspect the preprocessed training matrix (output of final_pipeline).
X_train_tr

array([[ 0.62286403, -0.76422645,  1.70268675, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.56806326, -0.68937821,  0.4311512 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66271915, -0.75954843,  1.54374481, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.62286403, -0.77358248,  1.22586092, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.21072692, -1.19928181, -1.31721019, ...,  0.        ,
         0.        ,  0.        ],
       [-0.85675696,  1.05552126, -0.20461658, ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
# Inspect the training target; it keeps the row labels of the original frame,
# not positions 0..n-1.
y_train

5015     120900.0
3769     353600.0
4895     107500.0
7963     167100.0
19675    110700.0
           ...   
3500     179700.0
876      158000.0
5040     108600.0
15540    199600.0
16420    162500.0
Name: median_house_value, Length: 16512, dtype: float64

In [11]:
# (label, estimator) pairs passed to `predict`. The forest is seeded so the
# per-fold numbers are repeatable across runs.
models = [
    ('Random_forest', RandomForestRegressor(random_state=123))
]

In [15]:
# Manual K-fold evaluation on the training set.
# The loop variables are deliberately NOT called train_idx / test_idx: those
# names hold the hold-out split indices created earlier in the notebook, and
# reusing them here would silently overwrite that split.
for fold_no, (fold_train_idx, fold_val_idx) in enumerate(folds.split(X_train_tr), start=1):
    print(f"Split no: {fold_no}")
    tr_X = X_train_tr[fold_train_idx]
    ts_X = X_train_tr[fold_val_idx]
    # KFold yields positional indices while y_train keeps the original row
    # labels, so index the underlying array rather than the Series.
    tr_y = y_train.values[fold_train_idx]
    ts_y = y_train.values[fold_val_idx]
    predict(models, tr_X, ts_X, tr_y, ts_y)


Split no: 1
Random_forest
Training error: 11240.35
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 31086.87
Testing accuracy: 0.83

Split no: 2
Random_forest
Training error: 11335.77
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 30020.69
Testing accuracy: 0.84

Split no: 3
Random_forest
Training error: 11255.22
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 29103.38
Testing accuracy: 0.85

Split no: 4
Random_forest
Training error: 11089.58
Training accuracy: 0.98
____________________________________________________________________________________________________
Testing error: 30350.94
Testing accuracy: 0.83

Split no: 5
Random_forest
Training error: 11188.70
Training accuracy: 0.98
_____________________________

In [17]:
# Seeded so cross-validation scores are reproducible between runs.
model = RandomForestRegressor(random_state=123)

In [16]:
from sklearn.model_selection import cross_val_score

In [19]:
# 5-fold cross-validated R^2 of `model` on the preprocessed training data.
# NOTE(review): X_train_tr comes from a preprocessing pipeline fitted on all
# of X_train, so the imputer/scaler statistics have already seen every
# validation fold.
cross_val_r2_score = cross_val_score(
    estimator=model,
    X=X_train_tr,
    y=y_train,
    scoring='r2',
    cv=5,
)
cross_val_r2_score

array([0.82710337, 0.83829334, 0.85157527, 0.82860758, 0.83051546])

In [None]:
# 5-fold cross-validated mean absolute error.
# An empty `scoring` string is not a valid scorer name, so a real metric is
# given. scikit-learn scorers follow "greater is better", which is why MAE is
# exposed as 'neg_mean_absolute_error'; the sign is flipped back here.
# Stored under its own name so the R^2 scores computed above are not replaced.
cross_val_mae_score = -cross_val_score(model, X_train_tr, y_train, scoring='neg_mean_absolute_error', cv=5)
cross_val_mae_score