In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

# from sklearn.lin


In [2]:
from sklearn.preprocessing import OneHotEncoder
ge = ColumnTransformer(
    [('OneHotEncode', OneHotEncoder(), slice(0, None))], remainder='passthrough')
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                  columns=['first', 'second', 'third'])
ge.fit_transform(df)[:, slice(0, None)]

# ge.output_indices_


array([[1., 0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1.]])

In [3]:
from sklearn.svm import SVR


class CustomSVR(SVR):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit(self, X, y):
        # self.y_shape = y.shape[1]
        y_new = y.reshape(-1) if y.ndim > 1 else y
        return super().fit(X, y_new)

    def predict(self, X, *args, **kwargs):
        y_pred = super().predict(X, *args, **kwargs)
        return y_pred.reshape(-1, 1)


In [4]:
test = pd.DataFrame(data=['No', 'Yes', 'No', 'No', 'Yes',
                    'Yes', 'No', 'Yes', 'No', 'Yes'], columns=['Answer'])
label_encoder = LabelEncoder()
test['Answer'] = label_encoder.fit_transform(test[['Answer']])
test


  y = column_or_1d(y, warn=True)


Unnamed: 0,Answer
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


In [5]:
class FalseScaler:
    def __init__(self):
        pass

    def fit(self, X):
        return X

    def transform(self, X):
        return X

    def fit_transform(self, X):
        return X

    def inverse_transform(self, X):
        return X


In [6]:
class ModelManager:
    def __init__(self):
        pass

    def load_and_preprocess_data(self, data: str, drop=[], X_slice=slice(0, -1), y_slice=-1, x_label=[],
        y_label=False, columns_to_encode=[], columns_to_scale=[], scale_y=True, random_state=None, shuffle=True, drop_first=True):
        """load dataset from csv file and preprocess it

        Args:
            data (str): path to dataset csv file
            drop (list, optional): columns to drop from dataset. Defaults to [].
            X_slice (slice, optional): independent variables slice. Defaults to slice(0, -1).
            y_slice (int|slice, optional): dependent variables slice. Defaults to -1.
            x_label (list, optional): independent variables to be label encoded. Defaults to [].
            y_label (bool, optional): label encode dependent variables. Defaults to False.
            columns_to_encode (list, optional): columns to get_dummies. Defaults to [].
            columns_to_scale (list, optional): columns to normalize. Defaults to [].
            scale_y (bool, optional): normalize the dependent variables. Defaults to True.
            random_state (int, optional): train test split random state. Defaults to None.
            drop_first (bool, optional): drop the first dummy column. Defaults to True.
        """
        self.dataset = pd.read_csv(data).drop(drop, axis=1)
        self.X = self.dataset.iloc[:, X_slice]
        self.y = self.dataset.iloc[:, y_slice].values.reshape(-1, 1)

        # self.X_label_encoder= ColumnTransformer([('Label',LabelEncoder(), x_labels)], remainder='passthrough')
        # self.X = self.X_label_encoder.fit_transform(self.X)
        self.X_label_encoder = LabelEncoder()
        if x_label != []:
            for label in x_label:
                self.X[label] = self.X_label_encoder.fit_transform(
                    self.X[label])
        # self.X = self.X_label_encoder.fit_transform(self.X)
        self.y_label_encoder = LabelEncoder()
        self.y = self.y_label_encoder.fit_transform(
            self.y.ravel()).reshape(-1, 1) if y_label else self.y

        # self.column_transformer = ColumnTransformer([('OneHotEncode', OneHotEncoder(drop='first'), columns_to_encode)], remainder='passthrough')
        # self.X = self.column_transformer.fit_transform(self.X)
        if columns_to_encode != []:
            self.X = pd.get_dummies(
                self.X, columns=columns_to_encode, drop_first=drop_first)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.20, random_state=random_state,shuffle=shuffle)

        self.X_scaler = ColumnTransformer(
            [('Scaler', StandardScaler(), columns_to_scale)], remainder='passthrough')
        # self.y_scaler = ColumnTransformer([('Scaler', StandardScaler(), scale_y)], remainder='passthrough')
        # self.X_scaler = StandardScaler()
        self.y_scaler = StandardScaler() if scale_y else FalseScaler()
        self.X_train_scaled = pd.DataFrame(data=self.X_scaler.fit_transform(self.X_train), columns=self.X_train.columns)
        self.X_test_scaled = pd.DataFrame(data=self.X_scaler.transform(self.X_test), columns=self.X_test.columns)
        self.y_train_scaled = self.y_scaler.fit_transform(self.y_train)
        self.y_test_scaled = self.y_scaler.transform(self.y_test)
        self.X_tf_validation, self.X_tf_test,self.y_tf_validation, self.y_tf_test = train_test_split(self.X_test_scaled,self.y_test, test_size=0.5)
        self.y_tf_validation = self.y_scaler.transform(self.y_tf_validation)


    def fit(self, *, degree=4, kernel='rbf', random_state=None, n_estimators=5):
        from sklearn.linear_model import LinearRegression
        self.lin_reg = LinearRegression()
        self.lin_reg.fit(self.X_train_scaled, self.y_train_scaled)

        from sklearn.preprocessing import PolynomialFeatures
        self.poly_features = PolynomialFeatures(degree=degree)
        self.X_poly = self.poly_features.fit_transform(self.X_train_scaled)
        self.poly_reg = LinearRegression()
        self.poly_reg.fit(self.X_poly, self.y_train_scaled)

        self.svr_reg = CustomSVR(kernel=kernel)
        self.svr_reg.fit(self.X_train_scaled, self.y_train_scaled)

        from sklearn.tree import DecisionTreeRegressor
        self.dec_tree_reg = DecisionTreeRegressor(random_state=random_state)
        self.dec_tree_reg.fit(self.X_train, self.y_train.ravel())

        from sklearn.ensemble import RandomForestRegressor
        self.rand_forest_reg = RandomForestRegressor(
            n_estimators=n_estimators, random_state=random_state)
        self.rand_forest_reg.fit(self.X_train, self.y_train.ravel())

        import tensorflow as tf
        output_size = 1

        batch_size = 50

        input_size = 9

        max_epochs = 1000

        hidden_layer_size = 500

        self.tf_model = tf.keras.Sequential([
            # tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
            # tf.keras.layers.Dense(hidden_layer_size,activation='tanh'),
            tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
            tf.keras.layers.Dense(hidden_layer_size,activation='sigmoid'),
            tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
            tf.keras.layers.Dense(output_size),
        ])

        self.tf_model.compile(optimizer='SGD', loss='mean_squared_error', metrics=['accuracy'])
        early_stopper = tf.keras.callbacks.EarlyStopping(patience=300)

        self.tf_model.fit(
            self.X_train_scaled,
            self.y_train_scaled,
            epochs=max_epochs,
            # batch_size=batch_size,
            validation_data=(self.X_tf_validation,self.y_tf_validation),
            verbose=2,
            callbacks=[early_stopper]
            )

    def predict(self):
        self.y_lin_reg = self.y_scaler.inverse_transform(
            self.lin_reg.predict(self.X_test_scaled))
        self.y_poly_reg = self.y_scaler.inverse_transform(
            self.poly_reg.predict(self.poly_features.transform(self.X_test_scaled)))
        self.y_svr_reg = self.y_scaler.inverse_transform(
            self.svr_reg.predict(self.X_test_scaled))
        self.y_dec_tree_reg = self.dec_tree_reg.predict(
            self.X_test).reshape(-1, 1)
        self.y_rand_forest_reg = self.rand_forest_reg.predict(
            self.X_test).reshape(-1, 1)
        self.y_tf_pred_raw = self.y_scaler.inverse_transform(self.tf_model.predict(self.X_tf_test))
        # self.y_tf_pred_raw = self.tf_model.predict(self.X_test_scaled)
        

        return {
            "Linear": self.y_lin_reg,
            "Poly": self.y_poly_reg,
            "SVR": self.y_svr_reg,
            "DecisionTree": self.y_dec_tree_reg,
            "RandomForest": self.y_rand_forest_reg,
            "Tensorflow": self.y_tf_pred_raw
        }

    def r2_score(self):
        from sklearn.metrics import r2_score
        return {
            "Linear": r2_score(self.y_test, self.y_lin_reg),
            "Poly": r2_score(self.y_test, self.y_poly_reg),
            "SVR": r2_score(self.y_test, self.y_svr_reg),
            "DecisionTree": r2_score(self.y_test, self.y_dec_tree_reg),
            "RandomForest": r2_score(self.y_test, self.y_rand_forest_reg),
            "Tensorflow": r2_score(self.y_tf_test,self.y_tf_pred_raw)
        }

    def fit_predict(self, *, degree=4, kernel='rbf', random_state=None, n_estimators=5):
        self.fit(degree=degree, kernel=kernel, random_state=random_state,n_estimators=n_estimators)
        return self.predict()

    def fit_predict_r2_score(self, *, degree=4, kernel='rbf', random_state=None, n_estimators=5):
        self.fit_predict(degree=degree, kernel=kernel, random_state=random_state,n_estimators=n_estimators)
        return self.r2_score()


In [7]:
model = ModelManager()


In [8]:
model.load_and_preprocess_data("real_estate_price_size_year_view.csv", X_slice=slice(
    1, None), y_slice=0, x_label=['view'], columns_to_scale=['size','year'], scale_y=True,
    drop_first=True, shuffle=False)

model.fit_predict_r2_score(degree=3, n_estimators=10)


Epoch 1/1000
3/3 - 3s - loss: 3.6587 - accuracy: 0.0000e+00 - val_loss: 13.8159 - val_accuracy: 0.0000e+00
Epoch 2/1000
3/3 - 0s - loss: 7.4440 - accuracy: 0.0000e+00 - val_loss: 1.5922 - val_accuracy: 0.0000e+00
Epoch 3/1000
3/3 - 0s - loss: 1.1619 - accuracy: 0.0000e+00 - val_loss: 1.1259 - val_accuracy: 0.0000e+00
Epoch 4/1000
3/3 - 0s - loss: 1.0337 - accuracy: 0.0000e+00 - val_loss: 0.9521 - val_accuracy: 0.0000e+00
Epoch 5/1000
3/3 - 0s - loss: 1.0027 - accuracy: 0.0000e+00 - val_loss: 0.9402 - val_accuracy: 0.0000e+00
Epoch 6/1000
3/3 - 0s - loss: 1.0073 - accuracy: 0.0000e+00 - val_loss: 0.9196 - val_accuracy: 0.0000e+00
Epoch 7/1000
3/3 - 0s - loss: 0.9997 - accuracy: 0.0000e+00 - val_loss: 0.9115 - val_accuracy: 0.0000e+00
Epoch 8/1000
3/3 - 0s - loss: 1.0090 - accuracy: 0.0000e+00 - val_loss: 0.9418 - val_accuracy: 0.0000e+00
Epoch 9/1000
3/3 - 0s - loss: 0.9995 - accuracy: 0.0000e+00 - val_loss: 0.8986 - val_accuracy: 0.0000e+00
Epoch 10/1000
3/3 - 0s - loss: 1.0101 - accur

{'Linear': 0.9531569679869769,
 'Poly': 0.9568393125770719,
 'SVR': 0.9119796671066777,
 'DecisionTree': 0.8604753404741936,
 'RandomForest': 0.8532562462470674,
 'Tensorflow': 0.9147381963740816}

In [9]:
pd.DataFrame({"Target":model.y_tf_test.reshape(-1), "Prediction":model.y_tf_pred_raw.round(2).reshape(-1)})
# model.y_tf_pred_raw.round(2).reshape(-1)

Unnamed: 0,Target,Prediction
0,376253.808,372690.84375
1,274922.856,273561.25
2,252460.4,235281.5625
3,354512.112,365534.375
4,338078.168,324608.125
5,190909.056,218208.359375
6,282683.544,300660.1875
7,408637.816,390754.1875
8,262477.856,271558.71875
9,327252.112,362266.71875


In [10]:
pd.DataFrame({
    "Target":model.y_test.reshape(-1),
    "Linear": model.y_lin_reg.reshape(-1),
    "Poly": model.y_poly_reg.reshape(-1),
    "SVR": model.y_svr_reg.reshape(-1),
    "DecisionTree": model.y_dec_tree_reg.reshape(-1),
    "RandomForest": model.y_rand_forest_reg.reshape(-1),
})


Unnamed: 0,Target,Linear,Poly,SVR,DecisionTree,RandomForest
0,180307.216,211343.245763,209712.664043,217620.3729,225452.152,220717.2808
1,408637.816,386281.903184,381894.619793,388054.597275,362519.72,364202.152
2,190909.056,211343.245763,209712.664043,217620.3729,225452.152,220717.2808
3,282683.544,287163.704568,292252.011069,293199.965594,302000.92,287888.2744
4,303597.216,296555.839264,299741.898337,300831.153465,291236.586667,294952.577276
5,376253.808,368291.896125,365892.672062,358423.315751,393069.76,393211.0224
6,154282.128,166737.806818,148823.047213,201574.510237,168047.264,176325.7272
7,327252.112,351336.787854,347433.176692,349633.911095,334938.872,336443.7224
8,211904.536,222015.834525,217719.550608,223613.557594,211724.096,206766.0496
9,354512.112,362769.347334,365119.138327,345444.899481,368988.432,382105.2304


In [14]:
model2 = ModelManager()
model2.load_and_preprocess_data("50_Startups.csv", columns_to_encode=[
                                'State'], columns_to_scale=['R&D Spend','Administration','Marketing Spend'])
model2.fit_predict_r2_score()


Epoch 1/1000
2/2 - 2s - loss: 1.5203 - accuracy: 0.0000e+00 - val_loss: 6.8207 - val_accuracy: 0.0000e+00
Epoch 2/1000
2/2 - 0s - loss: 11.3964 - accuracy: 0.0000e+00 - val_loss: 2.4006 - val_accuracy: 0.0000e+00
Epoch 3/1000
2/2 - 0s - loss: 3.3935 - accuracy: 0.0000e+00 - val_loss: 0.4965 - val_accuracy: 0.0000e+00
Epoch 4/1000
2/2 - 0s - loss: 1.1570 - accuracy: 0.0000e+00 - val_loss: 0.4982 - val_accuracy: 0.0000e+00
Epoch 5/1000
2/2 - 0s - loss: 1.0702 - accuracy: 0.0000e+00 - val_loss: 0.5304 - val_accuracy: 0.0000e+00
Epoch 6/1000
2/2 - 0s - loss: 1.0221 - accuracy: 0.0000e+00 - val_loss: 0.5569 - val_accuracy: 0.0000e+00
Epoch 7/1000
2/2 - 0s - loss: 1.0092 - accuracy: 0.0000e+00 - val_loss: 0.5904 - val_accuracy: 0.0000e+00
Epoch 8/1000
2/2 - 0s - loss: 0.9996 - accuracy: 0.0000e+00 - val_loss: 0.5994 - val_accuracy: 0.0000e+00
Epoch 9/1000
2/2 - 0s - loss: 0.9977 - accuracy: 0.0000e+00 - val_loss: 0.6019 - val_accuracy: 0.0000e+00
Epoch 10/1000
2/2 - 0s - loss: 0.9973 - accur

{'Linear': 0.9445383839139081,
 'Poly': -5.999125426043259,
 'SVR': 0.9469090549655071,
 'DecisionTree': 0.935222871904702,
 'RandomForest': 0.8821998303714906,
 'Tensorflow': 0.9047048846373458}

In [12]:
model2.y_scaler.inverse_transform(model.svr_reg.predict(model.X_test_scaled))
model2.X_train

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
9,123334.88,108679.17,304981.62,0,0
49,0.0,116983.8,45173.06,0,0
22,73994.56,122782.75,303319.26,1,0
38,20229.59,65947.93,185265.1,0,1
10,101913.08,110594.11,229160.95,1,0
28,66051.52,182645.56,118148.2,1,0
3,144372.41,118671.85,383199.62,0,1
6,134615.46,147198.87,127716.82,0,0
36,28663.76,127056.21,201126.82,1,0
37,44069.95,51283.14,197029.42,0,0


In [13]:
model2.X_test


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
13,91992.39,135495.07,252664.93,0,0
42,23640.93,96189.63,148001.11,0,0
14,119943.24,156547.42,256512.92,1,0
30,61994.48,115641.28,91131.24,1,0
4,142107.34,91391.77,366168.42,1,0
1,162597.7,151377.59,443898.53,0,0
45,1000.23,124153.04,1903.93,0,1
40,28754.33,118546.05,172795.67,0,0
43,15505.73,127382.3,35534.17,0,1
2,153441.51,101145.55,407934.54,1,0
