In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import time

In [9]:
# Load the startups table from the Parquet file under ../data/clean
# (the path is relative, so it depends on the notebook's working directory).
dataset_path = '../data/clean/50_Startups.parquet'
dataset = pd.read_parquet(dataset_path)

In [10]:
# Show the column labels of the loaded frame so the feature/target lists
# defined next can be checked against them.
dataset.columns

Index(['RND', 'ADMIN', 'MRK_S', 'STATE', 'PROFIT'], dtype='object')

In [11]:
# Column roles for the modelling cells.
# Numeric inputs: passed to the numeric scaler in the preprocessing step.
numerical_features = [
    'RND',
    'ADMIN',
    'MRK_S',
]
# Categorical inputs: passed to the categorical encoder.
categorical_features = [
    'STATE',
]
# Column to predict; kept as a one-element list, so selecting it from the
# frame yields a single-column DataFrame rather than a Series.
target = [
    'PROFIT',
]

In [12]:
# Feature matrix: every column except the target.  `columns=` already names
# the axis being dropped from, so no separate `axis` argument is passed.
X = dataset.drop(columns=target)
# `target` is a list, so this selection is a single-column DataFrame
# (not a Series); the training cell squeezes it before fitting.
y = dataset[target]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Candidate models, compared in this order by the evaluation loop.
# NOTE(review): SVR() runs with default hyperparameters and the target is not
# scaled before fitting - presumably why its MSE is far above the others in
# the results table; confirm before drawing conclusions about SVR.
regressors = [LinearRegression(), SVR()]
# The tree-based models get a fixed seed so repeated runs give the same fit.
regressors += [
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(n_estimators=100, random_state=0),
]

In [None]:
# Preprocessing candidates; each one is paired with every regressor.
# Categorical encoder: one-hot with the first level dropped, and categories
# unseen during fit are ignored instead of raising.
cat_transformers = [
    OneHotEncoder(handle_unknown='ignore', drop='first'),
]
# Numeric scalers to compare against each other.
num_transformers = [
    StandardScaler(),
    MinMaxScaler(),
]

In [16]:
# Fit and score every (regressor, numeric scaler, categorical encoder)
# combination on the same train/test split; one result row per combination.
results = []

for reg in regressors:
    for num_tr in num_transformers:
        for cat_tr in cat_transformers:

            # Scale the numeric columns and encode the categorical ones;
            # any column not listed in either group is dropped.
            preprocessor = ColumnTransformer([
                ('num', num_tr, numerical_features),
                ('cat', cat_tr, categorical_features)
            ], remainder='drop')

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('regressor', reg)
            ])

            # perf_counter() is a monotonic, high-resolution clock intended
            # for timing short intervals.  time.time() is a wall clock: it can
            # jump if the system clock is adjusted and may be too coarse for
            # fits that take around a hundredth of a second.
            start = time.perf_counter()
            # squeeze() collapses the single-column target frame to 1-D,
            # which is the target shape the regressors are fit on.
            pipeline.fit(X_train, y_train.squeeze())
            end = time.perf_counter()

            # Score on the held-out rows only.
            y_pred = pipeline.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)

            results.append({
                'model': reg.__class__.__name__,
                'num_transformer': num_tr.__class__.__name__,
                'cat_transformer': cat_tr.__class__.__name__,
                'mse': mse,
                'train_time': round(end - start, 2)  # seconds
            })

# Build the comparison table and print it, lowest MSE first.
# Note: set_option changes float formatting for the whole session, not just
# for this table.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='mse'))

                   model num_transformer cat_transformer           mse  \
5  DecisionTreeRegressor    MinMaxScaler   OneHotEncoder   35225563.35   
4  DecisionTreeRegressor  StandardScaler   OneHotEncoder   35225563.35   
7  RandomForestRegressor    MinMaxScaler   OneHotEncoder   38318880.21   
6  RandomForestRegressor  StandardScaler   OneHotEncoder   38318880.21   
1       LinearRegression    MinMaxScaler   OneHotEncoder   83502864.03   
0       LinearRegression  StandardScaler   OneHotEncoder   83502864.03   
2                    SVR  StandardScaler   OneHotEncoder 1483093113.82   
3                    SVR    MinMaxScaler   OneHotEncoder 1483260464.42   

   train_time  
5        0.01  
4        0.01  
7        0.18  
6        0.24  
1        0.01  
0        0.01  
2        0.01  
3        0.01  


The Decision Tree Regressor achieved the best results (lowest MSE).