In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from utils.utils import find_best_hyperparameters

In [2]:
import warnings
#suppress warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the startups table from the CSV file (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer="50_Startups.csv")

In [7]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column so the dummy columns are not perfectly collinear.
dataset = pd.get_dummies(data=dataset, drop_first=True)

In [9]:
# Summarise column names, non-null counts and dtypes to check the encoded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   Profit           50 non-null     float64
 4   State_Florida    50 non-null     bool   
 5   State_New York   50 non-null     bool   
dtypes: bool(2), float64(4)
memory usage: 1.8 KB


In [11]:
# Preview the first five rows of the encoded frame.
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,False,True
1,162597.7,151377.59,443898.53,191792.06,False,False
2,153441.51,101145.55,407934.54,191050.39,True,False
3,144372.41,118671.85,383199.62,182901.99,False,True
4,142107.34,91391.77,366168.42,166187.94,True,False


In [9]:
# Feature matrix: the three spend columns plus the two State dummy columns.
independent_vars = dataset.loc[
    :,
    [
        "R&D Spend",
        "Administration",
        "Marketing Spend",
        "State_Florida",
        "State_New York",
    ],
]

In [11]:
# Regression target as a 1-D Series. scikit-learn regressors such as SVR
# expect y of shape (n_samples,); selecting with a list (dataset[["Profit"]])
# yields a single-column DataFrame, i.e. a column vector that scikit-learn
# flattens with a DataConversionWarning on every fit.
dependent_var = dataset["Profit"]

In [13]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    independent_vars,
    dependent_var,
    test_size=0.30,
    random_state=0,
)

In [15]:
# Hyperparameter grid handed to find_best_hyperparameters: candidate values
# for SVR's regularisation strength C and for its kernel.
param_dict = dict(
    C=[10, 100, 500, 1000, 2000, 3000],
    kernel=["linear", "poly", "rbf", "sigmoid"],
)

In [17]:
def build_regressor(combo):
    """Return a pipeline that standardises the features, then fits an SVR built from ``combo``."""
    return make_pipeline(StandardScaler(), SVR(**combo))


def report_combo(combo):
    """Print one CSV row (C, kernel, r_score) for an evaluated combination."""
    print(f'{combo["C"]},{combo["kernel"]},{combo["r_score"]}')


# CSV header for the per-combination rows printed by report_combo.
print("C,kernel,r_score")

# NOTE(review): x_test/y_test are passed to the search itself, so the winning
# combination appears to be chosen on the same split used to report its
# score -- confirm against find_best_hyperparameters; the best r_score is
# likely optimistic if so.
best_combo = find_best_hyperparameters(
    param_dict,
    x_train, y_train, x_test, y_test,
    create_regressor_callback=build_regressor,
    print_combo_callback=report_combo,
)

print("\nBest combination:")
print(f'C={best_combo["C"]}, kernel={best_combo["kernel"]}, r_score={best_combo["r_score"]}')
    

C,kernel,r_score
10,linear,-0.03964494678192798
10,poly,-0.05366720512712608
10,rbf,-0.05680759285862336
10,sigmoid,-0.05471958332940319
100,linear,0.10646819600577351
100,poly,-0.019802139315272305
100,rbf,-0.05072602278128757
100,sigmoid,-0.03045351486430925
500,linear,0.5928977271145746
500,poly,0.11468480742657639
500,rbf,-0.024323348197438532
500,sigmoid,0.07057214489673913
1000,linear,0.7802839882154124
1000,poly,0.26616370931646915
1000,rbf,0.0067683444800727965
1000,sigmoid,0.18506861974160804
2000,linear,0.8767721687716041
2000,poly,0.4810028155606567
2000,rbf,0.06751554270553017
2000,sigmoid,0.39706528684272135
3000,linear,0.895674469433492
3000,poly,0.6370064223754037
3000,rbf,0.12322756620227582
3000,sigmoid,0.5913630209426107

Best combination:
C=3000, kernel=linear, r_score=0.895674469433492
