In [1]:
#!pip3 install optuna
#!pip3 install category-encoders
#!pip3 install xgboost

In [2]:
#pip install --upgrade category-encoders

In [3]:
import numpy as np 
import pandas as pd 
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression,ElasticNet,Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/insurance/insurance.csv


In [4]:
ins=pd.read_csv("/kaggle/input/insurance/insurance.csv")

In [5]:
encoder = OneHotEncoder()
encoder.fit(ins)

In [6]:
insurance = encoder.transform(ins)

In [7]:
features_to_scale = ['bmi','age','charges','children']

In [8]:
scaler=StandardScaler()
scaler.fit(insurance[features_to_scale])
scaled_df = scaler.transform(insurance[features_to_scale])

In [9]:
insurance[features_to_scale] = scaled_df
print(insurance.head())

        age  sex_1  sex_2       bmi  children  smoker_1  smoker_2  region_1  \
0 -1.438764      1      0 -0.453320 -0.908614         1         0         1   
1 -1.509965      0      1  0.509621 -0.078767         0         1         0   
2 -0.797954      0      1  0.383307  1.580926         0         1         0   
3 -0.441948      0      1 -1.305531 -0.908614         0         1         0   
4 -0.513149      0      1 -0.292556 -0.908614         0         1         0   

   region_2  region_3  region_4   charges  
0         0         0         0  0.298584  
1         1         0         0 -0.953689  
2         1         0         0 -0.728675  
3         0         1         0  0.719843  
4         0         1         0 -0.776802  


In [10]:
from sklearn.preprocessing import PolynomialFeatures
X=insurance.drop('charges',axis=1)
quad = PolynomialFeatures (degree = 2)
x_quad = quad.fit_transform(X)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_quad, insurance['charges'],test_size=0.2,random_state=40)

In [12]:
print(y_train)

737    -0.808401
369    -0.808605
284    -0.301062
1302   -0.831163
958     2.293489
          ...   
1330   -0.052972
1016   -0.872429
165    -0.236532
7      -0.494728
219     0.975702
Name: charges, Length: 1070, dtype: float64


In [13]:
linreg=LinearRegression()
linreg.fit(x_train,y_train)

In [14]:
ridgereg = Ridge(alpha=0.001)
ridgereg.fit(x_train, y_train)

In [15]:
lassoreg = Lasso(alpha=0.001)
lassoreg.fit(x_train,y_train)

In [16]:
elasnet = ElasticNet()
elasnet.fit(x_train,y_train)

In [17]:
print("scores:")
print("linear regression : {}".format(linreg.score(x_test,y_test)))
print("Ridge regression : {}".format(ridgereg.score(x_test,y_test)))
print("Lasso regression: {}".format(lassoreg.score(x_test,y_test)))
print("Elastic Net regression : {}".format(elasnet.score(x_test,y_test)))

scores:
linear regression : 0.8467310460992706
Ridge regression : 0.8467104777345651
Lasso regression: 0.8468447753138417
Elastic Net regression : -0.0004047883552189546


In [18]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(loss='squared_error',alpha=0.001,max_iter=10000)
sgd.fit(x_train,y_train)
print(sgd.score(x_test,y_test))
print(sgd.n_iter_)

0.8461229649487343
10


In [19]:
from xgboost import XGBRegressor
model = XGBRegressor(max_depth = 3, n_estimators = 40, learning_rate = 0.15)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(model.score(x_test,y_test))

0.8525805635873329


In [20]:
from sklearn.metrics import r2_score
print(f'Train R2-Score : {np.round(r2_score(y_train, model.predict(x_train)))*100} %')
print(f'Test R2-Score : {np.round(r2_score(y_test, y_pred)*100)} %')

Train R2-Score : 100.0 %
Test R2-Score : 85.0 %


In [21]:
import optuna

In [22]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model1 = XGBRegressor(**param)
    model1.fit(x_train, y_train)
    y_pred = model1.predict(x_test)
    return model1.score(x_test,y_test)

In [23]:
study = optuna.create_study(direction='maximize', study_name='regression')
study.optimize(objective, n_trials=100)

[I 2023-08-29 16:42:09,318] A new study created in memory with name: regression
[I 2023-08-29 16:42:09,648] Trial 0 finished with value: 0.7976129622184962 and parameters: {'max_depth': 2, 'learning_rate': 0.6057213566697314, 'n_estimators': 437, 'min_child_weight': 5, 'gamma': 0.5723485609238939, 'subsample': 0.3514968976470151, 'colsample_bytree': 0.3823187796909359, 'reg_alpha': 0.9869059011551289, 'reg_lambda': 0.16335044609821112, 'random_state': 91}. Best is trial 0 with value: 0.7976129622184962.
[I 2023-08-29 16:42:12,413] Trial 1 finished with value: -1.121376458191674 and parameters: {'max_depth': 9, 'learning_rate': 0.9269746840051438, 'n_estimators': 951, 'min_child_weight': 4, 'gamma': 0.5871623239571607, 'subsample': 0.303702122239197, 'colsample_bytree': 0.6288529141447526, 'reg_alpha': 0.5585474144145678, 'reg_lambda': 0.7818218438665064, 'random_state': 262}. Best is trial 0 with value: 0.7976129622184962.
[I 2023-08-29 16:42:12,771] Trial 2 finished with value: 0.8282

In [24]:
model1 = XGBRegressor(**study.best_params)
model1.fit(x_train, y_train)
y_pred = model1.predict(x_test)

In [25]:
print(model1.score(x_test,y_test))
print(f'Train R2-Score : {np.round(r2_score(y_train, model1.predict(x_train)))*100} %')
print(f'Test R2-Score : {np.round(r2_score(y_test, y_pred)*100)} %')

0.8586738736395434
Train R2-Score : 100.0 %
Test R2-Score : 86.0 %
