In [25]:
import joblib
import pandas as pd
from joblib import dump
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets
import seaborn as sns

In [26]:
#upload dataset
df = sns.load_dataset('penguins')
df.head(2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female


In [27]:
#remove zero-rows
df = df.dropna(subset=['body_mass_g'])

#Create test and validation subsets
df_train, df_valid = train_test_split(df, test_size=0.25, random_state=12345)

#features:
drop_col = ['species', 'island', 'sex', 'body_mass_g']
features = df.drop(drop_col, axis=1)
features_train = df_train.drop(drop_col, axis=1)
features_valid = df_valid.drop(drop_col, axis=1)
#target:
target = df['body_mass_g']
target_train = df_train['body_mass_g']
target_valid = df_valid['body_mass_g']

#DecisionTree:
model_1 = DecisionTreeRegressor(random_state=12345)
#RandomForest:
model_2 = RandomForestRegressor(n_estimators=3)
#Linear:
model_3 = LinearRegression()

#Train the models
model_1.fit(features_train, target_train)
model_2.fit(features_train, target_train)
model_3.fit(features_train, target_train)

# save the model, name & folder path
#joblib.dump(model, 'z:/For Office/Ignatov/Python/model.joblib')

In [28]:
# Check error for DecisionTreeRegressor
predicted_valid = model_1.predict(features_valid)
rmse = mean_squared_error(target_valid, predicted_valid)**0.5
print("DecisionTreeRegressor :", rmse)

DecisionTreeRegressor : 449.3373674283521


In [29]:
# Check error for RandomForest
for estim in range(10, 51, 10):
    model_2 = RandomForestRegressor(n_estimators=estim, max_depth=10, random_state=12345)
    model_2.fit(features_train, target_train)
    predicted_valid = model_2.predict(features_valid)
    rmse = mean_squared_error(target_valid, predicted_valid)**0.5
    print("RandomForest n_estimators =", estim, ":", rmse)

RandomForest n_estimators = 10 : 375.66635601518163
RandomForest n_estimators = 20 : 359.8091422221994
RandomForest n_estimators = 30 : 362.34907569093315
RandomForest n_estimators = 40 : 358.26430734667815
RandomForest n_estimators = 50 : 357.61460166331284


In [30]:
#choose n_estimator parameter with the smallest error
model_2 = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=12345)
model_2.fit(features_train, target_train)

In [31]:
#Check the error for linear regression model
predicted_valid = model_3.predict(features_valid)
rmse = mean_squared_error(target_valid, predicted_valid)**0.5
print("LinearRegression :", rmse)

LinearRegression : 429.9211669557407


In [32]:
#Choose the suitable model. For random forest, set a parameter
best_model = model_3

In [33]:
predicted_valid = best_model.predict(features)
rmse = mean_squared_error(target, predicted_valid)**0.5
print("best_model rmse :", rmse)

best_model rmse : 392.583716584167


In [34]:
#Train the model on the whole dataset
best_model.fit(features, target)
joblib.dump(model_2, 'c:/Users/IgIgnatov/OneDrive - DXC Production/ML_model.joblib')

['c:/Users/IgIgnatov/OneDrive - DXC Production/ML_model.joblib']