In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
csv_cruise = r'C:\Users\Dips\Desktop\Data Science work\Data to Test Methods\Cruise Ship Info.csv'

In [3]:
data=pd.read_csv(csv_cruise,  index_col='Ship_name')

In [4]:
data.head()

Unnamed: 0_level_0,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Ship_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0


In [5]:
data.corr()

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Age,1.0,-0.606646,-0.515542,-0.532286,-0.510019,-0.27883,-0.530657
Tonnage,-0.606646,1.0,0.945061,0.922368,0.948764,-0.040846,0.927569
passengers,-0.515542,0.945061,1.0,0.883535,0.976341,-0.294867,0.915234
length,-0.532286,0.922368,0.883535,1.0,0.889798,-0.090488,0.895857
cabins,-0.510019,0.948764,0.976341,0.889798,1.0,-0.253181,0.950823
passenger_density,-0.27883,-0.040846,-0.294867,-0.090488,-0.253181,1.0,-0.155509
crew,-0.530657,0.927569,0.915234,0.895857,0.950823,-0.155509,1.0


In [6]:
y=data.crew
features = ['Tonnage','passengers','length','cabins']
X=data[features]

In [7]:
X.describe()
y.describe()

count    158.000000
mean       7.794177
std        3.503487
min        0.590000
25%        5.480000
50%        8.150000
75%        9.990000
max       21.000000
Name: crew, dtype: float64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123)

In [9]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [10]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [11]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [12]:
clf.refit

True

In [13]:
pred = clf.predict(X_test)

In [14]:
r2_score(y_test, pred)

0.9383296372587491

In [15]:
mean_squared_error(y_test, pred)

0.6229946247292417

In [16]:
y_test

Ship_name
Century          8.58
Victory         11.50
PaulGauguin      2.11
Nautica          4.00
Radiance         8.68
Insignia         4.00
Regatta          4.00
Pride            1.60
Virgo           12.00
Surf             1.80
Sovreign         8.08
Eurodam          8.00
Splendour        7.20
Legend           1.60
Fascination      9.20
Zuiderdam        8.00
Romantica        6.00
Crown            4.70
Navigator       11.85
Mediterranea     9.20
Inspiration      9.20
Liberty         13.60
Summit           9.99
Zenith           6.70
Classica         6.17
Veendam          5.88
Rhapsody         2.97
Ryndam           5.88
Spirit          10.29
Europa           6.36
Mercury          9.09
Statendam        5.88
Name: crew, dtype: float64