In [1]:
import pandas as pd
import numpy as np

balance_df = pd.read_excel('S:/Car/ML ready data/balance_frame.xlsx')
print(balance_df.shape)


(8369, 36)


In [30]:
balance_df.model.value_counts()

model
63     362
179    349
90     331
157    295
77     260
      ... 
115      1
102      1
100      1
0        1
283      1
Name: count, Length: 266, dtype: int64

In [31]:
features = ['Gear', 'Year of Manufacture', 'modelYear', 'km', 'Top Speed', 'transmission',
            'Mileage','City','Color','bt','ownerNo','Insurance Validity','Fuel Type','model']

In [14]:
# Outliers removal using IQR

Q1 = balance_df.quantile(0.05)
Q3 = balance_df.quantile(0.95)
IQR = Q3 - Q1
balance_df = balance_df[~((balance_df < (Q1 - 1.5 * IQR)) |(balance_df > (Q3 + 1.5 * IQR))).any(axis=1)]
balance_df.shape

(7697, 36)

In [34]:
#scaling the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
balance_df[features] = scaler.fit_transform(balance_df[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balance_df[features] = scaler.fit_transform(balance_df[features])


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
X = balance_df[features]
y = balance_df['price_in_lakhs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
r2_score(y_test, y_pred)

Mean Squared Error: 7.427372702917507


0.9117254584517427

In [36]:
importances = model.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
0,Gear,0.553093
1,Year of Manufacture,0.086063
2,modelYear,0.082955
4,Top Speed,0.064019
6,Mileage,0.063939
13,model,0.049425
3,km,0.034655
9,bt,0.016744
5,transmission,0.012592
12,Fuel Type,0.011746


In [42]:
parameters = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True],
    'criterion': ['squared_error', 'absolute_error'],
    'max_features': ['auto', 'sqrt'],
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)


144 fits failed out of a total of 192.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "s:\Car\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "s:\Car\.venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "s:\Car\.venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "s:\Car\.venv\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Invalid

{'bootstrap': True, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [43]:
best_params

{'bootstrap': True,
 'criterion': 'squared_error',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [44]:
#Training the model with best parameters
model = RandomForestRegressor(**best_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
r2_score(y_test, y_pred)

Mean Squared Error: 8.615634872520186


0.8976029278535931

In [45]:
import pickle
pickle.dump(model,open('random_forest_model.pkl', 'wb'))

In [None]:
#model saving
import pickle
pickle.dump(model, open('model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))
pickle.dump(feature_importances, open('feature_importances.pkl', 'wb'))
pickle.dump(balance_df, open('balance_df.pkl', 'wb'))
pickle.dump(X, open('X.pkl', 'wb'))
pickle.dump(y, open('y.pkl', 'wb'))
pickle.dump(features, open('features.pkl', 'wb'))
pickle.dump(importances, open('importances.pkl', 'wb'))