In [28]:
#importing the libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Reading and Understanding the Data

In [29]:
data = pd.read_csv('newdata.csv')
data.head()

Unnamed: 0,City,Name,Year,Shell,Volume,Mileage,Transmission,Rudder,Color,Gear,CustomsCleared,Type Engine,Price
0,Алматы,BMWX5,2007,кроссовер,3.0,161000,автомат,слева,серебристый,полный привод,Да,бензин,7400000
1,Усть-Каменогорск,ВАЗ (Lada)2114 (хэтчбек),2013,хэтчбек,1.6,57000,механика,слева,белый,передний привод,Да,бензин,1700000
2,Алматы,Mercedes-BenzE 200,2008,седан,2.2,209000,автомат,слева,черный металлик,задний привод,Нет,дизель,3500000
3,Шымкент,ВАЗ (Lada)2190 (седан),2018,седан,1.6,19500,механика,слева,серебристый металлик,передний привод,Да,бензин,3450000
4,Шымкент,LexusLX 570,2017,внедорожник,5.7,43000,автомат,слева,белый,,Да,бензин,40000000


In [30]:
data = data[data['Gear'].notna()]
data.drop('City', axis = 1, inplace = True)
data.drop('Name', axis = 1, inplace = True)
data.drop('Color', axis = 1, inplace = True)
data.drop('Shell', axis = 1, inplace = True)

In [31]:
data['Year'] = 2020 - data['Year']
data = data[['Price','Volume','Year','Mileage',
             'Transmission','Rudder','Gear','CustomsCleared','Type Engine']]

In [32]:
data=pd.get_dummies(data, drop_first=True)

In [33]:
data.head()

Unnamed: 0,Price,Volume,Year,Mileage,Transmission_механика,Transmission_типтроник,Rudder_справа,Gear_передний привод,Gear_полный привод,CustomsCleared_Нет,Type Engine_газ-бензин,Type Engine_дизель
0,7400000,3.0,13,161000,0,0,0,0,1,0,0,0
1,1700000,1.6,7,57000,1,0,0,1,0,0,0,0
2,3500000,2.2,12,209000,0,0,0,0,0,1,0,1
3,3450000,1.6,2,19500,1,0,0,1,0,0,0,0
5,13500000,4.7,9,145,0,0,0,0,1,0,0,0


In [34]:
# independent feature
X = data.iloc[:, 1:]

y = data.iloc[:, 0]

In [35]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()
model.fit(X,y)

ExtraTreesRegressor()

In [36]:
print(model.feature_importances_)

[0.21283159 0.33459035 0.15348773 0.06383627 0.00773631 0.00546047
 0.00561473 0.19483818 0.00795967 0.00238479 0.01125992]


In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [38]:
from sklearn.ensemble import RandomForestRegressor
rf_random = RandomForestRegressor()

In [39]:
## Hyperparameters
n_estimators = [int(x) for x in np.linspace(start=100 ,stop=1200, num=12)]
print(n_estimators)

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]


In [40]:
# Randomized search CV

# no of trees in random forest 
n_estimators = [int(x) for x in np.linspace(start=100 ,stop=1200, num=12)]

# No of features to consider at every split 
max_features = ['auto', 'sqrt']

# Maximum no of levels in a tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]

# minimum number of samples required to split a node 
min_samples_split = [2, 5, 10, 15, 100]

# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [41]:
from sklearn.model_selection import RandomizedSearchCV
random_grid = { 'n_estimators':n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [42]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, 
                               scoring='neg_mean_squared_error', n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=1)

In [43]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   2.4s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   2.5s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   2.5s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   2.9s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   3.7s
[CV] n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15 
[CV]  n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15, total=   4.5s
[CV] n_estimators=1100, min_samples_split=10, mi

[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.0s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.8s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.3s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   1.8s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.1s
[CV] n_estimators=700, min_samples_split=15, min_sam

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.0min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [44]:
predictions = rf_random.predict(X_test)
predictions

array([ 5424355.65730108,  1390703.45709765,  8748787.49250808,
       25320603.33348195,  1793228.68686358, 11615738.04779205,
        4709434.28888846,  6611920.14781491,  4407569.42754019,
        7835478.37690328, 11686950.95268819,  1469605.72851068,
        3545958.95545124,  3777720.87203168,  1484323.59636104,
       13676592.38762864,  2809649.49533413, 13945815.6589872 ,
        2717735.74801981,  7576325.87721842,  2803873.07812704,
        7640088.68979653,  4576942.22464219, 34534389.49941084,
        3155229.01396438, 32697617.41700118,  8116833.44406052,
       19178484.53979755,   999431.80306138,  1908446.47590701,
        3729080.21466067,  8381830.72531038,   835475.0449238 ,
        8122054.21590665,  3705655.30647531, 11022608.54061251,
        3089445.37253011,  2474085.87111464,  7146434.15191633,
       16422594.14512702,  3161084.65135655, 12931357.32936164,
        5767483.29058757,  2546593.70974003,  3754890.35661308,
       19636578.72958753, 10206306.02289

In [45]:
import pickle

# open a file where you want to store the data
file = open('modelNewData.pkl', 'wb')

# dump info to that file
pickle.dump(rf_random, file)

In [55]:
test = pickle.load(open('modelNewData.pkl','rb'))
prediction = test.predict([[1.6,2,19500,1,0,0,1,0,0,0,0]])

In [56]:
print(prediction)

[4119564.50990863]
