# Import Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pickle

# Read Data

In [2]:
df_train=pd.read_csv('train.csv')
X_test=pd.read_csv('test.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3487 entries, 0 to 3486
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender_Female           3487 non-null   int64  
 1   gender_Male             3487 non-null   int64  
 2   job_Argiculture         3487 non-null   int64  
 3   job_Entertainment       3487 non-null   int64  
 4   job_Financial Services  3487 non-null   int64  
 5   job_Health              3487 non-null   int64  
 6   job_IT                  3487 non-null   int64  
 7   job_Manufacturing       3487 non-null   int64  
 8   job_Property            3487 non-null   int64  
 9   job_Retail              3487 non-null   int64  
 10  car_No                  3487 non-null   int64  
 11  state_NSW               3487 non-null   int64  
 12  state_QLD               3487 non-null   int64  
 13  age                     3487 non-null   int64  
 14  wealth_segment          3487 non-null   

In [4]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   gender_Female           1000 non-null   int64
 1   gender_Male             1000 non-null   int64
 2   job_Argiculture         1000 non-null   int64
 3   job_Entertainment       1000 non-null   int64
 4   job_Financial Services  1000 non-null   int64
 5   job_Health              1000 non-null   int64
 6   job_IT                  1000 non-null   int64
 7   job_Manufacturing       1000 non-null   int64
 8   job_Property            1000 non-null   int64
 9   job_Retail              1000 non-null   int64
 10  car_No                  1000 non-null   int64
 11  state_NSW               1000 non-null   int64
 12  state_QLD               1000 non-null   int64
 13  age                     1000 non-null   int64
 14  wealth_segment          1000 non-null   int64
 15  tenure                

In [5]:
df_train['tenure']=df_train['tenure'].astype('int64')

In [6]:
X_train=df_train.iloc[:,:-1]
y_train=df_train.iloc[:,-1]

# Model Training

In [7]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [8]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [9]:
rf=RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', 
                               n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)
rf_random.fit(X_train,y_train)

rf_random.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   1.7s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   1.9s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   1.8s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   1.7s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   1.8s
[CV] n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15 
[CV]  n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15, total=   2.6s
[CV] n_estimators=1100, min_samples_split=10, mi

[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   1.1s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   1.0s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   0.9s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   0.9s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   0.9s
[CV] n_estimators=700, min_samples_split=15, min_sam

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.9min finished


{'n_estimators': 1100,
 'min_samples_split': 15,
 'min_samples_leaf': 10,
 'max_features': 'sqrt',
 'max_depth': 5}

In [10]:
# open a file, where you ant to store the data
file = open('random_forest_regression_model.pkl', 'wb')

# dump information to that file
pickle.dump(rf_random, file)

# Prediction

In [11]:
predictions=rf_random.predict(X_test)
X_test['profit']=predictions
X_test=X_test.sort_values(by='profit', ascending=False)

In [40]:
file='C:/Users/Amit/Desktop/Internships/KPMG/dataset.xlsx'
df= pd.read_excel(file, sheet_name="NewCustomerList",header=1)[['first_name','last_name','DOB','state']]
df=df.loc[X_test.index]
df['profit']=X_test['profit']
df=df.reset_index(drop=True)

In [41]:
df.loc[0:4]

Unnamed: 0,first_name,last_name,DOB,state,profit
0,Aundrea,Outridge,2001-01-24,VIC,602.353445
1,Tomaso,Horsley,2001-04-16,VIC,602.117926
2,Tanya,Hamberston,2000-09-25,QLD,601.096094
3,Ailyn,Howgate,2001-09-27,QLD,598.239882
4,Kelcie,Kingaby,2000-03-24,QLD,595.505647


In [42]:
df.to_csv('final.csv',index=False)