# Hyper-Parameters Tunning (Data Scientist Job Change Prediction)

In [14]:
from pprint import pprint
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score,accuracy_score,f1_score,recall_score,roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
model = RandomForestClassifier(random_state=42)

<IPython.core.display.Javascript object>

In [4]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

# Random Hyperparameter Grid`

In [5]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}


In [None]:
data=pd.read_csv('Converted_HR_job_2',index_col=[0])


In [7]:
X=data.drop(['target'],axis=1).values
y=data['target'].values

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.3,random_state=42)

<IPython.core.display.Javascript object>

# Random Search Training

In [9]:
rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20,
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   random_state=42, verbose=2)

In [10]:
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [12]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    Accuracy = accuracy_score(y_test, y_pred)
    Precision = precision_score(y_test, y_pred)
    Recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    ROC = roc_auc_score(y_test, y_pred)
    
    
    print('Model Performance')
    print('Accuracy: {:0.3f}'.format(Accuracy*100))
    print('Precision: {:0.3f}'.format(Precision*100))
    print('Recall: {:0.3f}'.format(Recall*100))
    print('F1: {:0.3f}'.format(F1*100))
    print('ROC: {:0.3f}'.format(ROC*100))
   
    
    return [accuracy_score, precision_score,recall_score,f1_score ,roc_auc_score]

# Evaluate Random Search

### baseline score

In [15]:
base_model = RandomForestClassifier(random_state=42)
base_model.fit(X_train, y_train)
base_score = evaluate(base_model, X_test, y_test)

<IPython.core.display.Javascript object>

Model Performance
Accuracy: 76.026
Precision: 53.543
Recall: 36.533
F1: 43.432
ROC: 62.929


### best model score

In [16]:
best_random = rf_random.best_estimator_
random_score = evaluate(best_random, X_test, y_test)

Model Performance
Accuracy: 77.262
Precision: 58.150
Recall: 34.738
F1: 43.493
ROC: 63.159


### Improvement

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_score[0] - base_score[0]) / base_score[0]))

In [18]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100,110,120,130],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [1, 2, 3, 4, 5, 6],
    'n_estimators': [400, 500,600,700,800]
}


# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 900 candidates, totalling 2700 fits


# Grid Search with Cross Validation

### Improvement

# Conclusions:

- Data is Highly imbalanced.
- There is High Cardinality in some categorical data.
- Data is full of nulls that reach 30% in some columns.
- Handling outliers is not a good choice in this data.
- The best way for filling the nulls is by using mode of data.
- The correlation between independent variables and dependent variable is low.'
- The best model is Random Forest.
- More than 50% candidates will not change the job.