In [118]:
import pandas as pd 
import numpy as np                     # For mathematical calculations 
import seaborn as sns                  # For data visualization 
import matplotlib.pyplot as plt        # For plotting graphs 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression 
%matplotlib inline 
import warnings         

In [63]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

In [64]:
train_original=train.copy()
test_original=test.copy()

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [66]:
# Separate target from predictors
y = train.Attrition_rate
X = train.drop(['Attrition_rate'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]


# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [67]:
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [68]:
from sklearn.metrics import mean_squared_error

from math import sqrt
def score(y_actual,y_predicted):
    rmse = sqrt(mean_squared_error(y_actual, y_predicted))
    score=100*max(0,1-rmse)
    return score

In [69]:
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
print(score(y_valid,preds))

80.70555910567307


In [70]:
pipe_lr_pca = Pipeline([('preprocessor',preprocessor),
                        ('scl', StandardScaler()),
                        ('pca', PCA(0.85,whiten=True)),
                       ('rgr',LinearRegression())])
pipe_lr_pca.fit(X_train,y_train)
predso = pipe_lr_pca.predict(X_valid)
print(score(y_valid,predso))

81.12239682758619


In [71]:
pipe_lr_pca.predict(test)

array([0.20042359, 0.23076392, 0.19535565, ..., 0.18496393, 0.16635356,
       0.19245593])

In [95]:
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]
grid_params_rf = [{
                   'clf__min_samples_leaf': param_range,
                   'clf__max_depth': param_range,
                   'clf__min_samples_split': param_range[1:]}]

In [98]:
gs_rf_pca = GridSearchCV(estimator=pipe_rf_pca,
                         param_grid=grid_params_rf,
                    
                         cv=10, 
                         n_jobs=-1)

In [99]:
gs_rf_pca.fit(X_train,y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                      

In [100]:
predref=gs_rf_pca.predict(X_valid)
print(score(y_valid,predref))

81.21214272890327


In [101]:
predict_ref=gs_rf_pca.predict(test)

In [102]:
predict_ref

array([0.17686081, 0.20215643, 0.19132994, ..., 0.1800058 , 0.1755919 ,
       0.1800058 ])

In [103]:
test_originals=test_original.copy()

In [104]:
test_originals['Attrition_rate']=predict_ref

In [105]:
pd.DataFrame(test_originals, columns=['Employee_ID','Attrition_rate']).to_csv('C:/Users/Dell/Downloads/PipeRf.csv',mode = 'w', index=False)

In [125]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# Provide range for max_depth from 1 to 20 with an interval of 2 and from 1 to 200 with an interval of 20 for n_estimators 
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
pipe_xgb_pca = Pipeline([('preprocessor',preprocessor),
                        ('scl', StandardScaler()),
                        ('pca', PCA(n_components=2)),
                        ('clf', XGBRegressor(random_state=42))])
paramgrid = {'max_depth': list(range(1,10,1)), 'n_estimators': list(range(1, 200, 20))}
params = {
    "clf__n_estimators": [25, 50, 100, 150, 200, 300],
    "clf__learning_rate": [0.5, 0.75, 1]
}

In [126]:
gs_xgb_pca = GridSearchCV(estimator=pipe_xgb_pca,
                         param_grid=params,
                    
                         cv=10, 
                         n_jobs=-1)

In [127]:
gs_xgb_pca.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                      

In [128]:
import sklearn.model_selection as ms
from sklearn.linear_model import Ridge
import math
pipe_ridge_pca = Pipeline([('preprocessor',preprocessor),
                        ('scl', StandardScaler()),
                        ('pca', PCA(n_components=2)),
                        ('clf', Ridge(random_state=42))])

parameters= {'clf__alpha':[x for x in range(1,101)]}

In [129]:
gs_ridge_pca = GridSearchCV(estimator=pipe_ridge_pca,
                         param_grid=parameters,
                    
                         cv=10, 
                         n_jobs=-1)

In [130]:
gs_ridge_pca

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                      

In [143]:
grids = [gs_xgb_pca,gs_ridge_pca,gs_lasso_pca]
grid_dict = {0: 'Gradient Boosting Regression', 1: 'Ridge Regression ',2: 'LassoRegression'}
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(X_train, y_train)
    print('Best params: %s' % gs.best_params_)
    print('Best training accuracy: %.3f' % gs.best_score_)
    y_pred = gs.predict(X_valid)
    print('Test set accuracy score for best params: %.3f ' % score(y_valid, y_pred))
    if score(y_valid, y_pred) > best_acc:
        best_acc = score(y_valid, y_pred)
        best_gs = gs
        best_clf = idx
        print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

Performing model optimizations...

Estimator: Gradient Boosting Regression
Best params: {'clf__learning_rate': 0.5, 'clf__n_estimators': 25}
Best training accuracy: -0.098
Test set accuracy score for best params: 80.567 

Classifier with best test set accuracy: Gradient Boosting Regression

Estimator: Ridge Regression 
Best params: {'clf__alpha': 32}
Best training accuracy: -0.004
Test set accuracy score for best params: 81.245 

Classifier with best test set accuracy: Ridge Regression 

Estimator: LassoRegression
Best params: {'clf__alpha': 0.1}
Best training accuracy: -0.004
Test set accuracy score for best params: 81.258 

Classifier with best test set accuracy: LassoRegression


In [133]:
predicto=gs_ridge_pca.predict(test)

In [134]:
predicto

array([0.18754728, 0.20238909, 0.19544588, ..., 0.19101398, 0.18347629,
       0.18954554])

In [135]:
test_originalees=test_original.copy()

In [136]:
test_originalees['Attrition_rate']=predicto

In [137]:
pd.DataFrame(test_originalees, columns=['Employee_ID','Attrition_rate']).to_csv('C:/Users/Dell/Downloads/PipeRidge.csv',mode = 'w', index=False)

In [139]:
from sklearn.linear_model import Lasso
import sklearn.model_selection as ms
import math

pipe_lasso_pca = Pipeline([('preprocessor',preprocessor),
                        ('scl', StandardScaler()),
                        ('pca', PCA(n_components=2)),
                      ('clf', Lasso(random_state=42))])
parameters= {'clf__alpha':[0.0001,0.0009,0.001,0.002,0.003,0.01,0.1,1,10,100]}


In [140]:
gs_lasso_pca = GridSearchCV(estimator=pipe_lasso_pca,
                         param_grid=parameters,
                    
                         cv=10, 
                         n_jobs=-1)

In [141]:
gs_lasso_pca

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                      

In [144]:
test_originalox=test_original.copy()

In [145]:
peditox=gs_lasso_pca.predict(test)

In [146]:
test_originalox['Attrition_rate']=peditox

In [147]:
pd.DataFrame(test_originalox, columns=['Employee_ID','Attrition_rate']).to_csv('C:/Users/Dell/Downloads/PipeLasso.csv',mode = 'w', index=False)

In [197]:
from sklearn.linear_model import ElasticNetCV,ElasticNet

alpha = [1.0]
alphas=[0.00010491]
l1ratio = [0.1, .5, .7, .9, .95, .99, 1]
ratio=[0.100]
pipe_elastic_pca = Pipeline([('preprocessor',preprocessor),
                        ('scl', StandardScaler()),
                        ('pca', PCA(n_components=2)),
                        ('clf', ElasticNet(random_state=42))])
params={'clf__alpha':alpha,'clf__l1ratio':l1ratio}
n_components = list(range(1,X.shape[1]+1,1))

normalize = [True, False]
selection = ['cyclic', 'random']

    # Create a dictionary of all the parameter options 
    # Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(    clf__alpha=alphas,clf__l1_ratio=ratio,
                      clf__normalize=normalize,
                      clf__selection=selection)


In [198]:
gs_elastic_pca=GridSearchCV(pipe_elastic_pca,
                         param_grid=parameters,
                    
                         cv=10, 
                         n_jobs=-1)

In [199]:
gs_elastic_pca

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                      

In [200]:
gs_elastic_pca.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                      

In [165]:
ElasticNet().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'l1_ratio', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [191]:
predx=gs_elastic_pca.predict(test)

In [201]:
prodi=gs_elastic_pca.predict(X_valid)

In [202]:
print(score(y_valid,prodi))

81.25288044498407


In [194]:
test_originalexs=test_original.copy()

In [195]:
test_originalexs['Attrition_rate']=predx

In [196]:
pd.DataFrame(test_originalexs, columns=['Employee_ID','Attrition_rate']).to_csv('C:/Users/Dell/Downloads/PipeElastic.csv',mode = 'w', index=False)

In [204]:
import numpy as np

final1=np.expm1(predx)
#StackingRegressor to predict the final Test

final2=np.expm1(peditox)

#LassoRegressor to predict the final Test

final3=np.expm1(predicto)

In [234]:
#0.4->0.5
final=(0.3*final1+0.4*final2+0.2*final3)

In [235]:
final

array([0.18680555, 0.19041295, 0.18871869, ..., 0.18764337, 0.18582539,
       0.18728813])

In [236]:
test_oxlr=test_original.copy()

In [237]:
test_oxlr['Attrition_rate']=final

In [221]:
final.size

3000

In [222]:
final1.size

3000

In [224]:
test.shape

(3000, 23)

In [238]:
pd.DataFrame(test_oxlr, columns=['Employee_ID','Attrition_rate']).to_csv('C:/Users/Dell/Downloads/PipeFinal39.csv',mode = 'w', index=False)