### Attrition Rate - Model BUilding - Tree Models

1. Adaboost!!! Both regression and classification problems
2. GBM

In [81]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-1.6.2-py3-none-win_amd64.whl.metadata (1.8 kB)
Downloading xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
   ---------------------------------------- 125.4/125.4 MB 8.5 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


In [82]:
import numpy as np
import time
# import Ipython
# from ipynb.display import display
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as pp
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier,GradientBoostingRegressor)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, log_loss,confusion_matrix, mean_squared_error,explained_variance_score,r2_score)

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
## Importing dataset
df=pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head(4)
df.dtypes
print(df.Attrition.value_counts())

No     1233
Yes     237
Name: Attrition, dtype: int64


In [4]:
### Let's count the number of dtypes ###
df_int=pd.DataFrame(data=df.select_dtypes(include=['int64']))
df_obj=pd.DataFrame(data=df.select_dtypes(include=['object']))
print(f'numerical data :{df_int.shape[1]}')
print(f'categorical data :{df_obj.shape[1]}')


numerical data :26
categorical data :9


In [5]:
### Encoding the categorical data ###
df_obj=df_obj.drop('Attrition',axis=1)
df_obj=pd.get_dummies(df_obj,drop_first=True)
df_obj.shape
### Label Encoder ###
# for i in d

(1470, 21)

In [6]:
### Standardizing the data 
std_scaler=StandardScaler()

df_int_scaled=std_scaler.fit_transform(df_int)
df_num_scaled=pd.DataFrame(data=df_int_scaled,columns=df_int.columns,index=df_int.index)
df_num_scaled

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.446350,0.742527,-1.010909,-0.891688,0.0,-1.701283,-0.660531,1.383138,0.379672,-0.057788,...,-1.584178,0.0,-0.932014,-0.421642,-2.171982,-2.493820,-0.164613,-0.063296,-0.679146,0.245834
1,1.322365,-1.297775,-0.147150,-1.868426,0.0,-1.699621,0.254625,-0.240677,-1.026167,-0.057788,...,1.191438,0.0,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,-0.368715,0.806541
2,0.008343,1.414363,-0.887515,-0.891688,0.0,-1.696298,1.169781,1.284725,-1.026167,-0.961486,...,-0.658973,0.0,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-0.679146,-1.155935
3,-0.429664,1.461466,-0.764121,1.061787,0.0,-1.694636,1.169781,-0.486709,0.379672,-0.961486,...,0.266233,0.0,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,-1.155935
4,-1.086676,-0.524295,-0.887515,-1.868426,0.0,-1.691313,-1.575686,-1.274014,0.379672,-0.961486,...,1.191438,0.0,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.058285,-0.595227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,-0.101159,0.202082,1.703764,-0.891688,0.0,1.721670,0.254625,-1.224807,1.785511,-0.057788,...,0.266233,0.0,0.241988,0.735447,0.155707,0.338096,-0.327893,-0.615492,-0.679146,-0.314873
1466,0.227347,-0.469754,-0.393938,-1.868426,0.0,1.723332,1.169781,-1.175601,-1.026167,0.845911,...,-1.584178,0.0,0.241988,-0.293077,1.707500,0.338096,-0.001333,0.764998,-0.368715,0.806541
1467,-1.086676,-1.605183,-0.640727,0.085049,0.0,1.726655,-0.660531,1.038693,1.785511,-0.057788,...,-0.658973,0.0,0.241988,-0.678774,-2.171982,0.338096,-0.164613,-0.615492,-0.679146,-0.314873
1468,1.322365,0.546677,-0.887515,0.085049,0.0,1.728317,1.169781,-0.142264,-1.026167,-0.057788,...,1.191438,0.0,-0.932014,0.735447,0.155707,-1.077862,0.325228,0.488900,-0.679146,1.086895


In [7]:
### Preparing for train and test split

map={'Yes':1,'No':0}

targets=df['Attrition'].apply(lambda x: map[x])
print(targets.value_counts())

### train test split ###
df_indep=pd.concat([df_num_scaled,df_obj],axis=1)

0    1233
1     237
Name: Attrition, dtype: int64


In [8]:
X_train,X_test,y_train,y_test=train_test_split(df_indep,targets,random_state=54,test_size=0.2)
print(f'''
Target rate: {100*np.mean(targets):.2f}%
Train split (X & Y): {X_train.shape} & {y_train.shape} & target_prop: {100*(np.mean(y_train)):.2f}%
Test split (X & Y): {X_test.shape} & {y_test.shape} & target_prop: {100*(np.mean(y_test)):.2f}%

''')


Target rate: 16.12%
Train split (X & Y): (1176, 47) & (1176,) & target_prop: 15.99%
Test split (X & Y): (294, 47) & (294,) & target_prop: 16.67%




Build a adaboost from scratch

In [9]:
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.alphas = []  # Weights for each weak classifier
        self.models = []  # Store weak classifiers

    def fit(self, X, y):
        """ Train AdaBoost model """
        n_samples, n_features = X.shape
        w = np.ones(n_samples) / n_samples  # Initialize weights equally

        for _ in range(self.n_estimators):
            model = DecisionTreeClassifier(max_depth=1)  # Weak learner (stump)
            model.fit(X, y, sample_weight=w)
            y_pred = model.predict(X)

            # Compute weighted error
            err = np.sum(w * (y_pred != y)) / np.sum(w)

            # Compute model weight (alpha)
            alpha = 0.5 * np.log((1 - err) / (err + 1e-10))  # Avoid division by zero
            self.alphas.append(alpha)
            self.models.append(model)

            # Update sample weights
            w *= np.exp(-alpha * y * y_pred)
            w /= np.sum(w)  # Normalize weights

    def predict(self, X):
        """ Predict using the AdaBoost model """
        final_pred = np.zeros(X.shape[0])

        for alpha, model in zip(self.alphas, self.models):
            final_pred += alpha * model.predict(X)  # Weighted sum of weak classifiers

        return np.sign(final_pred)  # Return final classification (-1 or +1)

In [10]:
adaboost=AdaBoostClassifier(n_estimators=100,learning_rate=1,random_state=42)
adaboost_udf=AdaBoost(n_estimators=100)

In [11]:
### Fitting the model
adaboost.fit(X_train,y_train)
adaboost_udf.fit(X_train,y_train)

In [12]:
### Predicting model ###
y_pred_test=adaboost.predict(X_test)
y_pred_train=adaboost.predict(X_train)
len(adaboost.predict(X_test))
print(f'Train confusion matrix & accuracy:{100*accuracy_score(y_train,y_pred_train):.2f}%')
print(confusion_matrix(y_train,y_pred_train))
print(f'Test confusion matrix & accuracy:{100*accuracy_score(y_test,y_pred_test):.2f}%')
print(confusion_matrix(y_test,y_pred_test))

Train confusion matrix & accuracy:92.01%
[[966  22]
 [ 72 116]]
Test confusion matrix & accuracy:87.07%
[[234  11]
 [ 27  22]]


In [13]:
udf_y_pred=adaboost_udf.predict(X_test)
confusion_matrix(y_test,udf_y_pred)

array([[245,   0],
       [ 49,   0]], dtype=int64)

### Regression Problem

Predicting the price of the house from the given data

In [14]:
### Importing dataset
reg_df=pd.read_csv('kc_house_data.csv')
reg_df.shape

(21613, 21)

In [15]:
reg_df.isnull().sum()
reg_df.dtypes
reg_df.drop(['id','date'],axis=1,inplace=True)

In [16]:
reg_df.columns
reg_df_num=reg_df.select_dtypes(include=['int64','float64'])
reg_df_cat=reg_df.select_dtypes(include=['object'])

### Removing target variables
reg_df_num.drop('price',axis=1,inplace=True)
reg_df_num.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [17]:
targets=reg_df.price

In [18]:
#### Train test split 
X_train, X_test, y_train, y_test=train_test_split(reg_df_num,targets,test_size=0.20,random_state=43)
print(f'''
Train split (X & Y): {X_train.shape} & {y_train.shape}
Test split (X & Y): {X_test.shape} & {y_test.shape}
''')


Train split (X & Y): (17290, 18) & (17290,)
Test split (X & Y): (4323, 18) & (4323,)



GradientBoostingRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#

In [19]:
gbm_reg=GradientBoostingRegressor()

In [28]:
### Fitting ###
start = time.time()
gbm_reg.fit(X_train,y_train)
### predict ###
predict_reg_train=gbm_reg.predict(X_train)
predict_reg_test=gbm_reg.predict(X_test)
end = time.time()

print(f"Execution time: {end - start:.6f} seconds")

Execution time: 2.928258 seconds


In [33]:
print(f'Training r-squared value:{100*r2_score(y_train,predict_reg_train):.3f}%')
print(f'Test r-squared value:{100*r2_score(y_test,predict_reg_test):.3f}%')
print(f'Root Mean squared error - Train: {np.sqrt(mean_squared_error(y_train,predict_reg_train))}')
print(f'Root Mean squared error - Test: {np.sqrt(mean_squared_error(y_test,predict_reg_test))}')

Training r-squared value:90.052%
Test r-squared value:87.484%
Root Mean squared error - Train: 114990.00824815263
Root Mean squared error - Test: 133392.15413760426


In [83]:
params_grid ={
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate':[0.01,0.1,1],
    'max_depth':[2,3,5],
    'n_estimators':[50,100,500]
}

In [None]:
grid_search=GridSearchCV(estimator=gbm_reg,param_grid=params_grid,scoring='accuracy',cv=3,n_jobs=1,return_train_score=True)
grid_search.fit(X_train,y_train)

In [68]:
grid_df=pd.DataFrame(grid_search.predict(X_test))
grid_df

Unnamed: 0,0
0,548171.222577
1,512162.256063
2,577809.272141
3,908335.988585
4,455162.832380
...,...
4318,455162.832380
4319,476050.080448
4320,521295.057527
4321,540038.512164


In [73]:
grid_search.best_params_

{'learning_rate': 0.01,
 'loss': 'squared_error',
 'max_depth': 2,
 'n_estimators': 50}

In [78]:
best_gbm=GradientBoostingRegressor(learning_rate= 0.01,
 loss= 'squared_error',
 max_depth= 2,
 n_estimators= 1000)
best_gbm.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=2, n_estimators=1000)

In [79]:
### predict ###
predict_best_train=best_gbm.predict(X_train)
predict_best_test=best_gbm.predict(X_test)
print(f'Training r-squared value:{100*r2_score(y_train,predict_best_train):.3f}%')
print(f'Test r-squared value:{100*r2_score(y_test,predict_best_test):.3f}%')
print(f'Root Mean squared error - Train: {np.sqrt(mean_squared_error(y_train,predict_best_train))}')
print(f'Root Mean squared error - Test: {np.sqrt(mean_squared_error(y_test,predict_best_test))}')

Training r-squared value:85.820%
Test r-squared value:83.380%
Root Mean squared error - Train: 137291.6265761667
Root Mean squared error - Test: 153713.4501132763
