### Attrition Rate - Model BUilding 

Adaboost!!! Both regression and classification problems

In [89]:
import numpy as np
# from ipynb.display import display
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as pp
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, log_loss,confusion_matrix, mean_squared_error,explained_variance_score,r2_score)

import warnings
warnings.filterwarnings('ignore')

In [44]:
## Importing dataset
df=pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head(4)
df.dtypes
print(df.Attrition.value_counts())

No     1233
Yes     237
Name: Attrition, dtype: int64


In [45]:
### Let's count the number of dtypes ###
df_int=pd.DataFrame(data=df.select_dtypes(include=['int64']))
df_obj=pd.DataFrame(data=df.select_dtypes(include=['object']))
print(f'numerical data :{df_int.shape[1]}')
print(f'categorical data :{df_obj.shape[1]}')


numerical data :26
categorical data :9


In [46]:
### Encoding the categorical data
df_obj=df_obj.drop('Attrition',axis=1)
df_obj=pd.get_dummies(df_obj,drop_first=True)
df_obj.shape

(1470, 21)

In [47]:
### Standardizing the data 
std_scaler=StandardScaler()

df_int_scaled=std_scaler.fit_transform(df_int)
df_num_scaled=pd.DataFrame(data=df_int_scaled,columns=df_int.columns,index=df_int.index)
df_num_scaled

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.446350,0.742527,-1.010909,-0.891688,0.0,-1.701283,-0.660531,1.383138,0.379672,-0.057788,...,-1.584178,0.0,-0.932014,-0.421642,-2.171982,-2.493820,-0.164613,-0.063296,-0.679146,0.245834
1,1.322365,-1.297775,-0.147150,-1.868426,0.0,-1.699621,0.254625,-0.240677,-1.026167,-0.057788,...,1.191438,0.0,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,-0.368715,0.806541
2,0.008343,1.414363,-0.887515,-0.891688,0.0,-1.696298,1.169781,1.284725,-1.026167,-0.961486,...,-0.658973,0.0,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-0.679146,-1.155935
3,-0.429664,1.461466,-0.764121,1.061787,0.0,-1.694636,1.169781,-0.486709,0.379672,-0.961486,...,0.266233,0.0,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,-1.155935
4,-1.086676,-0.524295,-0.887515,-1.868426,0.0,-1.691313,-1.575686,-1.274014,0.379672,-0.961486,...,1.191438,0.0,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.058285,-0.595227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,-0.101159,0.202082,1.703764,-0.891688,0.0,1.721670,0.254625,-1.224807,1.785511,-0.057788,...,0.266233,0.0,0.241988,0.735447,0.155707,0.338096,-0.327893,-0.615492,-0.679146,-0.314873
1466,0.227347,-0.469754,-0.393938,-1.868426,0.0,1.723332,1.169781,-1.175601,-1.026167,0.845911,...,-1.584178,0.0,0.241988,-0.293077,1.707500,0.338096,-0.001333,0.764998,-0.368715,0.806541
1467,-1.086676,-1.605183,-0.640727,0.085049,0.0,1.726655,-0.660531,1.038693,1.785511,-0.057788,...,-0.658973,0.0,0.241988,-0.678774,-2.171982,0.338096,-0.164613,-0.615492,-0.679146,-0.314873
1468,1.322365,0.546677,-0.887515,0.085049,0.0,1.728317,1.169781,-0.142264,-1.026167,-0.057788,...,1.191438,0.0,-0.932014,0.735447,0.155707,-1.077862,0.325228,0.488900,-0.679146,1.086895


In [60]:
### Preparing for train and test split

map={'Yes':1,'No':0}

targets=df['Attrition'].apply(lambda x: map[x])
print(targets.value_counts())

### train test split ###
df_indep=pd.concat([df_num_scaled,df_obj],axis=1)

0    1233
1     237
Name: Attrition, dtype: int64


In [66]:
X_train,X_test,y_train,y_test=train_test_split(df_indep,targets,random_state=54,test_size=0.2)
print(f'''
Target rate: {100*np.mean(targets):.2f}%
Train split (X & Y): {X_train.shape} & {y_train.shape} & target_prop: {100*(np.mean(y_train)):.2f}%
Test split (X & Y): {X_test.shape} & {y_test.shape} & target_prop: {100*(np.mean(y_test)):.2f}%

''')


Target rate: 16.12%
Train split (X & Y): (1176, 47) & (1176,) & target_prop: 15.99%
Test split (X & Y): (294, 47) & (294,) & target_prop: 16.67%




Build a adaboost from scratch

In [41]:
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.alphas = []  # Weights for each weak classifier
        self.models = []  # Store weak classifiers

    def fit(self, X, y):
        """ Train AdaBoost model """
        n_samples, n_features = X.shape
        w = np.ones(n_samples) / n_samples  # Initialize weights equally

        for _ in range(self.n_estimators):
            model = DecisionTreeClassifier(max_depth=1)  # Weak learner (stump)
            model.fit(X, y, sample_weight=w)
            y_pred = model.predict(X)

            # Compute weighted error
            err = np.sum(w * (y_pred != y)) / np.sum(w)

            # Compute model weight (alpha)
            alpha = 0.5 * np.log((1 - err) / (err + 1e-10))  # Avoid division by zero
            self.alphas.append(alpha)
            self.models.append(model)

            # Update sample weights
            w *= np.exp(-alpha * y * y_pred)
            w /= np.sum(w)  # Normalize weights

    def predict(self, X):
        """ Predict using the AdaBoost model """
        final_pred = np.zeros(X.shape[0])

        for alpha, model in zip(self.alphas, self.models):
            final_pred += alpha * model.predict(X)  # Weighted sum of weak classifiers

        return np.sign(final_pred)  # Return final classification (-1 or +1)

In [70]:
adaboost=AdaBoostClassifier(n_estimators=100,learning_rate=1,random_state=42)
adaboost_udf=AdaBoost(n_estimators=100)

In [72]:
### Fitting the model
adaboost.fit(X_train,y_train)
adaboost_udf.fit(X_train,y_train)

In [83]:
### Predicting model ###
y_pred_test=adaboost.predict(X_test)
y_pred_train=adaboost.predict(X_train)
len(adaboost.predict(X_test))
print(f'Train confusion matrix & accuracy:{100*accuracy_score(y_train,y_pred_train):.2f}%')
print(confusion_matrix(y_train,y_pred_train))
print(f'Test confusion matrix & accuracy:{100*accuracy_score(y_test,y_pred_test):.2f}%')
print(confusion_matrix(y_test,y_pred_test))

Train confusion matrix & accuracy:92.01%
[[966  22]
 [ 72 116]]
Test confusion matrix & accuracy:87.07%
[[234  11]
 [ 27  22]]


In [86]:
udf_y_pred=adaboost_udf.predict(X_test)
confusion_matrix(y_test,udf_y_pred)

array([[245,   0],
       [ 49,   0]], dtype=int64)

### AdaBoost Regression Problem

Predicting the price of the house from the given data

In [94]:
### Importing dataset
reg_df=pd.read_csv('kc_house_data.csv')
reg_df.shape

(21613, 21)

In [93]:
reg_df.isnull().sum()
reg_df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [97]:
reg_df.select_dtypes(include=['float64','int64'])

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,221900.0,3,1.00,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,180000.0,2,1.00,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,604000.0,4,3.00,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,510000.0,3,2.00,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,360000.0,3,2.50,1530,1131,3.0,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,400000.0,4,2.50,2310,5813,2.0,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,402101.0,2,0.75,1020,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,400000.0,3,2.50,1600,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287
