In [1]:
!pip install pydataset



In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import numpy as np
from pydataset import data
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [3]:
#Load dataset
df= pd.read_csv('C:/Users/TechVannah/Downloads/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
#Specify Columns used as features
feature_cols=['Tenure','CreditScore','Balance','NumOfProducts', 'HasCrCard','IsActiveMember','EstimatedSalary','Exited']
# Create feature matrix X and target vector y
X = df[feature_cols]
y = df['Exited']

In [5]:
#Determine the baseline model by comparing different models
crossvalidation=KFold(n_splits=10,shuffle=True,random_state=1)
for depth in range (1,10):
    tree_classifier=tree.DecisionTreeClassifier(max_depth=depth,random_state=1)
    if tree_classifier.fit(X,y).tree_.max_depth<depth:
        break
    score=np.mean(cross_val_score(tree_classifier,X,y,scoring='accuracy',cv=crossvalidation,n_jobs=1))
    print(depth,score)

1 1.0


In [6]:
#Create instance of the gradient boosting classifier
GBC=GradientBoostingClassifier()
#Create a grid for the search
search_grid={'n_estimators' :[500,1000,2000], 'learning_rate':[.001,0.01,.1],
             'max_depth':[1,3,5],'subsample':[.5,.75,1],'random_state':[1]}
search=GridSearchCV(estimator=GBC,param_grid=search_grid,scoring='accuracy',
                    n_jobs=1,cv=crossvalidation)

In [7]:
search

GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
             estimator=GradientBoostingClassifier(), n_jobs=1,
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'max_depth': [1, 3, 5],
                         'n_estimators': [500, 1000, 2000], 'random_state': [1],
                         'subsample': [0.5, 0.75, 1]},
             scoring='accuracy')

In [None]:
search.fit(X,y)
print(search.best_params_)
print(search.best_score_)

In [None]:
#Gradient Boosting Model
ada2=GradientBoostingClassifier(n_estimators=2000,learning_rate=0.01,
                               subsample=.75,max_depth=5,random_state=1)
score=np.mean(cross_val_score(ada2,X,y,scoring='accuracy',cv=crossvalidation,n_jobs=1))
score

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# generate a random dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, random_state=1)

# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# create a gradient boosting classifier
boosting = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=1)

# train the boosting classifier on the train set
boosting.fit(X_train, y_train)

# make predictions on the test set
y_pred = boosting.predict(X_test)

# calculate the accuracy of the boosting classifier
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))