In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
import sklearn.model_selection as sklms
import sklearn.neighbors as sklnb
import matplotlib.pyplot as plt

import sklearn.datasets
import sklearn.linear_model
import random
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# evaluate gradient boosting algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
#create confusion matrix
def performace(y_test,y_pred):
    m1 = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print("confusionmatrix:")
    print( m1 )
    print(f"accuracy:{acc}")
    print(f"precision:{pre}")
    print(f"recall:{rec}")
    print(f"F1-score:{f1}")

In [3]:
# read the data
df = pd.read_csv('C:/Users/36055/OneDrive/桌面/SML Slides/train.csv')
df.head()

Unnamed: 0,Number words female,Total words,Number of words lead,Difference in words lead and co-lead,Number of male actors,Year,Number of female actors,Number words male,Gross,Mean Age Male,Mean Age Female,Age Lead,Age Co-Lead,Lead
0,1512,6394,2251.0,343,2,1995,5,2631,142.0,51.5,42.333333,46.0,65.0,Female
1,1524,8780,2020.0,1219,9,2001,4,5236,37.0,39.125,29.333333,58.0,34.0,Male
2,155,4176,942.0,787,7,1968,1,3079,376.0,42.5,37.0,46.0,37.0,Male
3,1073,9855,3440.0,2623,12,2002,2,5342,19.0,35.222222,21.5,33.0,23.0,Male
4,1317,7688,3835.0,3149,8,1988,4,2536,40.0,45.25,45.0,36.0,39.0,Male


In [4]:
df["Lead"]. replace("Female", 1, inplace = True )
df["Lead"]. replace ("Male", 0, inplace = True )

In [5]:
# baseline model validation performance
X_train, X_val, y_train, y_val = train_test_split(df.drop(columns=['Lead']), df['Lead'], test_size = .2)
model=GradientBoostingClassifier()
model.fit(X_train,y_train)
prediction=model.predict(X_val)
performace(y_val,prediction)

confusionmatrix:
[[152   8]
 [ 22  26]]
accuracy:0.8557692307692307
precision:0.7647058823529411
recall:0.5416666666666666
F1-score:0.6341463414634146


# Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# define the model with default hyperparameters
model = GradientBoostingClassifier()

# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500, 1000]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1,0.2,0.5,1.0]
grid['max_features']=[3,5,7,8]
grid['subsample'] = [0.5,0.7,1]
grid['max_depth'] = [3,5,7,8]

In [11]:
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(features, df['Lead'])
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.886128 using {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 8, 'n_estimators': 500, 'subsample': 1}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 10, 'subsample': 0.5}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 10, 'subsample': 0.7}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 10, 'subsample': 1}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 50, 'subsample': 0.5}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 50, 'subsample': 0.7}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 50, 'subsample': 1}
0.755536 (0.004570) with: {'learning_rate': 0.0001, 'max_depth': 3, 'max_features': 3, 'n_estimators': 100, 'subsample': 0.5}
0.755536 (0.

In [12]:
#validation performance
X_train, X_val, y_train, y_val = train_test_split(df.drop(columns=['Lead']), df['Lead'], test_size = .2)
model=GradientBoostingClassifier(learning_rate= 0.1, max_depth= 5, max_features=8, n_estimators=500, subsample= 1)
model.fit(X_train,y_train)
prediction=model.predict(X_val)
performace(y_val,prediction)

confusionmatrix:
[[157   7]
 [ 14  30]]
accuracy:0.8990384615384616
precision:0.8108108108108109
recall:0.6818181818181818
F1-score:0.7407407407407407
