<a href="https://colab.research.google.com/github/Bulat27/ML_Exercise_Boosting/blob/master/Boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the datasets

In [24]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [2]:
# Raw GitHub locations of the training and test CSV files.
url_train, url_test = (
    'https://raw.githubusercontent.com/Bulat27/ML_Exercise_Boosting/master/train.csv',
    'https://raw.githubusercontent.com/Bulat27/ML_Exercise_Boosting/master/test.csv',
)

In [6]:
# Read both CSVs into DataFrames (training file first, then test file).
# NOTE(review): the head() outputs below show an "Unnamed: 0" column, which
# suggests the files were saved with their index -- confirm whether it should
# be read as a regular column.
df_train, df_test = (pd.read_csv(csv_url) for csv_url in (url_train, url_test))

In [7]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Data preprocessing

In [9]:
# Separate the label from the features: pop() returns the "Survived" column
# and removes it from df_train in the same step.
# NOTE(review): this mutates df_train, so the cell cannot be re-run without
# reloading the data first.
y_train = df_train.pop("Survived")

In [11]:
# Stack the training rows on top of the test rows so that the column drops and
# the one-hot encoding that follow are applied identically to both sets.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and likewise keeps the original row
# labels of each frame.
df_total = pd.concat([df_train, df_test])

In [13]:
# Columns that are excluded from the feature set.
# NOTE(review): "PassengerId" and "Unnamed: 0" (both visible in the head()
# outputs) are not listed here, so they stay in df_total and end up as model
# features -- confirm that this is intended.
unnecessary_columns = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
# Rebind df_total to the reduced frame instead of dropping in place.
df_total = df_total.drop(columns=unnecessary_columns)

In [14]:
# One-hot encode "Sex". dtype=float is requested explicitly because the default
# dtype of the dummy columns depends on the pandas version (uint8 before 2.0,
# bool from 2.0 on); bool dummies next to float columns can make `.values`
# return an object array instead of a numeric one.
df_total = pd.get_dummies(df_total, columns=["Sex"], dtype=float)
# Replace any remaining missing values with 0.0, rebinding the name rather
# than mutating the frame in place.
df_total = df_total.fillna(value=0.0)

In [15]:
# Undo the earlier train/test concatenation. The split point is taken from
# df_train (only a column was removed from it, so it still holds every training
# row) instead of a hard-coded 891, so the cell keeps working if the training
# file changes size.
n_train_rows = len(df_train)
feature_matrix = df_total.to_numpy()
X_train = feature_matrix[:n_train_rows]
X_test = feature_matrix[n_train_rows:]

In [16]:
# Fit the min-max scaler on the training rows only, then rescale both
# matrices with that same fitted scaler.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Splitting the dataset

In [17]:
# Hold out 27% of the training rows for validation; the fixed seed makes the
# shuffle reproducible.
# NOTE(review): X_train / y_train are rebound to the reduced training split,
# so re-running this cell on its own splits the already-split data again.
state, test_size = 27, 0.27

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=state,
    test_size=test_size,
)

# Testing different learning rates

In [22]:
# Learning rates to compare; every other hyperparameter is held fixed.
learning_rates = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in learning_rates:
    # fit() returns the estimator itself, so construction and training chain.
    gb_clf = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=20,
        max_depth=2,
        max_features=2,
        random_state=0,
    ).fit(X_train, y_train)

    train_accuracy = gb_clf.score(X_train, y_train)
    val_accuracy = gb_clf.score(X_val, y_val)

    print("Learning rate: ", learning_rate)
    print("Accuracy (training): {0:.3f}".format(train_accuracy))
    print("Accuracy (validation): {0:.3f}".format(val_accuracy))

Learning rate:  0.05
Accuracy (training): 0.789
Accuracy (validation): 0.747
Learning rate:  0.075
Accuracy (training): 0.795
Accuracy (validation): 0.759
Learning rate:  0.1
Accuracy (training): 0.803
Accuracy (validation): 0.759
Learning rate:  0.25
Accuracy (training): 0.826
Accuracy (validation): 0.747
Learning rate:  0.5
Accuracy (training): 0.842
Accuracy (validation): 0.772
Learning rate:  0.75
Accuracy (training): 0.863
Accuracy (validation): 0.751
Learning rate:  1
Accuracy (training): 0.855
Accuracy (validation): 0.793


# Choosing the model with the best performance

In [23]:
# Refit with learning_rate=1, the setting with the highest validation accuracy
# among the learning rates tried, and inspect its errors on the validation split.
gb_clf2 = GradientBoostingClassifier(
    learning_rate=1,
    n_estimators=20,
    max_depth=2,
    max_features=2,
    random_state=0,
)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

# sep="\n" puts each heading on its own line above the corresponding result.
print("Confusion Matrix:", confusion_matrix(y_val, predictions), sep="\n")

print("Classification Report", classification_report(y_val, predictions), sep="\n")

Confusion Matrix:
[[128  21]
 [ 29  63]]
Classification Report
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       149
           1       0.75      0.68      0.72        92

    accuracy                           0.79       241
   macro avg       0.78      0.77      0.78       241
weighted avg       0.79      0.79      0.79       241



# Testing the XGBoost Classifier

In [26]:
# Train an XGBoost classifier with its default hyperparameters on the same
# training split; the fit call stays the last expression so the fitted
# estimator is displayed as the cell output.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X=X_train, y=y_train)

XGBClassifier()

In [27]:
# Score the fitted XGBoost model on the held-out validation split.
xgb_score = xgb_classifier.score(X=X_val, y=y_val)

In [28]:
# Display the XGBoost validation score computed in the previous cell.
xgb_score

0.7966804979253111