## Read data

In [29]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
%matplotlib notebook

def data_wrangle(data_path):
    names_list=["sample_code","clump thickness","uniformity of cell size",
                "uniformity of cell shape","marginal adhesion",
                "single epithelial cell size","bare nuclei","bland chromatin",
                "normal nucleoli","mitoses","class"]
    orignal_data=pd.read_csv(data_path,names=names_list)
    data=orignal_data.dropna()
    data['class'] = data['class'].map({2:1,4:0})
    data=data.replace("?",np.nan)
    imp= SimpleImputer(missing_values=np.nan,strategy='mean')
    data=imp.fit_transform(data)
    new_data=pd.DataFrame(data,columns=names_list)
    new_data.drop(["sample_code"],axis=1,inplace=True)
    return new_data

In [30]:
data=data_wrangle("breast-cancer-wisconsin.data")
all_cols = list(data.columns)       # .columns gives columns names in data 
print(all_cols)

['clump thickness', 'uniformity of cell size', 'uniformity of cell shape', 'marginal adhesion', 'single epithelial cell size', 'bare nuclei', 'bland chromatin', 'normal nucleoli', 'mitoses', 'class']


In [31]:
feature_cols=all_cols[:-1]
y = data["class"]                      # M or B 
x = data[feature_cols]
x.head()

Unnamed: 0,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0


## Data Split and Normalization

In [32]:
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,GridSearchCV
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
std_x_train=scaler.fit_transform(x_train)
std_x_test=scaler.fit_transform(x_test)

std_x_train=pd.DataFrame(std_x_train,columns=x_train.columns)
std_x_test=pd.DataFrame(std_x_test,columns=x_test.columns)

## Build Model

In [35]:
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,GradientBoostingClassifier

# metrics
from sklearn.metrics import classification_report

* Build XGBoost model

In [36]:
xgboost = XGBClassifier()
xg_params = {
    # Parameters that we are going to tune.
    'max_depth':[20,30,40],
    'n_estimators' : [50,40,60],
    'learning_rate':[0.05,0.01],
    'min_child_weight': [1,2,3],
    'subsample': [1],
    'colsample_bytree': [1],
}
xg_clf = GridSearchCV(xgboost, xg_params)
xg_clf.fit(std_x_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_to_...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     n

In [38]:
print(classification_report(y_test, xg_clf.predict(std_x_test)))

              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94        67
         1.0       0.97      0.98      0.97       143

    accuracy                           0.96       210
   macro avg       0.96      0.95      0.96       210
weighted avg       0.96      0.96      0.96       210

