## Software release machine learning model using XGBoost

* The model is a binary classifier that predicts whether a given software release will be high risk (i.e. more bugs) or low risk (fewer bugs).
* The machine learning model can help release managers and project managers plan their software releases much better based on the resources available.

In [None]:
# import all the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBClassifier
import warnings
# Ignore every warning category from here on so the cell output stays compact.
# NOTE(review): this also hides deprecation warnings raised by pandas, xgboost
# and scikit-learn — confirm that blanket suppression is intended.
warnings.filterwarnings("ignore")

In [None]:
# Read the training spreadsheet into a pandas DataFrame.
# The path is relative, so the notebook must be run from a directory where
# ../input/TrainingData.xlsx resolves.
dataset = pd.read_excel('../input/TrainingData.xlsx')
# Last expression of the cell: displays the column labels as loaded.
dataset.columns

In [None]:
# Remove the columns the author identified as uninformative (columns of zeros
# and id columns). They are picked by position in the current frame and the
# frame is modified in place, so re-running this cell drops further columns.
cols = [0, 1, 2, 3, 13, 15, 16, 19, 20]
dataset.drop(columns=dataset.columns[cols], inplace=True)

In [None]:
# explore the dataset
# Last expression of the cell: displays the first rows of the frame
# (head() with no argument returns five rows).
dataset.head()

In [None]:
# Display the column labels of the frame; the positional slices that build
# X and y index into this list.
dataset.columns

In [None]:
#features and target label
# Features: the columns at positions 0-11 (12 columns), as a NumPy array.
X = dataset.iloc[:,0:12].values
# Target: the column at position 13, as a NumPy array.
# NOTE(review): position 12 is used as neither feature nor target — confirm
# this is intentional and that position 13 really is the risk label.
y = dataset.iloc[:,13].values

In [None]:
#Splitting the data into train and test
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible.
# NOTE(review): no stratify= argument is passed, so class proportions may differ
# between the two splits — consider stratify=y if the label is imbalanced.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [None]:
# Sanity-check the split by printing the shape of each array, one per line:
# train features, train labels, test features, test labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Standardise the features, since some of them are in the high thousands.
# The scaler is fitted on the training split only and then applied unchanged
# to both splits, so the test rows do not influence the scaling statistics.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:

# Gradient-boosted tree classifier for the binary risk label.
from xgboost.sklearn import XGBClassifier

model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,            # starting value; modelfit() overwrites it when useTrainCV is on
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,                # fraction of rows sampled for each tree
    colsample_bytree=0.8,         # fraction of columns sampled for each tree
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
# Initial fit with all 1000 estimators.
# NOTE(review): no eval_set is passed, so eval_metric probably has no effect
# here; newer xgboost releases also expect eval_metric on the constructor
# rather than on fit() — confirm against the installed version.
model.fit(X_train, y_train, eval_metric='auc')

In [None]:
# create a function to do cross validation and print the accuracy score.
def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune ``n_estimators`` by cross-validation, refit, and report.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with the estimator's own
    booster parameters and AUC as the metric; the number of rows in the
    returned result frame is then written back to ``alg`` as ``n_estimators``.
    The estimator is afterwards fitted on ``(X, y)`` and its accuracy and AUC
    on that same data are printed.

    Args:
        alg: An xgboost scikit-learn style estimator. It must provide
            ``get_xgb_params``, ``get_params``, ``set_params``, ``fit``,
            ``predict`` and ``predict_proba``. It is modified in place.
        X: Feature matrix used for cross-validation, fitting and the report.
        y: Binary target aligned with ``X``.
        useTrainCV: Whether to run the cross-validation step first.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        None. The report is printed; the scores describe the training data
        only, not held-out performance.
    """
    # Imported locally: the notebook's import cell only brings in
    # XGBClassifier, so neither `metrics` nor `xgb` exists as a global name
    # and the original body failed with NameError.
    from sklearn import metrics

    if useTrainCV:
        import xgboost as xgb

        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X, label=y)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round that was kept, so the row count is the
        # number of estimators to use for the final fit.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # NOTE(review): newer xgboost releases expect eval_metric on the
    # constructor instead of fit() — confirm against the installed version.
    alg.fit(X, y, eval_metric='auc')

    # Predict on the same data that was used for fitting.
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]

    # Print model report.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))

In [None]:
# fitting the model
# Runs modelfit() with its defaults on the training split: cross-validated
# selection of n_estimators, an in-place refit of `model`, and a printed
# accuracy / AUC report computed on the training data.
modelfit(model, X_train, y_train)

In [None]:
# Predict the target for the test split and check the accuracy.
# `metrics` is imported here because no earlier cell binds that name at
# notebook level, so the original call failed with NameError.
from sklearn import metrics

y_pred = model.predict(X_test)
# scikit-learn's argument order is (y_true, y_pred); accuracy is symmetric in
# its two arguments, so the displayed value is the same either way.
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the test split; in scikit-learn's layout the rows are
# the true classes and the columns are the predicted classes.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm