In [1]:
import pandas as pd
from math import sqrt, fabs, exp
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn import ensemble
import numpy as np

In [2]:
# Read the training features, the training labels and the features to predict on.
dfX = pd.read_csv("X_train.csv")
dfY = pd.read_csv("y_train.csv")
dfTest = pd.read_csv("X_test.csv")

# Keep every column except the first one as a plain NumPy array.
# NOTE(review): the dropped first column is presumably a row id -- confirm against the CSVs.
X = dfX.iloc[:, 1:].to_numpy()
y = dfY.iloc[:, 1:].to_numpy()  # 2-D (n_rows, n_label_columns) because of the column slice
XPred = dfTest.iloc[:, 1:].to_numpy()

# Pipeline for oversampling-undersampling

In [29]:
# Oversample with SMOTE and random undersample for imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from matplotlib import pyplot
from numpy import where

# The labels are loaded as a single-column 2-D array; flatten them to the 1-D
# integer vector that the samplers and Counter expect.
y = np.asarray(y).astype(int).ravel()

# summarize the class distribution before resampling (label -> number of rows)
counter = Counter(y.tolist())
print(counter)

# define pipeline: SMOTE grows classes 0 and 2 to 800 rows each, then class 1 is
# randomly undersampled to 800 rows. Both samplers are seeded so that re-running
# the notebook reproduces the same resampled data set.
over = SMOTE(sampling_strategy={0: 800, 2: 800}, random_state=531)
under = RandomUnderSampler(sampling_strategy={1: 800}, random_state=531)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
# NOTE: this overwrites X and y with the resampled data; the original arrays are
# only recoverable by re-running the data-loading cell.
X, y = pipeline.fit_resample(X, y)

# summarize the new class distribution
counter = Counter(y.tolist())
print(counter)

Counter({0: 800, 1: 800, 2: 800})


## Stratified train/test split (same code as the Boost run that ignores class imbalance)

In [31]:
# Stratified sampling by labels: every class is split 70/30 on its own (with the
# same seed) and the per-class pieces are concatenated, so the train and test
# sets keep the class proportions of the full data set.
nrows = len(X)
featureRows = np.asarray(X)
labels = np.asarray(y).ravel()

# Take the classes from the data instead of hard-coding them; np.unique returns
# them sorted, so the pieces are stacked in ascending label order.
classes = np.unique(labels)

xTrainParts, xTestParts, yTrainParts, yTestParts = [], [], [], []
for iLabel in classes:
    isLabel = labels == iLabel

    # form train and test sets on the segregated subset of examples
    xTrainTemp, xTestTemp, yTrainTemp, yTestTemp = train_test_split(
        featureRows[isLabel], labels[isLabel], test_size=0.30, random_state=531
    )

    # accumulate the remaining data points class by class
    xTrainParts.append(xTrainTemp)
    xTestParts.append(xTestTemp)
    yTrainParts.append(yTrainTemp)
    yTestParts.append(yTestTemp)

xTrain = np.concatenate(xTrainParts, axis=0)
xTest = np.concatenate(xTestParts, axis=0)
yTrain = np.concatenate(yTrainParts, axis=0)
yTest = np.concatenate(yTestParts, axis=0)

## Search for optimal hyperparameters for GBoost

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import DataConversionWarning
import warnings
# Install a process-wide filter that hides every DataConversionWarning from here
# on -- presumably to keep the grid-search output readable (TODO confirm the
# warning source; a column-shaped y is the usual trigger).
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [34]:
# Grid search over the number of trees and the number of features tried per
# split for a gradient-boosted classifier (tree depth, learning rate and
# subsample fraction are held fixed).
randomSeed = 531  # same seed as the train/test split, for reproducible fits

nTreeList = np.arange(50, 1700, 120)
# A tree cannot consider more features than X has columns, so clip the
# candidates to that bound and drop the duplicates the clipping creates.
max_feats = np.unique(np.minimum(np.arange(3, 800, 100), np.shape(X)[1]))
depths = np.array([4])
learnRate = 0.05
subSamp = 1

# NOTE(review): X and y are the SMOTE-resampled data, so synthetic rows can sit
# in both the training and validation folds of the 5-fold CV and best_score_ is
# likely optimistic. The xTrain/xTest split built earlier is not used here.
search_model_params = GridSearchCV(
    ensemble.GradientBoostingClassifier(
        learning_rate=learnRate, subsample=subSamp, random_state=randomSeed
    ),
    {
        'n_estimators': nTreeList,
        'max_depth': depths,
        'max_features': max_feats,
    },
    cv=5,
    return_train_score=False,
    n_jobs=-1,  # the grid is large; evaluate candidates on all available cores
)
search_model_params.fit(X, np.ravel(y))
print(search_model_params.best_params_)
print(search_model_params.best_score_)

# GridSearchCV refits the best configuration on all of X, y; use it to predict.
predictions = search_model_params.predict(XPred)

KeyboardInterrupt: 

### Save predictions

In [None]:
# Write one predicted label per row of XPred to CSV, with columns "id" and "y".
# NOTE(review): "id" is the positional 0..n-1 row index, not the first column of
# X_test.csv -- confirm this matches the expected submission format.
dfPredictions = pd.DataFrame(predictions)
dfPredictions.index.name = "id"
# The original called to_csv on an undefined name `df`; the frame built above is
# the one that has to be saved.
dfPredictions.to_csv("gboostedtask2SMOTE.csv", header=['y'], index=True)