### Binary Classification
[bosch](https://www.kaggle.com/c/bosch-production-line-performance/data)

In addition to being one of the largest datasets (in terms of number of features) ever hosted on Kaggle,
the ground truth for this competition is highly imbalanced. Together, these two attributes are expected 
to make this a challenging problem.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb



## Useful links in Kaggle

### [Production map](https://www.kaggle.com/c/bosch-production-line-performance/forums/t/23042/production-map)

### [Expeditive exploration+models on data](https://www.kaggle.com/c/bosch-production-line-performance/forums/t/22909/expeditive-exploration-models-on-data)

### [Visualization of data exploration](https://www.kaggle.com/dollardollar/bosch-production-line-performance/eda-of-important-features/comments)

### [H20](https://www.kaggle.com/c/bosch-production-line-performance/forums/t/23146/xgboost-simple-starter-auc-0-712)

### [DataSize](https://www.kaggle.com/c/bosch-production-line-performance/forums/t/22908/datasets-size-uncompressed-14-3gb)

### [Turn categorical into numeric](https://www.kaggle.com/c/bosch-production-line-performance/forums/t/23290/turn-categorical-into-numeric)

## [Magic feature](https://www.kaggle.com/c/bosch-production-line-performance/forums/t/24065/the-magical-feature-from-lb-0-3-to-0-4/137761#post137761)

In [None]:
# Class balance: count of positive samples divided by count of negative samples.
n_positive, n_negative = 6879, 1176868
ratio = n_positive / n_negative

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# The 20 numeric feature columns explored in the plots below, kept in this order.
feature_names = """
    L3_S38_F3960 L3_S33_F3865 L3_S38_F3956 L3_S33_F3857
    L3_S29_F3321 L1_S24_F1846 L3_S32_F3850 L3_S29_F3354
    L3_S29_F3324 L3_S35_F3889 L0_S1_F28    L1_S24_F1844
    L3_S29_F3376 L0_S0_F22    L3_S33_F3859 L3_S38_F3952
    L3_S30_F3754 L2_S26_F3113 L3_S30_F3759 L0_S5_F114
""".split()

In [None]:
# Peek at a single row only to learn the column layout of the numeric file.
numeric_cols = pd.read_csv("bosch/train_numeric.csv", nrows = 1).columns.values

# Positional index of every selected feature within that header.
imp_idxs = [np.argwhere(numeric_cols == name)[0][0] for name in feature_names]

# Load just the first column (used as the index), the last column and the
# selected features, instead of the whole file.
train = pd.read_csv(
    "bosch/train_numeric.csv",
    usecols = [0, len(numeric_cols) - 1] + imp_idxs,
    header = 0,
    index_col = 0,
)

# Put the columns in the order of feature_names, with the label last.
train = train[feature_names + ['Response']]

In [None]:
# Split the rows by label and drop the trailing column (Response) from each part.
X_neg = train.loc[train['Response'] == 0].iloc[:, :-1]
X_pos = train.loc[train['Response'] == 1].iloc[:, :-1]

In [None]:
BATCH_SIZE = 5

# Reshape the features into long format ('Response', 'variable', 'value'),
# BATCH_SIZE features at a time, so each batch can be drawn on its own axis.
# The column selection is built from plain Python lists: Index.append() only
# accepts Index objects and raises TypeError when handed a NumPy array.
train_batch = [
    pd.melt(
        train[feature_names[batch: batch + BATCH_SIZE] + ['Response']],
        id_vars = 'Response',
        value_vars = feature_names[batch: batch + BATCH_SIZE],
    )
    for batch in range(0, train.shape[1] - 1, BATCH_SIZE)
]

In [None]:
FIGSIZE = (12,16)

# One subplot row per feature batch.
_, axs = plt.subplots(len(train_batch), figsize = FIGSIZE)
plt.suptitle('Univariate distributions')

# Split violins: the two halves of each violin are the two Response classes.
violin_options = dict(x = 'variable', y = 'value', hue = 'Response', split = True)
for data, ax in zip(train_batch, axs):
    sns.violinplot(data = data, ax = ax, **violin_options)

In [None]:
# Share of non-missing values per feature, computed separately for each class.
neg_share = (X_neg.count() / X_neg.shape[0]).to_frame('negative samples')
pos_share = (X_pos.count() / X_pos.shape[0]).to_frame('positive samples')
non_missing = pd.DataFrame(pd.concat([neg_share, pos_share], axis = 1))

# Order the bars by how complete each feature is among the negative samples.
non_missing_sort = non_missing.sort_values(by = ['negative samples'])
non_missing_sort.plot.barh(figsize = FIGSIZE, title = 'Proportion of non-missing values')
plt.gca().invert_yaxis()

In [None]:
MIN_PERIODS = 100
FIGSIZE = (13,4)
_, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = FIGSIZE)

# Mask the diagonal and upper triangle so each feature pair is drawn once.
n_features = X_pos.shape[1]
triang_mask = np.triu(np.ones((n_features, n_features)))

# Pairwise feature correlations within each class, side by side.
neg_corr = X_neg.corr(min_periods = MIN_PERIODS)
ax1.set_title('Negative Class')
sns.heatmap(neg_corr, ax = ax1, mask = triang_mask, square=True)

pos_corr = X_pos.corr(min_periods = MIN_PERIODS)
ax2.set_title('Positive Class')
sns.heatmap(pos_corr, ax = ax2, mask = triang_mask, square=True)



In [None]:
# Positive-class correlations minus negative-class correlations, pair by pair.
corr_pos = X_pos.corr(min_periods = MIN_PERIODS)
corr_neg = X_neg.corr(min_periods = MIN_PERIODS)
sns.heatmap(corr_pos - corr_neg, square=True, mask = triang_mask)


In [None]:
# Boolean frames flagging the missing entries of each class.
nan_pos = np.isnan(X_pos)
nan_neg = np.isnan(X_neg)

# Mask the diagonal and upper triangle so each feature pair is drawn once.
n_features = X_pos.shape[1]
triang_mask = np.triu(np.ones((n_features, n_features)))

MIN_PERIODS = 100
FIGSIZE = (13,4)
_, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = FIGSIZE)

# Correlation between the missing-value indicators of every feature pair.
ax1.set_title('Negative Class')
sns.heatmap(nan_neg.corr(), ax = ax1, mask = triang_mask, square=True)

ax2.set_title('Positive Class')
sns.heatmap(nan_pos.corr(), ax = ax2, mask = triang_mask, square=True)

In [None]:
# Negative-class minus positive-class correlation of the missing-value indicators.
nan_corr_gap = nan_neg.corr() - nan_pos.corr()
sns.heatmap(nan_corr_gap, square=True, mask = triang_mask)

## Train all numeric features

In [2]:
# Load the complete numeric training table (every row and column) into memory.
# NOTE(review): absolute, machine-specific path; other cells read the same file
# through the relative path "bosch/train_numeric.csv".
train = pd.read_csv("/Users/lidong/Downloads/bosch/train_numeric.csv")

In [3]:
# Dimensions of the numeric training frame as (rows, columns).
train.shape

(1183747, 970)

In [6]:
# Summary statistics of one numeric feature; `count` only counts non-missing
# values, so comparing it with the row count shows how sparse the column is.
train['L3_S47_F4163'].describe()

count    59955.000000
mean        -0.000168
std          0.103590
min         -0.565000
25%         -0.067500
50%          0.001000
75%          0.072000
max          0.371000
Name: L3_S47_F4163, dtype: float64

In [None]:
# Load an auxiliary table that is merged with the numeric features on 'Id' below.
# NOTE(review): absolute, machine-specific path, and nothing in this notebook
# shows how train_test.csv was produced — document its origin.
train_test = pd.read_csv("/Users/lidong/Downloads/bosch/train_test.csv")

In [None]:
# Dimensions of the auxiliary table as (rows, columns).
train_test.shape

In [None]:
# Join the numeric features with the auxiliary table on 'Id' (pd.merge defaults
# to an inner join, so only Ids present in both frames are kept).
# The training cell reads 'Response_x' / 'Response_y', which presumably means both
# inputs carry a 'Response' column that merge disambiguates with suffixes — TODO confirm.
trainp = pd.merge(train, train_test, on='Id')

In [None]:
# Dimensions of the merged frame; compare with train.shape to see what the join kept.
trainp.shape

In [None]:
# Preview the first rows of the merged frame.
trainp.head()

In [None]:
# The merged frame `trainp` replaces both inputs; drop the references and force
# a garbage-collection pass so their memory can be reclaimed.
del train
del train_test
import gc
gc.collect()

### Train

In [None]:
import time
import xgboost as xgb

# Label vector, and every remaining column (both Response copies excluded) as features.
Y = trainp['Response_x']
listp = [col for col in trainp.columns if col != "Response_x" and col != "Response_y"]
X = trainp[listp]

# Fit a very small boosted model and report the wall-clock training time in minutes.
startt = time.time()
gbm = xgb.XGBClassifier(learning_rate=0.5, n_estimators=2, max_depth=2)
gbm.fit(X, Y)
endt = time.time()
print((endt-startt)/60)



In [None]:
# Release the training data before the test set is loaded.
# NOTE(review): the "Predict" cell further down calls gbm.predict(X) and reads
# Y.iloc[i]; it fails with NameError if it is run after this cell.
del trainp
del Y
del X

In [None]:
# Force a garbage-collection pass after the deletions above.
import gc
gc.collect()

In [None]:
# Load the complete numeric test table.
# NOTE(review): absolute, machine-specific path.
test = pd.read_csv("/Users/lidong/Downloads/bosch/test_numeric.csv")

In [None]:
# Select the feature columns the model was fitted on (listp comes from the merged
# training frame).
# NOTE(review): raises KeyError if any of those columns is missing from
# test_numeric.csv, e.g. columns that only train_test.csv contributed — verify.
testX = test[listp]

In [None]:
import time
import math
# Predict class labels for the test set and report the elapsed time in minutes.
startt = time.time()
predictions = gbm.predict(testX)
endt = time.time()
print((endt-startt)/60)

In [None]:
# Two-column frame pairing each test Id with its predicted Response.
result = pd.DataFrame({'Id':test['Id'],'Response':predictions})

In [None]:
# Write the predictions to CSV without the DataFrame index column.
result.to_csv('submitResponse.csv',index=False)

In [None]:
# Read the file back to check what was actually written.
resultp = pd.read_csv('submitResponse.csv')

In [None]:
# Preview the first rows of the re-loaded submission.
resultp.head()

In [None]:
## Log of earlier runs: hyper-parameters, reported MCC and training time.
## NOTE(review): a Matthews correlation coefficient always lies in [-1, 1]; every
## value recorded below is greater than 1, so the computation that produced them
## should be re-checked before these numbers are compared or reported.
## max_depth=3, n_estimators=30, learning_rate=0.05, MCC:  2.3118949470266887, time: 13.2min
## max_depth=5, n_estimators=30, learning_rate=0.05, MCC:  2.6920021339353273, time: 19.7min
## max_depth=6, n_estimators=30, learning_rate=0.05, MCC:  2.84,               time: 23.3min
## max_depth=7, n_estimators=30, learning_rate=0.05, MCC:  3.13                time: 27.2min

## Feature Importance

In [None]:
# Plot the per-feature importance scores of the fitted booster.
xgb.plot_importance(gbm)

### Predict

In [None]:
import time
import math

# Evaluate the fitted model on its own training data with the Matthews
# correlation coefficient (MCC):
#
#     MCC = (TP*TN - FP*FN) / sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))
#
# NOTE: this cell needs `gbm`, `X` and `Y` from the training cell, so it has to
# run before the cell that deletes `X` and `Y`.
startt = time.time()
predictions = gbm.predict(X)

# Confusion-matrix counts, computed on whole arrays instead of a Python loop over
# every row. A prediction of 0 is a predicted negative, anything else a predicted
# positive; a prediction is wrong when it differs from the label.
predicted_negative = np.asarray(predictions) == 0
wrong = np.asarray(predictions) != np.asarray(Y)

# Convert to Python ints: the product of four counts of this size can overflow
# a fixed-width NumPy integer.
FN = int(np.count_nonzero(predicted_negative & wrong))
TN = int(np.count_nonzero(predicted_negative & ~wrong))
FP = int(np.count_nonzero(~predicted_negative & wrong))
TP = int(np.count_nonzero(~predicted_negative & ~wrong))

endt = time.time()
print((endt-startt)/60)
sumup=TP*TN-FP*FN
# The third factor is (TN+FP); the previous (FN+FP) made the denominator too
# small and produced "MCC" values greater than 1.
sumdown=(TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)*1.0
# When any margin of the confusion matrix is empty (for example the model
# predicts a single class) the denominator is 0; MCC is then reported as 0.
mcc = sumup/math.sqrt(sumdown) if sumdown else 0.0
print("MCC: ",mcc)

## Train all categorical features

In [None]:
# IPython help lookup (`?` suffix): displays the pd.read_csv docstring. Not valid plain Python.
pd.read_csv?

In [None]:
# IPython help lookup (`?` suffix), repeated from the previous cell.
pd.read_csv?

In [None]:
# Load the categorical training table, parsing every column as float.
# NOTE(review): this assumes every non-missing cell is a numeric string; read_csv
# raises if a value cannot be converted — TODO confirm against the raw file.
# This also rebinds `train`, which earlier cells used for the numeric table.
train = pd.read_csv("bosch/train_categorical.csv",dtype=float)

In [None]:
# Preview the first rows of the categorical frame.
train.head()

In [None]:
# Replace every missing value with 0 (rebinds `train` to the filled copy).
train = train.fillna(0)

In [None]:
# IPython help lookup (`?` suffix): displays the pd.read_csv docstring. Not valid plain Python.
pd.read_csv?

In [None]:
# The file was parsed with dtype=float, which also made 'Id' a float column;
# convert it back to integers.
train['Id'] = train['Id'].astype(int)

In [None]:
# The labels live in the numeric file; read only its 'Id' and 'Response' columns.
trainl = pd.read_csv("bosch/train_numeric.csv",usecols=['Response','Id'])

In [None]:
# Label vector for the categorical experiments.
# NOTE(review): it is paired with `train` purely by row position below; this
# assumes both files list the same Ids in the same order — TODO confirm.
Y=trainl['Response']

In [None]:
# Dimensions of the categorical frame as (rows, columns).
train.shape

In [None]:
# Sum of the labels in rows 500-599, the slice used by the quick fits below
# (with 0/1 labels this is the number of positive samples in the slice).
sum(Y[500:600])

In [None]:
from sklearn import tree
from sklearn import linear_model

In [None]:
# IPython help lookup (`?` suffix): displays the DecisionTreeClassifier docstring.
tree.DecisionTreeClassifier?

In [None]:
import time

# Feature columns: everything except the identifier (and a label column, if present).
listp = [col for col in train.columns if col != "Response" and col != "Id"]
#train[listp]

startt = time.time()
# Alternative that was tried on the full data:
# clf = tree.DecisionTreeClassifier(max_depth=3)
# clf = clf.fit(train[listp], Y)

# Quick fit of a linear SGD classifier on rows 500-599 only, timed in minutes.
sample_rows = slice(500, 600)
clf = linear_model.SGDClassifier()
clf.fit(train[listp].iloc[sample_rows], Y[sample_rows])

endt = time.time()
print((endt-startt)/60)

In [None]:
import time
import xgboost as xgb

# Quick fit of a tiny boosted model on rows 500-599 only, timed in minutes.
startt = time.time()
gbm = xgb.XGBClassifier(learning_rate=0.1, n_estimators=3, max_depth=3)
gbm.fit(train[listp].iloc[500:600], Y[500:600])
endt = time.time()
print((endt-startt)/60)

## Get positive samples

In [None]:
# Presumably the column counts of the three training files (numeric, categorical,
# date); 970 matches the column count shown by train.shape for the numeric file.
# The other two are not verified anywhere in this notebook — TODO confirm.
numericnum=970
categorical=2140
date=1156

In [None]:
import os
import warnings
# Silences every warning for the rest of the session, not only those raised here.
warnings.filterwarnings('ignore')

def get_positive(chunksize, data_dir='bosch'):
    """Collect every positive sample (Response == 1) with all of its features.

    The numeric, categorical and date training files are read in lock-step,
    `chunksize` rows at a time, so the full tables never have to be in memory
    together. For each chunk the three parts are placed side by side (the
    duplicate 'Id' columns of the categorical and date parts are dropped) and
    only the rows whose Response equals 1 are kept.

    Parameters
    ----------
    chunksize : int
        Number of rows read from each file per step.
    data_dir : str, optional
        Directory containing train_numeric.csv, train_categorical.csv and
        train_date.csv. Defaults to 'bosch'.

    Returns
    -------
    pandas.DataFrame
        All positive rows in file order, or an empty DataFrame when the files
        yield no chunk at all.
    """
    file_names = ('train_numeric.csv', 'train_categorical.csv', 'train_date.csv')
    readers = [pd.read_csv(os.path.join(data_dir, name), chunksize=chunksize)
               for name in file_names]

    # Gather the per-chunk results and concatenate once at the end; growing the
    # frame with concat inside the loop re-copies all previous rows every time.
    positive_chunks = []
    n_positive = 0
    try:
        for numeric, categorical, date in zip(*readers):
            data = pd.concat([numeric,
                              categorical.drop('Id', axis=1),
                              date.drop('Id', axis=1)], axis=1)
            positive_data = data[data.Response == 1]
            positive_chunks.append(positive_data)
            n_positive += positive_data.shape[0]
            # Progress: shape of this chunk's positives, then of the running total.
            print(positive_data.shape, (n_positive, positive_data.shape[1]))
    finally:
        # Release the underlying file handles even when a chunk fails to parse.
        for reader in readers:
            reader.close()

    if not positive_chunks:
        return pd.DataFrame()
    return pd.concat(positive_chunks)

# Collect all positive samples, reading the three training files 10,000 rows at a time.
positive = get_positive(chunksize=10000)