In [1]:
from scipy.stats import randint, uniform
import random as ran

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn_pandas import CategoricalImputer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import f_classif, chi2, SelectKBest, SelectPercentile, SelectFpr, SelectFromModel
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import category_encoders as ce

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn')

In [8]:
from math import ceil

from joblib import dump, load

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score

import numpy as np

In [3]:
# Base rate of the label: sum of the Response column divided by the number of
# rows. Only the Response column is read from the numeric training file.
# `prior` is later passed to XGBoost as `base_score`.
label_frame = pd.read_csv(
    "bosch-production-line-performance/train_numeric.csv",
    usecols=["Response"],
)
prior = label_frame["Response"].sum() / len(label_frame)
del label_frame

In [2]:
# Feature Selection Pipelines
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that narrows its input down to a configured set of columns.

    Parameters
    ----------
    cols
        Key handed to ``X[...]`` in ``transform`` (the pipelines in this
        notebook pass a list of column names).
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Learn nothing; return this transformer so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured columns."""
        selected = X[self.cols]
        return selected

In [None]:
# Random-forest pipeline: fill missing values with 0.0, keep the 10 features
# with the best ANOVA F-score, then classify.
#
# The final step used to be the name `RFCla`, which is only assigned inside the
# chunked training cell further down (as a Pipeline that begins with DataFrame
# column selection). On a fresh kernel that is a NameError, and after that cell
# has run it cannot consume the array produced by SelectKBest. A plain forest
# with the same settings as that cell's "RF" step is used instead.
# `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed.
RFClaM = Pipeline([
    ("nimp", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.0)),
    ("nmod", SelectKBest(score_func=f_classif, k=10)),
    ("RF", RandomForestClassifier(n_jobs=-1, n_estimators=100)),
])

In [None]:
# Load the object stored in 'model2.joblib' and bind it to `model`.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load files from a trusted source.
model = load(filename='model2.joblib')

In [16]:
# Chunked random-forest training.
#
# The categorical, numeric and date training files are read in lock-step, one
# chunk of `chunksize` rows at a time. For every chunk the pipeline is fitted
# on 80% of the rows, scored (F1) on the remaining 20%, and dumped to
# 'modelCH<i>.joblib'.
#
# NOTE(review): `model.fit` refits the whole pipeline on each chunk, so every
# dumped file -- and the `model` left in memory after the loop -- reflects only
# the chunk of that round, not the rounds before it.
healthyNColumns = None
healthyCColumns = None
healthyDColumns = None
chunksize = 100000

# 1183748 is the hard-coded row count used to estimate the number of rounds.
top = ceil(1183748 / chunksize)

model = None

for i, (cats, nums, dates) in enumerate(
        zip(pd.read_csv("bosch-production-line-performance/train_categorical.csv", chunksize=chunksize, dtype=str),
            pd.read_csv("bosch-production-line-performance/train_numeric.csv", chunksize=chunksize),
            pd.read_csv("bosch-production-line-performance/train_date.csv", chunksize=chunksize))):
    print("Round ",i+1,"/",top)

    # The column lists are decided once, from the first chunk, and reused for
    # every later chunk. Null counts are computed a single time per file.
    if healthyCColumns is None:
        catNullCounts = cats.isnull().sum()
        # Keep categorical columns that are at most half empty.
        healthyCColumns = list(catNullCounts[catNullCounts <= chunksize * .5].keys())
    if healthyNColumns is None:
        numNullCounts = nums.isnull().sum()
        # A column cannot hold more nulls than the chunk has rows, so this
        # threshold keeps every numeric column.
        healthyNColumns = list(numNullCounts[numNullCounts <= chunksize * 1].keys())
    if healthyDColumns is None:
        dateNullCounts = dates.isnull().sum()
        # Same as above: every date column passes.
        healthyDColumns = list(dateNullCounts[dateNullCounts <= chunksize * 1].keys())

    # [1:] drops the first kept column of the categorical and date files and
    # [:-1] drops the last kept column of the numeric file.
    # NOTE(review): presumably these are the repeated 'Id' column and the
    # trailing 'Response' label -- confirm against the CSV headers.
    ITrain = pd.concat([cats[healthyCColumns[1:]]
                       ,nums[healthyNColumns[:-1]]
                       ,dates[healthyDColumns[1:]]], axis=1)
    OTrain = nums['Response']

    # The pipeline is assembled once, using the column dtypes of the first chunk.
    if model is None:
        numeric_features = ITrain.select_dtypes('number').columns.tolist()

        numPipe = Pipeline([
            ("ncol", ColumnSelector(numeric_features)),
            ("nsca", StandardScaler()),
            # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
            ("nimp", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=9999999)),
            # Keep roughly one numeric feature in fifteen.
            ("nkbe", SelectKBest(score_func=f_classif, k=(len(numeric_features) // 15))),
        ])

        # select_dtypes mirrors the numeric branch above and avoids computing
        # full describe() statistics just to obtain the column names.
        categorical_features = ITrain.select_dtypes(exclude='number').columns.tolist()

        catPipe = Pipeline([
            ("ccol", ColumnSelector(categorical_features)),
            ("cimp", CategoricalImputer(strategy='constant', fill_value='NaN')),
            ("cord", ce.OrdinalEncoder()),
            # Keep roughly one categorical feature in fifteen.
            ("ckbe", SelectKBest(score_func=chi2, k=(len(categorical_features) // 15))),
        ])

        feats = FeatureUnion([('nums', numPipe), ('cats', catPipe)])

        RFCla = Pipeline([
            ("feat", feats),
            ("RF", RandomForestClassifier(n_jobs=-1, n_estimators=100)),
        ])

        model = RFCla

    # Unseeded 80/20 split of the current chunk.
    inputTrain, inputValidate, outputTrain, outputValidate =\
        train_test_split(ITrain, OTrain, train_size=0.8, test_size=0.2)

    model.fit(inputTrain, outputTrain)

    score = f1_score(outputValidate, model.predict(inputValidate))
    print('Validation F1Score', score)

    dump(model, 'modelCH'+str(i)+'.joblib')

    # Release the chunk-sized frames before the next round is read.
    del ITrain, OTrain, inputTrain, inputValidate, outputTrain, outputValidate

Round  1 / 12


  return self.partial_fit(X, y)
  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


KeyboardInterrupt: 

In [None]:
# Score the test files chunk by chunk and write the submission file.
#
# Relies on state left behind by a training cell: `model`, `chunksize`, `top`
# and the three healthy*Columns lists must already exist in the kernel.
chunkOutputs = []

for i, (cats, nums, dates) in enumerate(
        zip(pd.read_csv("bosch-production-line-performance/test_categorical.csv", chunksize=chunksize, dtype=str),
            pd.read_csv("bosch-production-line-performance/test_numeric.csv", chunksize=chunksize),
            pd.read_csv("bosch-production-line-performance/test_date.csv", chunksize=chunksize))):
    print("Round ",i+1,"/",top)

    # The former cats/nums/dates.fillna(...) calls were dropped: fillna returns
    # a new frame and the results were never assigned, so they had no effect.

    ITest = pd.concat([nums[healthyNColumns[:-1]]
                      ,cats[healthyCColumns[1:]]
                      ,dates[healthyDColumns[1:]]], axis=1)

    outputDf = pd.DataFrame(ITest['Id'])
    outputDf['Response'] = model.predict(ITest)

    # Collect the per-chunk results and concatenate once after the loop rather
    # than re-copying a growing frame on every round.
    chunkOutputs.append(outputDf)

    del ITest

if chunkOutputs:
    fullOutput = pd.concat(chunkOutputs)
else:
    # No chunk was read: still produce a file with the expected header.
    fullOutput = pd.DataFrame(columns=['Id', 'Response'])

fullOutput.to_csv("RFSubmission3.csv", index=False)

In [None]:
import xgboost as xgb

# Chunked XGBoost training on the numeric and date training files. The
# categorical file is not read in this cell.
#
# NOTE(review): `model.fit` is called afresh on every chunk, so each dumped
# 'modelXGB<i>.joblib' -- and the `model` left in memory afterwards -- reflects
# only the chunk of that round.

# Single source of truth for the hyperparameters; the classifier below is built
# directly from this dict so the two can no longer drift apart.
params = {
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    # `prior` is computed in an earlier cell as sum(Response) / row count.
    'base_score': prior
}

healthyNColumns = None
healthyCColumns = None  # stays None here: no categorical columns are selected
healthyDColumns = None
chunksize = 150000

# 1183748 is the hard-coded row count used to estimate the number of rounds.
top = ceil(1183748 / chunksize)

res = None

model = xgb.XGBClassifier(**params)

for i, (nums, dates) in enumerate(
        zip(pd.read_csv("bosch-production-line-performance/train_numeric.csv", chunksize=chunksize),
            pd.read_csv("bosch-production-line-performance/train_date.csv", chunksize=chunksize))):
    print("Round ",i+1,"/",top)

    # Column lists are fixed from the first chunk. A column cannot hold more
    # nulls than the chunk has rows, so these thresholds keep every column.
    if healthyNColumns is None:
        numNullCounts = nums.isnull().sum()
        healthyNColumns = list(numNullCounts[numNullCounts <= chunksize * 1].keys())
    if healthyDColumns is None:
        dateNullCounts = dates.isnull().sum()
        healthyDColumns = list(dateNullCounts[dateNullCounts <= chunksize * 1].keys())

    # [:-1] drops the last numeric column and [1:] the first date column.
    # NOTE(review): presumably the trailing 'Response' label and the repeated
    # 'Id' column -- confirm against the CSV headers.
    ITrain = pd.concat([nums[healthyNColumns[:-1]]
                       ,dates[healthyDColumns[1:]]], axis=1)
    OTrain = nums['Response']

    # Unseeded 80/20 split of the current chunk.
    inputTrain, inputValidate, outputTrain, outputValidate =\
        train_test_split(ITrain, OTrain, train_size=0.8, test_size=0.2)

    model.fit(inputTrain, outputTrain)

    score = f1_score(outputValidate, model.predict(inputValidate))
    print('Validation F1Score', score)

    dump(model, 'modelXGB'+str(i)+'.joblib')

    # Release the chunk-sized frames before the next round is read.
    del ITrain, OTrain, inputTrain, inputValidate, outputTrain, outputValidate

Round  1 / 8
Validation F1Score 0.05128205128205128
Round  2 / 8
Validation F1Score 0.1111111111111111
Round  3 / 8
Validation F1Score 0.06629834254143648
Round  4 / 8
