In [1]:
from math import ceil

from joblib import dump, load

import pandas as pd

import numpy as np

from scipy.stats import randint, uniform
import random as ran

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn_pandas import CategoricalImputer

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import f_classif, chi2, SelectKBest, SelectPercentile, SelectFpr, SelectFromModel
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import category_encoders as ce

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn')

In [2]:
# Feature Selection Pipelines
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a configured set of columns.

    Parameters
    ----------
    cols : column label or list of column labels
        Stored unchanged on the instance and used to index ``X`` in
        ``transform``.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]``, i.e. ``X`` indexed by the configured columns."""
        subset = X[self.cols]
        return subset

In [None]:
# Pipeline: replace missing values with the constant 0.0, keep the 10 features
# with the highest f_classif score, then pass the result to `RFCla`.
#
# NOTE(review): `RFCla` is only assigned inside the chunked training loop in a
# later cell, so on a fresh kernel (Restart & Run All) this cell raises
# NameError. Move it below the training cell, or pass the intended estimator
# here directly -- confirm which was meant.
RFClaM = Pipeline( [
    # ("cord", ce.OrdinalEncoder()),
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    ("nimp", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0.0)),
    ("nmod", SelectKBest(score_func=f_classif, k=10)),
    ("RF", RFCla)
    ] )

In [None]:
# Load a previously persisted model from 'model2.joblib' (path relative to the
# current working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load files from a trusted source.
# NOTE(review): the next cell rebinds `model` to None before training, so the
# object loaded here is discarded when the notebook is run top to bottom.
model = load('model2.joblib')

In [None]:
# Chunked training: the categorical, numeric and date training CSVs are read in
# lock-step, `chunksize` rows at a time, and a model is fitted on every chunk.
healthyNColumns = None
healthyCColumns = None
healthyDColumns = None
chunksize = 10000

# Number of rounds, used only in the progress message.
# 1183748 is hard-coded; presumably the row count of the CSVs -- TODO confirm.
top = ceil(1183748 / chunksize)

model = None

for i, (cats, nums, dates) in enumerate(
                         zip(pd.read_csv("bosch-production-line-performance/train_categorical.csv", chunksize = chunksize, dtype = str)
                            ,pd.read_csv("bosch-production-line-performance/train_numeric.csv", chunksize = chunksize)
                            ,pd.read_csv("bosch-production-line-performance/train_date.csv", chunksize = chunksize) )):
    print("Round ",i+1,"/",top)

    # A column counts as "healthy" when its null count in the FIRST chunk is at
    # most `maxNulls`. With a factor of 1 the limit equals the chunk size, so
    # every column passes; lower the factor to actually drop sparse columns.
    maxNulls = chunksize * 1
    if healthyCColumns is None:
        nullCounts = cats.isnull().sum()
        healthyCColumns = list(nullCounts[nullCounts <= maxNulls].keys())
    if healthyNColumns is None:
        nullCounts = nums.isnull().sum()
        healthyNColumns = list(nullCounts[nullCounts <= maxNulls].keys())
    if healthyDColumns is None:
        nullCounts = dates.isnull().sum()
        healthyDColumns = list(nullCounts[nullCounts <= maxNulls].keys())

    # Features: categorical and date frames without their first healthy column,
    # numeric frame without its last healthy column (presumably the duplicate
    # 'Id' columns and 'Response' respectively -- verify against the CSV headers).
    ITrain = pd.concat([cats[healthyCColumns[1:]]
                       ,nums[healthyNColumns[:-1]]
                       ,dates[healthyDColumns[1:]]], axis=1)
    OTrain = nums['Response']

    # The pipeline is built once, from the column dtypes of the first chunk.
    if model is None:
        numeric_features = ITrain.select_dtypes('number').columns.tolist()

        numPipe = Pipeline( [
            ("ncol", ColumnSelector(numeric_features)),
            # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
            ("nimp", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 9999999)),
            #("nmod", SelectFromModel(RandomForestClassifier(n_jobs=-1, n_estimators=100), threshold='median'))
            # Keep the top 10% of numeric features by f_classif score (at least one).
            ("nkbe", SelectKBest(score_func=f_classif, k=max(1, len(numeric_features) // 10)))
            # ("nfpr", SelectFpr(score_func=f_classif, alpha=.001))
            ] )

        # select_dtypes yields the same non-numeric columns as
        # describe(exclude='number') without computing summary statistics.
        categorical_features = ITrain.select_dtypes(exclude='number').columns.tolist()

        catPipe = Pipeline( [
            ("ccol", ColumnSelector(categorical_features)),
            ("cimp", CategoricalImputer(strategy='constant', fill_value='NaN')),
            ("cord", ce.OrdinalEncoder()),
            #("cmod", SelectFromModel(RandomForestClassifier(n_jobs=-1, n_estimators=100), threshold='median'))
            # Keep the top 10% of categorical features by chi2 score (at least one).
            ("ckbe", SelectKBest(score_func=chi2, k=max(1, len(categorical_features) // 10)))
            ] )

        feats = FeatureUnion([('nums', numPipe), ('cats', catPipe)])

        RFCla = Pipeline( [
            ("feat", feats),
            ("RF", RandomForestClassifier(n_jobs=-1, n_estimators=100))
            ] )

        model = RFCla

    # 80/20 split of the current chunk; no random_state is set, so the split
    # (and the forest) differ between runs.
    inputTrain, inputValidate, outputTrain, outputValidate =\
        train_test_split(ITrain, OTrain, train_size=0.8, test_size=0.2)

    # NOTE(review): Pipeline.fit refits every step from scratch and the forest
    # is not created with warm_start, so each round discards what was learned
    # from earlier chunks. After the loop `model` reflects only the final chunk;
    # the earlier fits survive only in the modelCH<i>.joblib dumps below.
    model.fit(inputTrain, outputTrain)

    score = f1_score(outputValidate, model.predict(inputValidate))
    print('Validation F1Score', score)

    # Persist this round's fitted pipeline.
    dump(model, 'modelCH'+str(i)+'.joblib')

    # Release the per-chunk frames before the next chunk is read.
    del ITrain, OTrain, inputTrain, inputValidate, outputTrain, outputValidate

Round  1 / 119


In [None]:
# Predict the test set chunk by chunk and write the submission CSV.
# Relies on `chunksize`, `top`, the healthy*Columns lists and `model` being
# left in the kernel by the earlier cells.
outputChunks = []

for i, (cats, nums, dates) in enumerate(
                         zip(pd.read_csv("bosch-production-line-performance/test_categorical.csv", chunksize = chunksize, dtype = str)
                            ,pd.read_csv("bosch-production-line-performance/test_numeric.csv", chunksize = chunksize)
                            ,pd.read_csv("bosch-production-line-performance/test_date.csv", chunksize = chunksize) )):
    print("Round ",i+1,"/",top)

    # Missing values are passed to the model as-is. DataFrame.fillna returns a
    # new frame, so calling it without assigning the result has no effect; the
    # pipeline built in the training cell has its own imputer steps.
    ITest = pd.concat([nums[healthyNColumns[:-1]]
                      ,cats[healthyCColumns[1:]]
                      ,dates[healthyDColumns[1:]]], axis=1)

    outputDf = pd.DataFrame({'Id': ITest['Id'], 'Response': model.predict(ITest)})

    # Collect per-chunk results and concatenate once after the loop; calling
    # concat inside the loop re-copies all previous rows on every round.
    outputChunks.append(outputDf)

    del ITest

if outputChunks:
    fullOutput = pd.concat(outputChunks)
else:
    # No chunks were read: still produce a frame with the expected header.
    fullOutput = pd.DataFrame(columns = ['Id', 'Response'])

fullOutput.to_csv("RFSubmission3.csv", index=False)