In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import joblib
import os

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

import logging

In [2]:
logger = logging.getLogger('week14')
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler('../logs/week14.log')
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
# add the handlers to logger
logger.addHandler(ch)
logger.addHandler(fh)

# 'application' code
# logger.debug('debug message')
# logger.info('info message')
# logger.warning('warn message')
# logger.error('error message')
# logger.critical('critical message')

In [3]:
now = dt.datetime.today().strftime('%Y%m%d')
now

'20201201'

# Load data

In [4]:
# loading raw data from sklearn
df_digits = load_digits()

# creating feature and target data
X = df_digits['data']
y = df_digits['target']


assert X.shape == (1797, 64)
assert y.shape[0] == 1797

assert X.shape[0] == y.shape[0]

In [7]:
# Create train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
logger.info('Train test set created...')

2020-12-01 20:59:26,020 - week14 - INFO - Train test set created...


In [None]:
# TODO: do more EDAV

In [9]:
pd.DataFrame(X_test).to_csv('../')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,8.0,12.0,14.0,12.0,3.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,8.0,14.0,16.0,4.0,0.0,0.0
1,0.0,3.0,16.0,15.0,1.0,0.0,0.0,0.0,0.0,10.0,...,16.0,0.0,0.0,2.0,14.0,15.0,11.0,8.0,3.0,0.0
2,0.0,0.0,2.0,13.0,13.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,15.0,13.0,1.0,0.0,0.0
3,0.0,0.0,3.0,16.0,12.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,13.0,9.0,0.0,0.0,0.0
4,0.0,0.0,6.0,16.0,13.0,12.0,14.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,4.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,0.0,0.0,0.0,9.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.0,16.0,2.0,0.0,0.0
356,0.0,2.0,14.0,16.0,6.0,0.0,0.0,0.0,0.0,12.0,...,7.0,0.0,0.0,2.0,15.0,16.0,16.0,15.0,8.0,0.0
357,0.0,0.0,10.0,16.0,14.0,2.0,0.0,0.0,0.0,3.0,...,1.0,0.0,0.0,0.0,10.0,16.0,14.0,3.0,0.0,0.0
358,0.0,0.0,7.0,15.0,16.0,10.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,8.0,16.0,16.0,12.0,0.0,0.0


# Building a model

In [None]:
# define a classifier - here I am using a Random Forest model
clf = RandomForestClassifier(random_state=42)

pipe = Pipeline([('classifier', clf)])

# Parameters of the pipeline
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [2, 4, 6]
}
logger.info('grid search started')
search = GridSearchCV(pipe,  param_grid, n_jobs=-1, cv=5, return_train_score=True)
logger.info('grid search ended')



# Train

In [None]:
search.fit(X_train, y_train)

In [None]:
search.best_params_

# Saving model

In [None]:
model_dir = '../model'
model_name = 'clf_rf_{}'.format(now)


# joblib.dump(search.best_estimator_, os.path.join(model_dir, model_name))