In [2]:
%matplotlib inline
import os
import glob
import numpy as np
from scipy import io
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Read the training table; it holds the two target columns plus the
# spectrum of every sample serialized as text.
data = pd.read_csv('train.csv')

# Separate the targets from the remaining (feature) columns.
y_df = data[['molecule', 'concentration']]
X_df = data.drop(['molecule', 'concentration'], axis=1)

# Every 'spectra' cell is a string: drop its first and last character
# (presumably the enclosing brackets), split on commas and cast to float.
spectra = np.array([
    np.array(cell[1:-1].split(',')).astype(float)
    for cell in X_df['spectra'].values
])
# Store the parsed values back in the frame as plain Python lists.
X_df['spectra'] = spectra.tolist()

# Loading wavenumbers
freqs = pd.read_csv('freq.csv')['freqs'].values

# Targets: 'molecule' for classification, 'concentration' for regression
molecule = y_df['molecule'].values
concentration = y_df['concentration'].values

# "Raw" features
X = spectra

In [4]:
import numpy as np
import pandas as pd
from scipy import signal as signal

class FeatureExtractorClf():
    """Build classification features from the 'spectra' column of a frame.

    Each spectrum is smoothed with a Savitzky-Golay filter and then
    linearly detrended along the sample axis.

    Parameters
    ----------
    window_length : int
        Window size handed to ``scipy.signal.savgol_filter``.
    polyorder : int
        Polynomial order handed to ``scipy.signal.savgol_filter``.
    """

    def __init__(self, window_length=25, polyorder=4):
        self.polyorder = polyorder
        self.window_length = window_length

    def fit(self, X_df, y):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, X_df):
        """Return the filtered and detrended spectra as a 2-D array.

        ``X_df['spectra']`` must be iterable, one sequence of numbers per
        row; the rows are stacked into one array before filtering.
        """
        stacked = np.array([np.array(row) for row in X_df['spectra']])
        smoothed = signal.savgol_filter(
            stacked,
            window_length=self.window_length,
            polyorder=self.polyorder,
            axis=1,
            mode='nearest'
        )
        # Remove the linear trend of every row (axis=1).
        return signal.detrend(smoothed, axis=1)


In [5]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

import xgboost


    
class Classifier(BaseEstimator):
    """PCA + XGBoost pipeline whose hyper-parameters are chosen by grid search.

    ``fit`` runs a ``GridSearchCV`` over the number of PCA components and
    the XGBoost ``n_estimators`` / ``max_depth``, keeps the best refitted
    pipeline in ``self.clf`` and prints its parameters.  ``predict`` and
    ``predict_proba`` delegate to that pipeline, so they are only
    meaningful after ``fit`` has been called.
    """

    def __init__(self):
        # Not read anywhere else in this class: the PCA step is created
        # without arguments and its size is chosen by the grid below.
        self.n_components = 10
        self.n_estimators = 300
        # Base estimator; n_estimators and max_depth are overridden by the grid.
        self.clf = xgboost.XGBClassifier(
            max_depth=20,
            n_estimators=self.n_estimators,
            learning_rate=0.2)
        parameters = {
            'pca__n_components': (8, 10, 20, 25),
            'clf__n_estimators': (10, 30, 100, 200, 300),
            'clf__max_depth': (10, 15, 20, 30),
        }
        pipeline = Pipeline([
            ('pca', PCA()),
            ('clf', self.clf),
        ])
        self.grid = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    def fit(self, X, y):
        """Run the grid search on (X, y) and keep the best pipeline.

        After the call ``self.clf`` is the best fitted pipeline (no longer
        the bare XGBoost estimator) and ``self.best_parameters`` holds its
        ``get_params()`` dictionary, which is also printed.

        Returns
        -------
        self : Classifier
            Returned so calls can be chained, as scikit-learn estimators do.
        """
        self.grid.fit(X, y)
        self.clf = self.grid.best_estimator_
        self.best_parameters = self.clf.get_params()
        # print(...) with a single argument behaves the same on Python 2
        # and 3; the former `print "..."` statements are a SyntaxError on 3.
        print("Best params for XGBoost: ")
        for param_name in sorted(self.best_parameters):
            print("\t%s: %r" % (param_name, self.best_parameters[param_name]))
        print("")
        return self

    def predict(self, X):
        """Return the class predictions of the selected pipeline for X."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the class probabilities of the selected pipeline for X."""
        return self.clf.predict_proba(X)
        

In [6]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Molecule names used by train_test_model_clf to turn the argmax column
# index of predict_proba into a label.
# NOTE(review): assumes the classifier's probability columns follow this
# exact order - confirm against the fitted model's class ordering.
labels = np.array(['A', 'B', 'Q', 'R'])

def train_test_model_clf(X_df, y_df, skf_is, FeatureExtractor, Classifier):
    """Train on one fold, evaluate on the other and print the results.

    Parameters
    ----------
    X_df, y_df : DataFrame
        Features and targets; ``y_df`` must have a 'molecule' column.
    skf_is : pair (train_is, test_is)
        Positional row indices of the train and test folds.
    FeatureExtractor, Classifier : callables
        Zero-argument factories for the feature extractor
        (``fit`` / ``transform``) and the classifier
        (``fit`` / ``predict_proba``).

    Prints the error rate (1 - accuracy), the classification report and
    the confusion matrix for the test fold.  Returns nothing.
    """
    train_is, test_is = skf_is

    # Work on copies of the two folds.
    X_train_df = X_df.iloc[train_is].copy()
    X_test_df = X_df.iloc[test_is].copy()
    y_train_df = y_df.iloc[train_is].copy()
    y_test_df = y_df.iloc[test_is].copy()
    y_train_clf = y_train_df['molecule'].values
    y_test_clf = y_test_df['molecule'].values

    # Feature extraction: fitted on the train fold only, applied to both.
    fe_clf = FeatureExtractor()
    fe_clf.fit(X_train_df, y_train_df)
    X_train_array_clf = fe_clf.transform(X_train_df)
    X_test_array_clf = fe_clf.transform(X_test_df)

    # Train
    clf = Classifier()
    clf.fit(X_train_array_clf, y_train_clf)

    # Test: the most probable column index is mapped to its label through
    # the module-level `labels` array.
    y_proba_clf = clf.predict_proba(X_test_array_clf)
    best_column = np.argmax(y_proba_clf, axis=1)
    y_pred_clf = labels[best_column]

    error = 1 - accuracy_score(y_test_clf, y_pred_clf)
    print('error = %s' % error)

    report = classification_report(y_test_clf, y_pred_clf)
    print('classification report:\n %s' % report)

    matrix = confusion_matrix(y_test_clf, y_pred_clf)
    print('confusion matrix:\n %s' % matrix)
    

# Seeded shuffle split holding out 20% of the rows; only the first of the
# generated (train, test) index pairs is used.
skf = ShuffleSplit(n_splits=2, test_size=0.2, random_state=57)
skf_is = next(iter(skf.split(X_df)))

# Fit and evaluate the classification pipeline on that single split.
train_test_model_clf(
    X_df,
    y_df,
    skf_is,
    FeatureExtractorClf,
    Classifier
)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 20.7min finished


Best params for XGBoost: 
	clf: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
	clf__base_score: 0.5
	clf__colsample_bylevel: 1
	clf__colsample_bytree: 1
	clf__gamma: 0
	clf__learning_rate: 0.05
	clf__max_delta_step: 0
	clf__max_depth: 20
	clf__min_child_weight: 1
	clf__missing: None
	clf__n_estimators: 300
	clf__nthread: -1
	clf__objective: 'multi:softprob'
	clf__reg_alpha: 0
	clf__reg_lambda: 1
	clf__scale_pos_weight: 1
	clf__seed: 0
	clf__silent: True
	clf__subsample: 1
	pca: PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
	pca__copy: True
	pca__iterated_power: 'auto'
	pca__n_components: 10
	pca__random_state: None
	pca__svd_solver: 