In [1]:
%matplotlib inline
import os
import glob
import numpy as np
from scipy import io
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the training table; it holds the two targets plus a 'spectra' column
# whose cells are bracketed, comma-separated strings.
data = pd.read_csv('train.csv')

# Separate targets from features.
X_df = data.drop(columns=['molecule', 'concentration'])
y_df = data[['molecule', 'concentration']]

# Strip the surrounding brackets from each spectrum string and parse the
# comma-separated fields into a float vector, one row per sample.
spectra = np.array([
    np.array(raw_row[1:-1].split(',')).astype(float)
    for raw_row in X_df['spectra'].values
])
# Store the parsed spectra back in the frame as plain Python lists.
X_df['spectra'] = spectra.tolist()

# Loading wavenumbers
freqs = pd.read_csv('freq.csv')['freqs'].values

# "Raw" features
X = spectra
# Target for regression
concentration = y_df['concentration'].values
# Target for classification
molecule = y_df['molecule'].values

In [3]:
import numpy as np
import pandas as pd
from scipy import signal as signal

class FeatureExtractorClf:
    """Build classification features from the ``spectra`` column.

    Each spectrum is smoothed with a Savitzky-Golay filter and then has its
    linear trend removed.

    Parameters
    ----------
    window_length : int
        Window length forwarded to ``scipy.signal.savgol_filter``.
    polyorder : int
        Polynomial order forwarded to ``scipy.signal.savgol_filter``.
    """

    def __init__(self, window_length=25, polyorder=4):
        self.polyorder = polyorder
        self.window_length = window_length

    def fit(self, X_df, y):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X_df):
        """Return the smoothed, detrended spectra as one array (one row per sample)."""
        # Stack the per-row spectra held in the 'spectra' column.
        raw = np.array(list(map(np.array, X_df['spectra'])))
        # Smooth along each spectrum; 'nearest' pads with the edge values.
        smoothed = signal.savgol_filter(
            raw, self.window_length, self.polyorder, axis=1, mode='nearest'
        )
        # Remove the (default, linear) trend from every row.
        return signal.detrend(smoothed, axis=1)


In [38]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import KernelPCA
 
class Classifier(BaseEstimator, TransformerMixin):
    """PCA projection followed by an RBF-kernel SVC with probability outputs.

    Parameters
    ----------
    n_components : int
        Passed to ``PCA(n_components=...)``.
    C : float
        Passed to ``SVC(C=...)``.
    gamma : float
        Passed to ``SVC(gamma=...)``.
    """

    def __init__(self, n_components=500, C=10000, gamma=10):
        self.n_components = n_components
        self.C = C
        self.gamma = gamma

    def fit(self, X, y):
        """Build and fit the PCA + SVC pipeline on ``X``/``y``.

        Returns
        -------
        self
            Returned so calls can be chained, as scikit-learn estimators do
            (the previous version returned ``None``).
        """
        # The pipeline is rebuilt on every fit so that the current
        # hyper-parameter attributes are always the ones used.
        self.clf = Pipeline([
            ('pca', PCA(n_components=self.n_components)),
            ('clf', SVC(C=self.C, kernel='rbf', gamma=self.gamma, probability=True))
        ])
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the fitted pipeline's predicted class labels for ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the fitted pipeline's class-probability estimates for ``X``."""
        return self.clf.predict_proba(X)

In [39]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# The four molecule class labels, in alphabetical order.
labels = np.array(list('ABQR'))

def train_test_model_clf(X_df, y_df, skf_is, FeatureExtractor, Classifier):
    """Train a classifier on one train/test split and print its test metrics.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature frame, indexed positionally with the split indices.
    y_df : pandas.DataFrame
        Target frame; its 'molecule' column is the classification target.
    skf_is : tuple
        ``(train_indices, test_indices)`` pair of positional row indices.
    FeatureExtractor : class
        Instantiated without arguments; must provide ``fit(X_df, y_df)`` and
        ``transform(X_df)``.
    Classifier : class
        Instantiated without arguments; must provide ``fit(X, y)`` and
        ``predict_proba(X)``.

    Prints the error rate (1 - accuracy), the classification report and the
    confusion matrix for the test part of the split.
    """
    train_is, test_is = skf_is
    X_train_df = X_df.iloc[train_is].copy()
    y_train_df = y_df.iloc[train_is].copy()
    y_train_clf = y_train_df['molecule'].values
    X_test_df = X_df.iloc[test_is].copy()
    y_test_df = y_df.iloc[test_is].copy()
    y_test_clf = y_test_df['molecule'].values

    # Feature extraction: fitted on the training rows only, then applied to both parts.
    fe_clf = FeatureExtractor()
    fe_clf.fit(X_train_df, y_train_df)
    X_train_array_clf = fe_clf.transform(X_train_df)
    X_test_array_clf = fe_clf.transform(X_test_df)

    # Train
    clf = Classifier()
    clf.fit(X_train_array_clf, y_train_clf)

    # Test
    y_proba_clf = clf.predict_proba(X_test_array_clf)
    # Decode the argmax column using the sorted unique training labels instead
    # of a hard-coded module-level array, so the mapping stays correct when the
    # class set changes or a class is absent from the training split.
    # Assumes the classifier orders its probability columns like scikit-learn's
    # `classes_` (sorted unique training labels).
    class_labels = np.unique(y_train_clf)
    y_pred_clf = class_labels[np.argmax(y_proba_clf, axis=1)]
    error = 1 - accuracy_score(y_test_clf, y_pred_clf)
    print('error = %s' % error)
    print('classification report:\n %s' % classification_report(y_test_clf, y_pred_clf))
    print('confusion matrix:\n %s' % confusion_matrix(y_test_clf, y_pred_clf))
    

# Random train/test splitter holding out 20% of the rows, with a fixed seed.
skf = ShuffleSplit(test_size=0.2, n_splits=2, random_state=57)
# Only the first (train, test) index pair produced by the splitter is used.
skf_is = next(iter(skf.split(X_df)))

train_test_model_clf(
    X_df,
    y_df,
    skf_is,
    FeatureExtractor=FeatureExtractorClf,
    Classifier=Classifier,
)

error = 0.025
classification report:
              precision    recall  f1-score   support

          A       0.98      0.97      0.98        63
          B       0.98      0.96      0.97        45
          Q       1.00      0.97      0.99        40
          R       0.95      1.00      0.97        52

avg / total       0.98      0.97      0.98       200

confusion matrix:
 [[61  1  0  1]
 [ 1 43  0  1]
 [ 0  0 39  1]
 [ 0  0  0 52]]
