# Introduction
 - Raman spectroscopy has been used to classify mineral types and is seeing growing use in biomedical applications
 - Traditional methods rely on baseline normalization and PCA to extract features
 - Recent work has shown that CNNs can outperform manual baseline normalization techniques
 - The goal of this project is to train a CNN on raw Raman spectra from the RRUFF database and then apply transfer learning to process raw Raman spectra from other applications

In [16]:
%load_ext autoreload
%autoreload 1

import csv
import os
import pickle

import numpy as np
import pandas as pd

from lib.utils import RamanSample

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
# Path of the full labelled data set. It is kept under its own name so it can
# be swapped in for `path` below without editing strings.
full_data_path = "data/processed/spectra_with_labels.csv"
# File actually loaded by this cell (presumably a smaller subset for quick
# iteration -- confirm against the data-preparation step).
path = "data/processed/sample.csv"

# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the same
# 2-D array and works on old and new pandas alike.
my_data = pd.read_csv(path).values

# Column 0 holds the class label, every remaining column is a feature value.
y = my_data[:, 0]
# Because the label column is mixed in, `my_data` is an object array; cast the
# feature columns to a real float dtype so downstream numeric code (Keras)
# receives a proper tensor-compatible array. Non-numeric cells raise here,
# early, instead of deep inside training.
X = my_data[:, 1:].astype(float)


['Brucite' 'Dufrenoysite' 'Ferrierite-K' 'Linarite' 'Lorenzenite'
 'Stellerite' 'Trona' 'Wakabayashilite' 'Zussmanite' 'Franckeite'] [[0.183102 0.060612001 0.053993002000000005 0.193037 0.29520799999999997
  0.27611399999999997 0.123426 0.072893001 0.21088600000000002 0.2897]
 [1896.963 1894.661 1902.741 1899.266 1889.674 1878.457 1868.48
  1859.2879999999998 1848.5479999999998 1826.7839999999999]
 [0.108215 0.8623049999999999 0.68576 0.337219 0.27920500000000004
  0.50058 0.6892699999999999 0.588196 0.308655 0.15329]
 [14.39701 14.91875 15.628789999999999 17.03975 18.77167
  20.287470000000003 21.048779999999997 23.19223 24.818379999999998
  26.434279999999998]
 [159.3792 157.3322 155.2488 153.2598 151.6223 150.3969 149.3085 148.1007
  146.7592 145.5281]
 [100.9706 98.60374 96.03226 93.30836 90.44233 87.32395 83.93369 80.62957
  77.80696 72.89535]
 [47.27539 43.613279999999996 41.32812 41.94531 45.21094 48.17969
  44.29492 43.60938 62.93164 97.57616999999999]
 [314.002 331.4 334.523 2

In [73]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Map every distinct label in `y` to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# The fitted encoder already knows the distinct classes; no need to rebuild a set.
num_labels = len(label_encoder.classes_)

# One-hot encode the class ids (one column per class). The encoder's default
# output is a SciPy sparse matrix on every scikit-learn version, so no
# `sparse=` keyword is passed (it was renamed to `sparse_output` in 1.2 and
# later removed). The result is densified because Keras targets are expected
# to be plain arrays.
one_hot_encoder = OneHotEncoder()
y_train = one_hot_encoder.fit_transform(y_encoded.reshape(-1, 1)).toarray()

# Conv1D consumes (samples, steps, channels): append a single channel axis.
X_train = np.expand_dims(X, axis=2)


2203

In [80]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, AveragePooling1D, Flatten, LeakyReLU, Dropout, GlobalAveragePooling1D
from keras.callbacks import ModelCheckpoint

In [74]:
# Options shared by every convolution / pooling layer in the stack.
conv_opts = dict(kernel_size=4, padding='same', activation='relu')
pool_opts = dict(pool_size=2, padding='valid')

# Three Conv1D -> MaxPooling1D stages, a global average over the remaining
# steps, then a dense classifier head with dropout and a softmax over the
# `num_labels` classes.
# (A LeakyReLU(alpha=.2) layer around each stage was previously present as
# commented-out code; it is not part of the model.)
model = Sequential([
    Conv1D(filters=64, input_shape=(X_train.shape[1:]), **conv_opts),
    MaxPooling1D(**pool_opts),
    Conv1D(filters=32, **conv_opts),
    MaxPooling1D(**pool_opts),
    Conv1D(filters=32, **conv_opts),
    MaxPooling1D(**pool_opts),
    GlobalAveragePooling1D(),
    Dense(500, activation='relu'),
    Dropout(.5),
    Dense(num_labels, activation="softmax"),
])

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_33 (Conv1D)           (None, 2000, 64)          320       
_________________________________________________________________
max_pooling1d_26 (MaxPooling (None, 1000, 64)          0         
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 1000, 32)          8224      
_________________________________________________________________
max_pooling1d_27 (MaxPooling (None, 500, 32)           0         
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 500, 32)           4128      
_________________________________________________________________
max_pooling1d_28 (MaxPooling (None, 250, 32)           0         
_________________________________________________________________
global_average_pooling1d_7 ( (None, 32)                0         
__________

In [81]:
# Multi-class classification against one-hot targets: categorical
# cross-entropy loss, RMSprop optimizer, accuracy reported per epoch.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The checkpoint callback writes into "models/"; create the directory first so
# the first save does not fail on a fresh checkout.
os.makedirs("models", exist_ok=True)
# Keep only the best model seen so far; 'val_loss' is the callback's default
# monitored quantity, spelled out here for readability.
checkpoint = ModelCheckpoint("models/raman.s", monitor='val_loss',
                             save_best_only=True)

In [83]:
# Hyper-parameters for this training run.
epochs, batch_size = 5, 500
# Only the first 1000 rows are used for this run.
first_rows = slice(None, 1000)

# Train with 10% of the selected rows held out for validation; the checkpoint
# callback is invoked by Keras during training. The call is left as the cell's
# last expression so the returned History object is displayed.
model.fit(
    X_train[first_rows],
    y_train[first_rows],
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
    callbacks=[checkpoint],
)

Train on 900 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1167e3a58>