### Extract year from one file

In [4]:
import re

def get_year(filename):
    """Return the first run of four digits in a file name as an integer year.

    The final path component is searched first, so digits in a parent folder
    (for example ``1980_s``) cannot shadow the year in the file name itself.
    If the final component holds no four-digit run, the whole path is
    searched, which keeps the earlier result for such inputs.

    Parameters
    ----------
    filename : str or pathlib.Path
        File name or path; both ``/`` and ``\\`` are treated as separators.

    Returns
    -------
    int
        The four digits found, converted to ``int``.

    Raises
    ------
    ValueError
        If the input contains no run of four digits.
    """
    path = str(filename)
    # Split on either separator so Windows-style paths work too.
    basename = re.split(r'[\\/]', path)[-1]
    # Raw strings: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'\d{4}', basename) or re.search(r'\d{4}', path)
    if match is None:
        raise ValueError('no four-digit year found in {!r}'.format(path))
    return int(match.group())

# Example path used to try out get_year; the bare last expression displays the result.
filename = 'PENNIES/1960_s/1960PennyLincolnUp.LampE5.5.200Scan.07172019.P1.ChangedAngle_HRD10591_13-10-22-820.txt'
get_year(filename)    

1960

### Extract data from one file

In [5]:
import pandas as pd

def get_data(filename, skiprows=13):
    """Load one tab-separated spectrum file into a DataFrame indexed by ``freq``.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.
    skiprows : int, optional
        Number of leading lines to skip before the two data columns start.
        Defaults to 13, the value previously hard-coded here (presumably the
        instrument header -- confirm against the file format).

    Returns
    -------
    pandas.DataFrame
        A single ``intensity`` column, indexed by ``freq``.
    """
    spectrum = pd.read_csv(
        filename,
        skiprows=skiprows,
        sep='\t',
        names=['freq', 'intensity'],
    )
    return spectrum.set_index('freq')

# Load the example file and preview its first rows.
df = get_data(filename)
df.head()

Unnamed: 0_level_0,intensity
freq,Unnamed: 1_level_1
223.165,-9
223.4,-9
223.635,-9
223.869,-9
224.104,-7


### Process all files

In [6]:
from pathlib import Path

# One intensity column and one year label are collected per spectrum file;
# the two lists stay aligned by position.
intensities = []
years = []

# Path.glob yields files in an arbitrary, filesystem-dependent order. Sorting
# fixes the row order so the seeded train/test splits are reproducible.
for path in sorted(Path('PENNIES').glob('**/*.txt')):
    # Only the file name is passed: parent folders such as "1960_s" also hold digits.
    years.append(get_year(path.name))
    intensities.append(get_data(path)['intensity'])

In [7]:
# Concatenating along axis=1 puts one file per column; flip it so each row is one file.
df = pd.concat(intensities, axis=1).T
(df.shape, len(years))

((6210, 2048), 6210)

### Train and test models 

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# Candidate estimators to compare; each one is fitted and scored in turn
# by the loop further down in this cell.
classifiers = [
    KNeighborsClassifier(3),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel="linear", C=0.025),
]

# Standardise every feature column, then hold out 40% of the rows for testing.
# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# statistics leak into training; consider fitting it on X_train only.
X = df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = years
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    name = clf.__class__.__name__
    # cross_val_score returns an array of per-fold scores; its own .mean() is
    # used because numpy has not been imported at this point in the notebook.
    # NOTE(review): cross-validation runs on the held-out split -- confirm intended.
    print(name, score, cross_val_score(clf, X_test, y_test).mean())

NameError: name 'df' is not defined

### Fine tune MLPClassifier

In [21]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Standardise the spectra, then hold out 40% of the rows with a fixed seed.
scaler = StandardScaler()
X = scaler.fit_transform(df.values)
y = years
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.4, random_state=42
)

# Search space: iteration cap, alpha from 1e-1 down to 1e-6, and a hidden
# layer size from 5 to 11.
mlp = MLPClassifier()
parameter_space = dict(
    max_iter=[500, 1000, 1500],
    alpha=10.0 ** -np.arange(1, 7),
    hidden_layer_sizes=np.arange(5, 12),
)

# 3-fold grid search on the training rows, scored by accuracy, using all cores.
grid_clf_acc = GridSearchCV(mlp, parameter_space, scoring='accuracy', cv=3, n_jobs=-1)
grid_clf_acc.fit(X_train, y_train)

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

print("Finished")

Grid best parameter (max. accuracy):  {'alpha': 0.0001, 'hidden_layer_sizes': 11, 'max_iter': 1500}
Grid best score (accuracy):  0.9428341384863124
Finished


In [None]:
# Fit an MLP with the alpha, layer size and iteration cap reported by the grid search.
clf = MLPClassifier(
    hidden_layer_sizes=(11,),
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
    max_iter=1500,
)
clf.fit(X_train, y_train)

# Report held-out accuracy next to the estimator's class name.
name = type(clf).__name__
score = clf.score(X_test, y_test)
print(name, score)

### Fine tune SVC (optimal: 'kernel':'linear', 'C':0.01)

In [5]:
# Progress marker printed before the grid search in this cell starts.
print("Running")
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Standardise the spectra, then hold out 40% of the rows with a fixed seed.
X = df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = years
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

# Search history (each grid was run on its own):
#   1. kernel in ['linear', 'poly', 'rbf', 'sigmoid']                 -> best: linear
#   2. C in [0.001, 0.0025, 0.01, 0.025, 0.1, 0.25, 1, 2.5, 10]       -> best: 0.01
#   3. C in [0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075]  -> best: 0.01
# The last refinement stays active so grid_values is defined when the cell
# runs on a fresh kernel (previously every assignment was commented out).
grid_values = {'C': [0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075]}

# `clf` is only the template estimator; GridSearchCV fits clones of it.
clf = SVC(kernel="linear")
grid_clf_acc = GridSearchCV(clf, param_grid=grid_values, scoring='accuracy')
grid_clf_acc.fit(X_train, y_train)

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

print("Finished")

Running
Grid best parameter (max. accuracy):  {'C': 0.01}
Grid best score (accuracy):  0.9769189479334407
Finished


### Test a new file

In [None]:
# Predict the year of a single spectrum file with the tuned SVC.
filename = 'PENNIES/1980_s/1986.PennyLincolnUp.Lamp.En5.5.200Scan.071819.P1_HRD10591_17-08-12-973.txt'

# Shape the spectrum as one row, the layout the scaler was fitted on. A new
# name is used so the `intensities` list built while loading all files is
# not overwritten.
sample = get_data(filename)['intensity'].values.reshape(1, -1)
sample = scaler.transform(sample)

# `clf` from the SVC cell is the unfitted template handed to GridSearchCV, so
# calling predict on it fails. The fitted model is the search's best
# estimator, which GridSearchCV refits by default.
year = grid_clf_acc.best_estimator_.predict(sample)[0]
year