# Libraries

In [3]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.io import loadmat
from scipy.fft import fft, fftshift
import pywt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report

# 1st Model: Logistic Regression

In [2]:

### Producing features for each person, then training and decoding with logistic regression

# The helpers below are defined inside this cell so that it does not rely on
# definitions made by any other model cell.


def stimulus_onsets(recording):
    """Return the column index at which each stimulus starts.

    Row index 9 (the 10th row) of ``recording`` holds the stimulus code and is
    zero elsewhere. The non-zero columns are grouped in fours and the first
    column of every group is returned.

    NOTE(review): the grouping in fours presumably reflects each stimulus
    marker lasting four samples -- confirm against the dataset description.
    """
    marker_row = recording.to_numpy()[9]
    return np.argwhere(marker_row != 0).reshape(-1, 4)[:, 0]


def epoch_features(recording, onset):
    """Build the flat feature vector for the stimulus starting at ``onset``.

    The epoch is rows 1..8 of ``recording`` from 28 columns before ``onset``
    to 100 columns after it. The vector concatenates, in this order:
    raw samples, per-row mean, per-row variance, the upper triangle of the
    row-to-row correlation matrix, the single-level 'db1' wavelet
    coefficients, and the squared magnitude of the first 64 columns of the
    shifted FFT.

    NOTE(review): an onset closer than 28 columns to the start of the
    recording would make the slice start negative; this is assumed not to
    occur in the data.
    """
    x = recording.iloc[1:9, onset - 28 : onset + 100].to_numpy()
    # fftshift is called without `axes`, so it shifts along both axes; this is
    # kept unchanged so that the feature order stays the same.
    power = abs(fftshift(fft(x)))[:, :64] ** 2
    return np.concatenate([
        x.ravel(),
        x.mean(axis=1),
        x.var(axis=1),
        np.corrcoef(x)[np.triu_indices(8, k=1)],
        np.array(pywt.dwt(x, 'db1')).ravel(),
        power.ravel(),
    ])


def feature_matrix(recording):
    """Return ``(features, onsets)``: one feature row per stimulus onset."""
    onsets = stimulus_onsets(recording)
    features = np.array([epoch_features(recording, onset) for onset in onsets])
    return features, onsets


def fisher_ranking(X, y):
    """Return feature indices ordered from lowest to highest Fisher score.

    The score of a feature is the class-size-weighted squared distance of the
    two class means (labels 0 and 1) from the overall mean, divided by the
    class-size-weighted sum of the class variances.

    Features whose score is NaN (0/0, e.g. a constant feature) are ranked
    lowest; ``np.argsort`` would otherwise place NaN last, i.e. as "best".
    """
    is_class0 = y == 0
    is_class1 = y == 1
    n0 = is_class0.sum()
    n1 = is_class1.sum()

    scores = []
    for column in X.T:
        overall_mean = column.mean()
        values0 = column[is_class0]
        values1 = column[is_class1]
        between = n0 * (values0.mean() - overall_mean) ** 2 + n1 * (values1.mean() - overall_mean) ** 2
        within = n0 * values0.var() + n1 * values1.var()
        scores.append(between / within)

    scores = np.array(scores, dtype=float)
    scores[np.isnan(scores)] = -np.inf
    return np.argsort(scores)


def most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    uniques, counts = np.unique(values, return_counts=True)
    return uniques[np.argsort(counts)][-1]


def make_classifier(positive_weight):
    """Create the logistic-regression model, weighting class 1 by ``positive_weight``."""
    return LogisticRegression(solver='newton-cg', class_weight={0: 1, 1: positive_weight})


# 6x6 symbol grid used to turn predicted codes into characters
letters_lst = np.array(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')).reshape((6, 6))

for k in range(1, 10):

    # Train data for person k: features, and labels from row index 10
    train = pd.DataFrame(np.array(loadmat('TrainData' + str(k))['TrainData' + str(k)]))
    X_train, train_onsets = feature_matrix(train)
    y_train = train.iloc[10, train_onsets].to_numpy()

    # Fisher-score ranking (ascending, so the best features are at the end)
    best_features = fisher_ranking(X_train, y_train)

    # Test data for person k: features, and the stimulus code shown in the 10th row
    test = pd.DataFrame(np.array(loadmat('TestData' + str(k))['TestData' + str(k)]))
    X_test, test_onsets = feature_matrix(test)
    letter_codes = test.iloc[9, test_onsets].to_numpy()

    # Standardise with statistics of the training set only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Zeros to ones ratio: it is 35 for the approach "Single Character"
    z21 = (y_train == 0).sum() // (y_train == 1).sum()
    print(k, 'is', 'SC' if z21 == 35 else 'RC')

    # Pick how many top-ranked features to keep (1..99) by weighted F1 on a
    # held-out half of the training set. Starting below any possible score
    # guarantees that `num` and `report` are always assigned.
    f1score = -1.0
    for i in range(1, 100):

        x_train = X_train[:, best_features[-i:]]
        x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(
            x_train, y_train, test_size=0.5, random_state=0)

        clf = make_classifier(z21)
        clf.fit(x_train_train, y_train_train)

        held_out_pred = clf.predict(x_train_test)
        new_score = f1_score(y_train_test, held_out_pred, average='weighted')

        if new_score > f1score:
            num = i
            f1score = new_score
            report = classification_report(y_train_test, held_out_pred, digits=3)

    print(report)

    # Refit on the whole training set with the selected features
    X_train = X_train[:, best_features[-num:]]
    X_test = X_test[:, best_features[-num:]]

    clf = make_classifier(z21)
    clf.fit(X_train, y_train)

    print('Test Output is')

    # Predict once and reuse the mask for every character
    is_target = clf.predict(X_test) == 1

    if z21 == 35:
        # Single Character: 540 stimuli per spelled character
        # (presumably 36 symbols x 15 repetitions -- TODO confirm);
        # the most frequently flagged code is the 1-based symbol index.
        for i in range(5):
            block = slice(540 * i, 540 * (i + 1))
            code = letter_codes[block][is_target[block]]
            print(letters_lst.flatten()[int(most_frequent(code)) - 1], end='')
    else:
        # Row/Column: 180 stimuli per spelled character
        # (presumably 12 rows/columns x 15 repetitions -- TODO confirm);
        # codes below 7 select the column, codes from 7 upwards select the row.
        for i in range(5):
            block = slice(180 * i, 180 * (i + 1))
            code = letter_codes[block][is_target[block]]
            row = int(most_frequent(code[code >= 7])) - 7
            column = int(most_frequent(code[code < 7])) - 1
            print(letters_lst[row, column], end='')
    print()
    print('--------')

1 is SC
              precision    recall  f1-score   support

         0.0      0.980     0.840     0.905      1315
         1.0      0.058     0.371     0.101        35

    accuracy                          0.828      1350
   macro avg      0.519     0.606     0.503      1350
weighted avg      0.957     0.828     0.884      1350

Test Output is
GZVOU
--------
2 is SC
              precision    recall  f1-score   support

         0.0      0.988     0.818     0.895      1315
         1.0      0.084     0.629     0.149        35

    accuracy                          0.813      1350
   macro avg      0.536     0.723     0.522      1350
weighted avg      0.965     0.813     0.876      1350

Test Output is
LUKAS
--------
3 is RC
              precision    recall  f1-score   support

         0.0      0.882     0.733     0.801       378
         1.0      0.257     0.486     0.337        72

    accuracy                          0.693       450
   macro avg      0.570     0.609     0.569 

# 2nd Model: Linear SVM

In [4]:

### Producing features for each person, then training and decoding with a linear SVM

# The helpers below are defined inside this cell so that it does not rely on
# definitions made by any other model cell.


def stimulus_onsets(recording):
    """Return the column index at which each stimulus starts.

    Row index 9 (the 10th row) of ``recording`` holds the stimulus code and is
    zero elsewhere. The non-zero columns are grouped in fours and the first
    column of every group is returned.

    NOTE(review): the grouping in fours presumably reflects each stimulus
    marker lasting four samples -- confirm against the dataset description.
    """
    marker_row = recording.to_numpy()[9]
    return np.argwhere(marker_row != 0).reshape(-1, 4)[:, 0]


def epoch_features(recording, onset):
    """Build the flat feature vector for the stimulus starting at ``onset``.

    The epoch is rows 1..8 of ``recording`` from 28 columns before ``onset``
    to 100 columns after it. The vector concatenates, in this order:
    raw samples, per-row mean, per-row variance, the upper triangle of the
    row-to-row correlation matrix, the single-level 'db1' wavelet
    coefficients, and the squared magnitude of the first 64 columns of the
    shifted FFT.

    NOTE(review): an onset closer than 28 columns to the start of the
    recording would make the slice start negative; this is assumed not to
    occur in the data.
    """
    x = recording.iloc[1:9, onset - 28 : onset + 100].to_numpy()
    # fftshift is called without `axes`, so it shifts along both axes; this is
    # kept unchanged so that the feature order stays the same.
    power = abs(fftshift(fft(x)))[:, :64] ** 2
    return np.concatenate([
        x.ravel(),
        x.mean(axis=1),
        x.var(axis=1),
        np.corrcoef(x)[np.triu_indices(8, k=1)],
        np.array(pywt.dwt(x, 'db1')).ravel(),
        power.ravel(),
    ])


def feature_matrix(recording):
    """Return ``(features, onsets)``: one feature row per stimulus onset."""
    onsets = stimulus_onsets(recording)
    features = np.array([epoch_features(recording, onset) for onset in onsets])
    return features, onsets


def fisher_ranking(X, y):
    """Return feature indices ordered from lowest to highest Fisher score.

    The score of a feature is the class-size-weighted squared distance of the
    two class means (labels 0 and 1) from the overall mean, divided by the
    class-size-weighted sum of the class variances.

    Features whose score is NaN (0/0, e.g. a constant feature) are ranked
    lowest; ``np.argsort`` would otherwise place NaN last, i.e. as "best".
    """
    is_class0 = y == 0
    is_class1 = y == 1
    n0 = is_class0.sum()
    n1 = is_class1.sum()

    scores = []
    for column in X.T:
        overall_mean = column.mean()
        values0 = column[is_class0]
        values1 = column[is_class1]
        between = n0 * (values0.mean() - overall_mean) ** 2 + n1 * (values1.mean() - overall_mean) ** 2
        within = n0 * values0.var() + n1 * values1.var()
        scores.append(between / within)

    scores = np.array(scores, dtype=float)
    scores[np.isnan(scores)] = -np.inf
    return np.argsort(scores)


def most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    uniques, counts = np.unique(values, return_counts=True)
    return uniques[np.argsort(counts)][-1]


def make_classifier(positive_weight):
    """Create the linear-kernel SVM, weighting class 1 by ``positive_weight``."""
    return svm.SVC(C=0.1, kernel='linear', class_weight={0: 1, 1: positive_weight})


# 6x6 symbol grid used to turn predicted codes into characters
letters_lst = np.array(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')).reshape((6, 6))

for k in range(1, 10):

    # Train data for person k: features, and labels from row index 10
    train = pd.DataFrame(np.array(loadmat('TrainData' + str(k))['TrainData' + str(k)]))
    X_train, train_onsets = feature_matrix(train)
    y_train = train.iloc[10, train_onsets].to_numpy()

    # Fisher-score ranking (ascending, so the best features are at the end)
    best_features = fisher_ranking(X_train, y_train)

    # Test data for person k: features, and the stimulus code shown in the 10th row
    test = pd.DataFrame(np.array(loadmat('TestData' + str(k))['TestData' + str(k)]))
    X_test, test_onsets = feature_matrix(test)
    letter_codes = test.iloc[9, test_onsets].to_numpy()

    # Standardise with statistics of the training set only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Zeros to ones ratio: it is 35 for the approach "Single Character"
    z21 = (y_train == 0).sum() // (y_train == 1).sum()
    print(k, 'is', 'SC' if z21 == 35 else 'RC')

    # Pick how many top-ranked features to keep (1..99) by weighted F1 on a
    # held-out half of the training set. Starting below any possible score
    # guarantees that `num` and `report` are always assigned.
    f1score = -1.0
    for i in range(1, 100):

        x_train = X_train[:, best_features[-i:]]
        x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(
            x_train, y_train, test_size=0.5, random_state=0)

        clf = make_classifier(z21)
        clf.fit(x_train_train, y_train_train)

        held_out_pred = clf.predict(x_train_test)
        new_score = f1_score(y_train_test, held_out_pred, average='weighted')

        if new_score > f1score:
            num = i
            f1score = new_score
            report = classification_report(y_train_test, held_out_pred, digits=3)

    print(report)

    # Refit on the whole training set with the selected features
    X_train = X_train[:, best_features[-num:]]
    X_test = X_test[:, best_features[-num:]]

    clf = make_classifier(z21)
    clf.fit(X_train, y_train)

    print('Test Output is')

    # Predict once and reuse the mask for every character
    is_target = clf.predict(X_test) == 1

    if z21 == 35:
        # Single Character: 540 stimuli per spelled character
        # (presumably 36 symbols x 15 repetitions -- TODO confirm);
        # the most frequently flagged code is the 1-based symbol index.
        for i in range(5):
            block = slice(540 * i, 540 * (i + 1))
            code = letter_codes[block][is_target[block]]
            print(letters_lst.flatten()[int(most_frequent(code)) - 1], end='')
    else:
        # Row/Column: 180 stimuli per spelled character
        # (presumably 12 rows/columns x 15 repetitions -- TODO confirm);
        # codes below 7 select the column, codes from 7 upwards select the row.
        for i in range(5):
            block = slice(180 * i, 180 * (i + 1))
            code = letter_codes[block][is_target[block]]
            row = int(most_frequent(code[code >= 7])) - 7
            column = int(most_frequent(code[code < 7])) - 1
            print(letters_lst[row, column], end='')
    print()
    print('--------')

1 is SC
              precision    recall  f1-score   support

         0.0      0.985     0.802     0.884      1315
         1.0      0.068     0.543     0.121        35

    accuracy                          0.795      1350
   macro avg      0.526     0.672     0.502      1350
weighted avg      0.961     0.795     0.864      1350

Test Output is
7EV4Y
--------
2 is SC
              precision    recall  f1-score   support

         0.0      0.988     0.809     0.890      1315
         1.0      0.081     0.629     0.143        35

    accuracy                          0.804      1350
   macro avg      0.534     0.719     0.516      1350
weighted avg      0.964     0.804     0.870      1350

Test Output is
LUKAS
--------
3 is RC
              precision    recall  f1-score   support

         0.0      0.880     0.717     0.790       378
         1.0      0.246     0.486     0.327        72

    accuracy                          0.680       450
   macro avg      0.563     0.602     0.559 