## Linear Regression on Bank Notes Dataset


## Description of the data:

Data was extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400 x 400 pixels. Due to the object lens and the distance to the investigated object, gray-scale pictures with a resolution of about 660 dpi were obtained. A Wavelet Transform tool was used to extract features from the images.

# Reading in data

In [1]:
from symbol import import_as_name
import numpy as np
import os
import pandas as pd
import sys
import sklearn.metrics




# Absolute location of the banknote CSV export that this notebook reads.
csv_path = "C:/Users/teddy/Downloads/Machine Learning/Perceptron HW/BankNote_Authentication.csv"
dataset = pd.read_csv(csv_path)

# Display the last five rows as a quick check that the file loaded.
dataset.tail(n=5)

  from symbol import import_as_name


Unnamed: 0,variance,skewness,curtosis,entropy,class
1367,0.40614,1.3492,-1.4501,-0.55949,1
1368,-1.3887,-4.8773,6.4774,0.34179,1
1369,-3.7503,-13.4586,17.5932,-2.7771,1
1370,-3.5637,-8.3827,12.393,-1.2823,1
1371,-2.5419,-0.65804,2.6842,1.1952,1


# Divide the data into training and testing sets

In [60]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np

# Select the features (columns at positions 0 and 2) and the target
# (column at position 4) by position from the loaded frame.
X = dataset.iloc[:, [0, 2]]
y = dataset.iloc[:, 4]

# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1)

# Standardize the features using statistics learned from the training split
# only, then apply the same transform to the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# StandardScaler expects 2-D input. Indexing a pandas Series with
# `[:, np.newaxis]` is deprecated (and an error in current pandas), so the
# target is converted to a NumPy array before the extra axis is added.
y_train_col = y_train.to_numpy()[:, np.newaxis]
y_test_col = y_test.to_numpy()[:, np.newaxis]

sc_y = StandardScaler()
sc_y.fit(y_train_col)
y_train_std = sc_y.transform(y_train_col).flatten()
y_test_std = sc_y.transform(y_test_col).flatten()


  sc_y.fit(y_train[:, np.newaxis])
  y_train_std = sc_y.transform(y_train[:, np.newaxis]).flatten()
  y_test_std = sc_y.transform(y_test[:, np.newaxis]).flatten()


# Training sklearn's LinearRegression on the Bank Notes dataset

In [61]:


from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted on the standardized
# training split (fit returns the estimator itself).
est = LinearRegression().fit(X_train_std, y_train_std)

# Predict on both splits so train and test scores can be compared.
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

# Mean squared error on each split.
train_mse = mean_squared_error(y_train_std, y_train_pred)
test_mse = mean_squared_error(y_test_std, y_test_pred)
print('MSE train: %.3f, test: %.3f' % (train_mse, test_mse))

# Coefficient of determination on each split.
train_r2 = r2_score(y_train_std, y_train_pred)
test_r2 = r2_score(y_test_std, y_test_pred)
print('R^2 train: %.3f, test: %.3f' % (train_r2, test_r2))

MSE train: 0.438, test: 0.488
R^2 train: 0.562, test: 0.508


# Training gplearn on the Bank Notes dataset

In [62]:
#%pip install gplearn

In [63]:

from gplearn.genetic import SymbolicRegressor

# Hyperparameters for the symbolic-regression run on the standardized data.
# Constants in evolved programs are drawn from const_range.
gp_settings = dict(
    population_size=1000,
    init_depth=(4, 6),
    generations=100,
    stopping_criteria=0.01,
    p_crossover=0.3,
    p_subtree_mutation=0.35,
    p_hoist_mutation=0.0,
    p_point_mutation=0.35,
    max_samples=1.0,
    verbose=1,
    const_range=(-1.0, 1.0),
    tournament_size=5,
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log',
                  'abs', 'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan'),
    parsimony_coefficient=0.0001,
    random_state=0,
)

est = SymbolicRegressor(**gp_settings)
est.fit(X_train_std, y_train_std)

# Predict on both splits so train and test scores can be compared.
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

# Mean squared error on each split.
gp_train_mse = mean_squared_error(y_train_std, y_train_pred)
gp_test_mse = mean_squared_error(y_test_std, y_test_pred)
print('MSE train: %.3f, test: %.3f' % (gp_train_mse, gp_test_mse))

# Coefficient of determination on each split.
gp_train_r2 = r2_score(y_train_std, y_train_pred)
gp_test_r2 = r2_score(y_test_std, y_test_pred)
print('R^2 train: %.3f, test: %.3f' % (gp_train_r2, gp_test_r2))

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    17.07          6.61563       32         0.537025              N/A     54.17s
   1    13.61          1.60768        6         0.505146              N/A     55.26s
   2    14.37          148.285        3         0.481675              N/A     52.84s
   3    16.16          14.5099        4         0.478381              N/A     59.27s
   4    17.24          1.06745        9         0.434406              N/A     54.97s
   5    18.13          1.58507       19         0.418669              N/A     50.97s
   6    19.19          1.18485       35         0.431977              N/A      1.21m
   7    20.65          1.41678       27         0.415703              N/A      1.31m
   8    19.68          2.05735        8         0.405458              N/A  

# Training gplearn on the unstandardized Bank Notes dataset

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Re-create the same 60/40 split from X and y; this cell works on the
# values as split, without applying the scalers.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1)

# --- model definition and training ---

from gplearn.genetic import SymbolicRegressor

# Hyperparameters for this symbolic-regression run; const_range is None here.
raw_gp_settings = dict(
    population_size=1000,
    init_depth=(4, 6),
    generations=100,
    stopping_criteria=0.01,
    p_crossover=0.3,
    p_subtree_mutation=0.35,
    p_hoist_mutation=0.0,
    p_point_mutation=0.35,
    max_samples=1.0,
    verbose=1,
    const_range=None,
    tournament_size=5,
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log',
                  'abs', 'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan'),
    parsimony_coefficient=0.0001,
    random_state=0,
)

est = SymbolicRegressor(**raw_gp_settings)
est.fit(X_train, y_train)

# --- evaluation ---

# Predict on both splits so train and test scores can be compared.
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)

# Mean squared error on each split.
raw_train_mse = mean_squared_error(y_train, y_train_pred)
raw_test_mse = mean_squared_error(y_test, y_test_pred)
print('MSE train: %.3f, test: %.3f' % (raw_train_mse, raw_test_mse))

# Coefficient of determination on each split.
raw_train_r2 = r2_score(y_train, y_train_pred)
raw_test_r2 = r2_score(y_test, y_test_pred)
print('R^2 train: %.3f, test: %.3f' % (raw_train_r2, raw_test_r2))

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    17.38          22.1151       10         0.443625              N/A      1.13m
   1    13.62          6.92078       18         0.250898              N/A      1.40m
   2    13.35          1.45073       18         0.250898              N/A      1.29m
   3    14.03          1.51288       19         0.244336              N/A      1.11m
   4    13.64          1.37187       22         0.216357              N/A      1.15m
   5    13.82          1.61913       34         0.222307              N/A      1.11m
   6    17.93          1.23661       18         0.222307              N/A      1.27m
   7    22.65         0.816092       23         0.185269              N/A      1.70m
   8    24.06         0.858898       17         0.176855              N/A  

##  Analysis

gplearn achieves a higher R^2 score than LinearRegression, showing that it fits the data better. However, gplearn shows a larger gap between its training and test R^2 scores, suggesting that it slightly overfits.