# Engineering

### Feature Engineering - Row

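We calculate summary statistics for each row (minimum, maximum, standard deviation, mean, median, the 2.5%, 25%, 75% and 97.5% quantiles, and the interquartile range) and then drop the 256 raw readings.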


In [25]:
import pandas as pd


def engineering_rows(filename, n_readings=256):
    """Reduce every row of a recording file to summary statistics.

    The file is read as a space-separated table without a header line. Its
    columns are named "Id", "Alcoholic", "Paradigm", "Replication" and
    "Channel", followed by "Reading 1" ... "Reading <n_readings>".

    Parameters
    ----------
    filename : str or path-like
        File handed to ``pandas.read_csv``.
    n_readings : int, default 256
        Number of reading columns that follow the five metadata columns.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the five metadata columns plus "Min",
        "Max", "Std", "Mean", "Median", "Quantile025", "Quantile25",
        "Quantile75", "Quantile975" and "IQR". The reading columns are
        dropped.
    """
    meta_columns = ["Id", "Alcoholic", "Paradigm", "Replication", "Channel"]
    reading_columns = ["Reading " + str(i + 1) for i in range(n_readings)]
    data = pd.read_csv(
        filename, sep=" ", header=None, names=meta_columns + reading_columns
    )

    # Slice the readings once; every statistic below is computed per row
    # (axis=1) over this same set of columns.
    readings = data[reading_columns]

    data["Min"] = readings.min(axis=1)
    data["Max"] = readings.max(axis=1)
    data["Std"] = readings.std(axis=1)
    data["Mean"] = readings.mean(axis=1)
    data["Median"] = readings.median(axis=1)
    data["Quantile025"] = readings.quantile(0.025, axis=1)
    data["Quantile25"] = readings.quantile(0.25, axis=1)
    data["Quantile75"] = readings.quantile(0.75, axis=1)
    data["Quantile975"] = readings.quantile(0.975, axis=1)
    # Interquartile range: spread of the middle half of the readings.
    data["IQR"] = data["Quantile75"] - data["Quantile25"]

    return data.drop(columns=reading_columns)


# Run the row-wise reduction on a single recording file.
data_rows = engineering_rows("results/co2a0000364.txt")
# Bare last expression: the notebook renders the DataFrame as the cell output.
data_rows

Unnamed: 0,Id,Alcoholic,Paradigm,Replication,Channel,Min,Max,Std,Mean,Median,Quantile025,Quantile25,Quantile75,Quantile975,IQR
0,co2a0000364,a,S1obj,0,FP1chan0,-13.316,19.887,6.707825,4.113535,4.7510,-8.738000,-0.62100,8.16900,16.958000,8.79000
1,co2a0000364,a,S1obj,0,FP2chan1,-14.303,24.760,7.977130,3.819156,4.7400,-12.471625,-1.60700,8.64700,20.854000,10.25400
2,co2a0000364,a,S1obj,0,F7chan2,-30.589,31.423,9.714502,5.696625,5.7880,-12.827625,-0.31500,12.86800,22.939000,13.18300
3,co2a0000364,a,S1obj,0,F8chan3,-19.684,27.679,10.524552,2.187758,0.8240,-15.594625,-6.50000,10.10100,20.965625,16.60100
4,co2a0000364,a,S1obj,0,AF1chan4,-8.494,15.432,4.672660,3.346762,3.7130,-5.869625,0.17300,6.76500,11.525000,6.59200
5,co2a0000364,a,S1obj,0,AF2chan5,-11.078,14.801,4.875914,3.770840,4.0590,-7.110625,0.64100,6.98900,12.360000,6.34800
6,co2a0000364,a,S1obj,0,FZchan6,-6.419,9.694,3.341052,2.316797,2.3700,-3.977000,-0.07100,4.81200,8.535000,4.88300
7,co2a0000364,a,S1obj,0,F4chan7,-7.823,14.638,4.621874,2.215707,1.9430,-5.870000,-0.98700,5.36100,10.732000,6.34800
8,co2a0000364,a,S1obj,0,F3chan8,-7.416,17.487,4.747063,4.911406,5.5240,-3.510000,0.88500,7.84300,15.045000,6.95800
9,co2a0000364,a,S1obj,0,FC6chan9,-10.305,15.086,5.054547,1.816441,1.4140,-7.375000,-1.51600,5.32000,12.644000,6.83600


Using the reduction by rows, we maintain the same number of rows, but each row now has only 15 columns.

Then we save the data into another file, appending the data instead of overwriting it.

In [26]:
import os

filepath = "engineered/test_alcoholic_rows.csv"

# Derive the output directory from the file path so the two cannot drift
# apart. exist_ok=True makes this a no-op when the directory already exists
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# mode="a" appends to any existing file and header=False writes no column
# names, so re-running this cell adds the same rows to the file again.
data_rows.to_csv(filepath, header=False, index=False, mode="a")

### Feature Engineering - Columns

We group the rows by "Id", "Alcoholic", "Paradigm" and "Channel" and take the mean of each reading across all replications in the group.


In [27]:
import pandas as pd


def engineering_columns(filename, n_readings=256):
    """Average the readings of a recording file over its replications.

    The file is read as a space-separated table without a header line. Its
    columns are named "Id", "Alcoholic", "Paradigm", "Replication" and
    "Channel", followed by "Reading 1" ... "Reading <n_readings>". Rows are
    grouped by "Id", "Alcoholic", "Paradigm" and "Channel", and each reading
    column is averaged within its group.

    Parameters
    ----------
    filename : str or path-like
        File handed to ``pandas.read_csv``.
    n_readings : int, default 256
        Number of reading columns that follow the five metadata columns.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the four grouping columns followed by the
        mean of every reading column. "Replication" is neither a grouping
        key nor an aggregated column, so it does not appear in the result.
    """
    meta_columns = ["Id", "Alcoholic", "Paradigm", "Replication", "Channel"]
    reading_columns = ["Reading " + str(i + 1) for i in range(n_readings)]
    data = pd.read_csv(
        filename, sep=" ", header=None, names=meta_columns + reading_columns
    )

    group_keys = ["Id", "Alcoholic", "Paradigm", "Channel"]
    mean_table = data.groupby(group_keys)[reading_columns].mean()

    # Turn the grouping keys back into ordinary columns.
    return mean_table.reset_index()


# Run the per-group averaging on a single recording file.
data_columns = engineering_columns("results/co2a0000364.txt")
# Bare last expression: the notebook renders the DataFrame as the cell output.
data_columns

Unnamed: 0,Id,Alcoholic,Paradigm,Channel,Reading 1,Reading 2,Reading 3,Reading 4,Reading 5,Reading 6,...,Reading 247,Reading 248,Reading 249,Reading 250,Reading 251,Reading 252,Reading 253,Reading 254,Reading 255,Reading 256
0,co2a0000364,a,S1obj,AF1chan4,-0.671778,-0.611457,-0.557111,-0.569358,-0.484889,-0.448716,...,4.000099,3.867420,3.993975,4.036160,3.692679,3.276568,2.963185,2.981346,3.385173,3.752877
1,co2a0000364,a,S1obj,AF2chan5,-0.216185,-0.258358,-0.427086,-0.372864,-0.077519,0.127494,...,3.111370,2.755704,2.773790,2.990815,3.189790,3.135519,2.954704,2.858247,3.045025,3.412877
2,co2a0000364,a,S1obj,AF7chan32,-2.698914,-1.469185,0.098049,1.110852,1.038506,0.152358,...,7.512741,7.440358,7.440407,7.235444,6.716926,6.035889,5.505346,5.649951,6.144247,6.632617
3,co2a0000364,a,S1obj,AF8chan33,-0.272395,-1.405654,-1.580457,-0.694370,0.444914,0.577556,...,7.847506,6.430901,4.278951,2.693457,2.223210,2.934654,4.122123,5.381988,6.352654,6.804667
4,co2a0000364,a,S1obj,AFZchan47,-0.453519,-0.459543,-0.513815,-0.344914,-0.133988,-0.043519,...,3.531037,3.314049,3.338321,3.422716,3.416543,3.308074,3.103086,2.940346,3.024728,3.229667
5,co2a0000364,a,S1obj,C1chan52,0.751173,0.082037,0.202642,0.292988,0.968136,0.220605,...,-0.701667,-0.376074,-0.400185,-0.466580,-1.828963,-0.008309,-0.496667,-0.321889,-0.520827,0.057864
6,co2a0000364,a,S1obj,C2chan53,0.399321,-0.348173,-0.161247,-0.342136,0.091926,0.091963,...,0.315037,0.236605,0.146136,0.152210,-0.884679,-0.113074,0.327025,-0.058778,0.809210,-0.185370
7,co2a0000364,a,S1obj,C3chan16,0.727037,-0.086716,0.443704,-0.231469,1.010321,-0.894580,...,-1.804778,-0.755914,-0.490630,0.033728,-0.719667,-0.147074,-1.177852,-0.538988,-1.491370,0.311074
8,co2a0000364,a,S1obj,C4chan17,-0.293753,-0.956852,-0.504790,-0.993000,-0.402222,0.230691,...,0.532160,0.688877,-0.046679,-0.360062,1.562901,-0.745827,1.454407,0.134321,1.575012,-0.872457
9,co2a0000364,a,S1obj,C5chan42,0.182074,0.121914,0.224309,0.495654,0.616148,0.242358,...,-2.343679,-1.939864,-1.041580,-0.493086,-0.872802,-1.668531,-2.126691,-1.837321,-1.138136,-0.824580


Using the reduction by columns, we greatly reduce the number of rows, but we maintain the same number of columns, excluding the one that counted the repetition of the experiment.

Then we save the data into another file, appending the data instead of overwriting it.

In [28]:
filepath = "engineered/test_alcoholic_columns.csv"

# Derive the output directory from the file path so the two cannot drift
# apart. exist_ok=True makes this a no-op when the directory already exists
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# mode="a" appends to any existing file and header=False writes no column
# names, so re-running this cell adds the same rows to the file again.
data_columns.to_csv(filepath, header=False, index=False, mode="a")