# Simple Kotlin neural network for Auto-Mpg Dataset

In [1]:
// dependencies for KotlinDL
@file:Repository("https://kotlin.bintray.com/kotlin-datascience")
@file:DependsOn("org.jetbrains.kotlin-deeplearning:api:[0.1.1]")
// pandas-inspired library to work with data frames
%use krangl
// library for plotting
%use lets-plot
// these two libraries are already in kotlin jupyter kernel

### Data Preprocessing

The raw Auto MPG dataset is not provided in any standard format, so we define a small helper for parsing it:

In [2]:
// Relative path to the raw Auto MPG data file that the loading cell below reads line by line.
val path = "data/auto-mpg.data"

In [3]:
/**
 * Parses one line of the raw dataset file into its numeric fields.
 *
 * Only the text before the first tab is considered (whatever follows the tab,
 * presumably the quoted car name, is discarded). That text is trimmed, split on
 * runs of whitespace, and every token is converted to a [Double].
 *
 * @param string a single raw line from the dataset file
 * @return the numeric values in the order they appear on the line, or `null`
 *         when the line is blank or contains the missing-value marker `?`
 * @throws NumberFormatException if a token is neither `?` nor a valid number
 */
fun parseDataString(string: String): List<Double>? {
    // Trim first so leading/trailing whitespace cannot produce empty tokens.
    val numericPart = string.substringBefore('\t').trim()

    // A blank line carries no data; skip it instead of failing on "".toDouble().
    if (numericPart.isEmpty()) {
        return null
    }

    val tokens = numericPart.split(Regex("\\s+"))

    // Ignore rows with any missing values.
    if ("?" in tokens) {
        return null
    }

    return tokens.map { it.toDouble() }
}

In [4]:
import java.io.File

// Flat buffer holding the numeric values of every successfully parsed line, in file order.
val dataList = mutableListOf<Double>()

File(path).forEachLine { line ->
    // parseDataString returns null for lines that must be skipped; those add nothing.
    parseDataString(line)?.let { rowValues -> dataList.addAll(rowValues) }
}

Create a krangl DataFrame:

In [5]:
// Column headers, listed in the order the values appear within each parsed line.
val colNames = listOf(
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
)

// Build the frame from the flat value list using the headers above.
val df: DataFrame = dataFrameOf(colNames)(dataList)

In [6]:
// Preview the first rows of the frame to check that parsing produced the expected columns.
df.head()

mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


Rearrange the rows in random order:

In [7]:
// Draw as many rows as the frame contains (df.nrow), which reorders the rows randomly.
// NOTE(review): this is a pure shuffle only if sampleN samples without replacement by default — confirm.
val dfSample = df.sampleN(df.nrow)

### Data normalization

Normalize feature columns:

In [8]:
/**
 * Standardizes a column: subtracts the column mean and divides by its standard deviation.
 *
 * Fails with a NullPointerException if the mean or the standard deviation is null.
 */
fun normalizeCol(col: DoubleCol): DataCol {
    val mean = col.mean()!!
    val sd = col.sd()!!
    val centered = col - mean
    return centered / sd
}

// Normalize every column except the first one ("mpg", the target) and assemble a new frame.
// Every remaining column is cast to DoubleCol before normalization.
val dfNormed = dataFrameOf(
    columns = dfSample.cols
        .drop(1)
        .map { featureCol -> normalizeCol(featureCol as DoubleCol) }
        .toTypedArray()
)

dfNormed.head()

tmp_col_57495247-e86c-4614-b8fc-a40b2c827e9d,tmp_col_3fa703d1-4f6e-41c6-b230-faa718f3dffc,tmp_col_6762b213-cc7f-4faf-9e03-400e3768b0c7,tmp_col_e7d7274b-db03-4ece-bc93-f25faa1eca61,tmp_col_89a471b6-4d62-4c98-8eb4-977d3f19c982,tmp_col_e4e67842-c973-478f-a5bd-286e5a9053dc,tmp_col_f737e930-e3f6-4e99-862b-2393c2bcfede
1.483947024779763,2.483847544975299,3.0053200647098817,1.6225225584708765,-2.374051701637827,-1.6253153340187592,-0.7166410451853089
0.3099667347420896,0.6084368954910576,-0.246329007247021,0.2539327267932948,0.819743146646186,0.0055471513106429,-0.7166410451853089
0.3099667347420896,0.5318895220427213,-0.8446324364870911,0.2126746353990525,1.4367262423374154,-0.266263262910924,-0.7166410451853089
1.483947024779763,1.4983001118279682,1.1323701992627055,1.9796997496838888,-0.7408611542198661,-0.266263262910924,-0.7166410451853089
-0.8640135552955837,-0.520636862871904,-0.8446324364870911,-0.6714273230489975,1.255260625957642,-1.3535049197971922,-0.7166410451853089


### Neural network

First, let's put the data (features and targets) in the form `Array<FloatArray>`:

In [9]:
// Features: one FloatArray per row of the normalized frame.
val x = dfNormed.rows
    .map { row -> row.values.map { cell -> (cell as Double).toFloat() }.toFloatArray() }
    .toTypedArray()
// Targets: one single-element FloatArray per row, taken from the "mpg" column of the shuffled frame.
val y = dfSample.select("mpg").rows
    .map { row -> row.values.map { cell -> (cell as Double).toFloat() }.toFloatArray() }
    .toTypedArray()

Now we can create KotlinDL `Dataset` :

In [10]:
import org.jetbrains.kotlinx.dl.datasets.Dataset
// Dataset.create is given two lambdas that return the feature array and the target array.
val dataset = Dataset.create({x}, {y})

Split the dataset into a training set and a test set in a ratio of 9:1 :

In [11]:
// Destructure the result of the split; with splitRatio = 0.9 the first part (train) gets about 90% of the samples.
val (train, test) = dataset.split(splitRatio = 0.9)

In [12]:
// Number of samples in the training part.
train.xSize()

352

In [13]:
// Number of samples in the test part.
test.xSize()

40

Create a neural network model with two hidden dense layers and a linear output layer:

In [14]:
import org.jetbrains.kotlinx.dl.api.core.Sequential
import org.jetbrains.kotlinx.dl.api.core.layer.*
import org.jetbrains.kotlinx.dl.api.core.activation.Activations

// Feed-forward regressor: 7 inputs (one per normalized feature column),
// two 64-unit dense layers, and a single linear output unit for the predicted mpg.
val model = Sequential.of(
    Input(7),
    Dense(64), // no activation given, so the library default applies (ReLU per the original author's note)
    Dense(64),
    Dense(1, activation=Activations.Linear)
)

Compile it ...

In [15]:
import org.jetbrains.kotlinx.dl.api.core.loss.Losses
import org.jetbrains.kotlinx.dl.api.core.metric.Metrics
import org.jetbrains.kotlinx.dl.api.core.optimizer.RMSProp

// Compile with an RMSProp optimizer created with its default arguments;
// mean absolute error serves both as the training loss and as the reported metric.
model.compile(
    optimizer = RMSProp(),
    loss = Losses.MAE,
    metric = Metrics.MAE
)

In [16]:
// Show each layer with its output shape and parameter count.
model.summary()

[dense_1(Dense)               [None, 64]                512, dense_2(Dense)               [None, 64]                4160, dense_3(Dense)               [None, 1]                 65]

... and fit with train data :

In [17]:
// Train on the training part for 100 epochs with batches of 32 samples.
// The call is the last expression of the cell, so the notebook displays the returned history object.
model.fit(
    dataset = train,
    epochs = 100,
    batchSize = 32,
)

org.jetbrains.kotlinx.dl.api.core.history.TrainingHistory@10f6f870

Check the test set performance :

In [18]:
// Evaluate on the held-out test part and look up the MAE entry in the returned metrics.
val mae = model.evaluate(
    dataset = test,
    batchSize = 32
).metrics[Metrics.MAE]

// Last expression of the cell: the notebook displays the formatted string.
"MAE: $mae"

MAE: 1.6794047355651855

Take a look at the error distribution. Draw histogram with lets-plot :

In [19]:
// Recover the raw test samples from the original arrays.
// NOTE(review): assumes dataset.split placed the trailing rows of x/y into the test part — confirm.
val testSize = test.xSize()
val xTest = x.takeLast(testSize)
val yTest = y.takeLast(testSize)

// Signed prediction error (predicted - actual) for every test sample.
// Lambda parameters are named so they do not shadow the top-level x / y arrays.
val errList = xTest.zip(yTest)
    .map { (features, target) -> model.predictSoftly(features)[0] - target[0] }
    .toMutableList()

// The plotted value is signed, so it is labelled as a prediction error rather than an absolute error.
// One constant is shared by the data key and the aesthetic mapping so the two cannot drift apart.
val errorLabel = "Prediction error"
val err = mapOf(errorLabel to errList)

val hist = lets_plot(err) { x = errorLabel } + ggsize(500, 250)
hist + geom_histogram(binWidth=0.5)