# Defining, Training, and Testing Models - Clustering

![ml workflow](https://docs.google.com/drawings/d/e/2PACX-1vQ1XLwesZbm_TuDBPFRvbHa4XcjucvtExy3LXE05WnaAw-s6BDVQnnd4lAEUW1Qy6bs6FythuJdFVqP/pub?w=1165&h=662)

Let's try out some **Clustering**. A cluster or group is a collection of "similar" data points, and a clustering algorithm takes our raw data and outputs these groups. In our example, we will be trying to cluster transportation fleet information from a `fleet_data.csv` CSV. The data represents fleet drivers (via a Driver ID) and the mean distance driven per driver. The data also includes a "speeding feature," which is a percentage of time during which the driver is driving 5+ miles per hour faster than the speed limit. 

## Imports

In [None]:
import (
    "fmt"
    "os"
    "strings"
    "encoding/csv"
    "image/color"
    "image"
    "image/png"
    "io/ioutil"
    "strconv"
    
    "gonum.org/v1/gonum/floats"
    "gonum.org/v1/plot"
    "gonum.org/v1/plot/plotter"
    "gonum.org/v1/plot/plotutil"
    "gonum.org/v1/plot/vg"
    "gonum.org/v1/plot/vg/draw"
    "github.com/kniren/gota/dataframe"
    "github.com/mash/gokmeans"
)

## Convenience Functions

In [None]:
// GetGraph returns the raw bytes of the saved plot stored in the
// file named graphName.
//
// It returns a nil slice and the underlying error when the file
// cannot be opened or read.
func GetGraph(graphName string) ([]byte, error) {

    // Open the file.
    infile, err := os.Open(graphName)
    if err != nil {
        return nil, err
    }

    // Close the file on every return path, including a failed read.
    defer infile.Close()

    // Read in the contents of the file.
    contents, err := ioutil.ReadAll(infile)
    if err != nil {
        return nil, err
    }

    return contents, nil
}

In [None]:
// graphScatterFromColumns creates a scatter plot from 2 columns of
// dataset and saves it to "scatter_<columnA>_vs_<columnB>.png".
// columnA is represented on the y axis and columnB on the x axis.
//
// Errors are printed to stdout. When the plot or the scatter plotter
// cannot be created, the function returns without saving anything.
func graphScatterFromColumns(columnA, columnB string, dataset dataframe.DataFrame) {

    // Extract the y values.
    yVals := dataset.Col(columnA).Float()

    // pts will hold the values for plotting.
    pts := make(plotter.XYs, dataset.Nrow())

    // Fill pts with data.
    for i, floatVal := range dataset.Col(columnB).Float() {
        pts[i].X = floatVal
        pts[i].Y = yVals[i]
    }

    // Create the plot. There is nothing to draw on if this fails,
    // so stop here rather than dereferencing a nil plot.
    p, err := plot.New()
    if err != nil {
        fmt.Println(err)
        return
    }
    p.X.Label.Text = columnB
    p.Y.Label.Text = columnA
    p.Add(plotter.NewGrid())

    // Create the scatter plotter; stop if it could not be built.
    s, err := plotter.NewScatter(pts)
    if err != nil {
        fmt.Println(err)
        return
    }
    s.GlyphStyle.Color = color.RGBA{R: 255, B: 128, A: 255}
    s.GlyphStyle.Radius = vg.Points(3)

    // Save the plot to a PNG file.
    p.Add(s)
    if err := p.Save(4*vg.Inch, 4*vg.Inch, "scatter_"+columnA+"_vs_"+columnB+".png"); err != nil {
        fmt.Println(err)
    }
}

## Import the Data

In [None]:
// Open the file.
file, err := os.Open("../data/fleet_data.csv")
if err != nil {
    fmt.Println(err)
}

// Read the CSV data into a dataframe.
dataset := dataframe.ReadCSV(file)

// Close the file.
file.Close()

// Examine the parsed data.
fmt.Println(dataset)

## Profile the Data

In [None]:
// Output the summary statistics for this data set.
fmt.Println(dataset.Describe())

In [None]:
// Create a normalized, 16-bin histogram for each of the features in
// the dataset and save it to "<column name>_hist.png".
for _, colName := range dataset.Names() {

    // Don't bother with the driver ID.
    if colName == "Driver_ID" {
        continue
    }

    // Create a plotter.Values value and fill it with the
    // values from the respective column of the dataframe.
    plotVals := make(plotter.Values, dataset.Nrow())
    for i, floatVal := range dataset.Col(colName).Float() {
        plotVals[i] = floatVal
    }

    // Make a plot and set its title. Without a plot there is nothing
    // to draw on, so move on to the next column on failure.
    p, err := plot.New()
    if err != nil {
        fmt.Println("error creating plot", err)
        continue
    }
    p.Title.Text = fmt.Sprintf("Histogram of %s", colName)

    // Create a histogram of our values; skip this column if that fails
    // rather than using a nil histogram.
    h, err := plotter.NewHist(plotVals, 16)
    if err != nil {
        fmt.Println("error creating histogram", err)
        continue
    }

    // Normalize the histogram.
    h.Normalize(1)

    // Add the histogram to the plot.
    p.Add(h)

    // Save the plot to a PNG file.
    if err := p.Save(4*vg.Inch, 4*vg.Inch, colName+"_hist.png"); err != nil {
        fmt.Println("error saving", err)
    }
}

In [None]:
graphCal, err :=GetGraph("Distance_Feature_hist.png")
if err != nil {
    fmt.Println(err)
}
display.PNG(graphCal)

In [None]:
graphCar, err :=GetGraph("Speeding_Feature_hist.png")
if err != nil {
    fmt.Println(err)
}
display.PNG(graphCar)

In [None]:
// Create a scatter plot for every ordered pair of distinct columns,
// leaving the driver ID out of both axes.
for _, yCol := range dataset.Names() {
    if yCol == "Driver_ID" {
        continue
    }
    for _, xCol := range dataset.Names() {
        if xCol == "Driver_ID" || xCol == yCol {
            continue
        }
        graphScatterFromColumns(yCol, xCol, dataset)
    }
}

In [None]:
// Display one of the scatter plots.
graphCalVCar, err :=GetGraph("scatter_Distance_Feature_vs_Speeding_Feature.png")
if err != nil {
    fmt.Println(err)
}
display.PNG(graphCalVCar)

## Cluster the data with k-means

In [None]:
// Initialize a slice of gokmeans.Node's to
// hold our input data.
var data []gokmeans.Node

// Loop over the records creating our slice of
// gokmeans.Node's.
for _,record := range dataset.Select([]int{1,2}).Records(){
    
    // Read in our record and check for errors.
    if record[0] != "Distance_Feature"{
        
        // Initialize a point.
        var point []float64
        
        // Fill in our point.
        for i := 0; i < 2; i++ {
            
            // Parse the float value.
            val, err := strconv.ParseFloat(record[i], 64)
            if err != nil {
                fmt.Println(err)
            }
            
            // Append this value to our point.
            point = append(point, val)
        }
        
        // Append our point to the data.
        data = append(data, gokmeans.Node{point[0], point[1]})
    }
}

// Generate our clusters with k-means.
success, centroids := gokmeans.Train(data, 2, 50)
if !success {
   fmt.Println("Could not generate clusters")
}

// Output the centroids to stdout.
fmt.Println("The centroids for our clusters are:")
for _, centroid := range centroids {
    fmt.Println(centroid)
}

## Evaluating the generated clusters

### Evaluating the clusters visually

In [None]:
// Pull the distance column out of the dataframe.
yVals := dataset.Col("Distance_Feature").Float()

// clusterOne and clusterTwo collect the (distance, speeding) points
// assigned to the first and second centroid respectively.
var clusterOne [][]float64
var clusterTwo [][]float64

// Assign every point to the centroid it is closest to (Euclidean
// distance); ties go to the second cluster.
for i, xVal := range dataset.Col("Speeding_Feature").Float() {
    point := []float64{yVals[i], xVal}
    if floats.Distance(point, centroids[0], 2) < floats.Distance(point, centroids[1], 2) {
        clusterOne = append(clusterOne, point)
    } else {
        clusterTwo = append(clusterTwo, point)
    }
}

// toXYs converts a cluster into plottable values, with speeding on
// the x axis and distance on the y axis.
toXYs := func(cluster [][]float64) plotter.XYs {
    xys := make(plotter.XYs, len(cluster))
    for i := range cluster {
        xys[i].X = cluster[i][1]
        xys[i].Y = cluster[i][0]
    }
    return xys
}
ptsOne := toXYs(clusterOne)
ptsTwo := toXYs(clusterTwo)

// Create the plot.
p, err := plot.New()
if err != nil {
    fmt.Println(err)
}
p.X.Label.Text = "Speeding"
p.Y.Label.Text = "Distance"
p.Add(plotter.NewGrid())

// The first cluster is drawn in red.
sOne, err := plotter.NewScatter(ptsOne)
if err != nil {
    fmt.Println(err)
}
sOne.GlyphStyle.Radius = vg.Points(3)
sOne.GlyphStyle.Color = color.RGBA{R: 255, A: 255}

// The second cluster keeps the plotter's default color.
sTwo, err := plotter.NewScatter(ptsTwo)
if err != nil {
    fmt.Println(err)
}
sTwo.GlyphStyle.Radius = vg.Points(3)

// Add both clusters and save the plot to a PNG file.
p.Add(sOne, sTwo)
if err := p.Save(4*vg.Inch, 4*vg.Inch, "fleet_data_clusters.png"); err != nil {
    fmt.Println(err)
}

In [None]:
// Display the visual representation of the clusters.
graphCalVCar, err :=GetGraph("fleet_data_clusters.png")
if err != nil {
    fmt.Println(err)
}
display.PNG(graphCalVCar)

### Evaluating the clusters quantitatively 

In [None]:
// withinClusterMean returns the mean Euclidean distance between the
// points of cluster and centroid. It returns 0 for an empty cluster.
func withinClusterMean(cluster [][]float64, centroid []float64) float64 {

    // Each point contributes its distance divided by the cluster size.
    size := float64(len(cluster))

    var mean float64
    for i := range cluster {
        mean += floats.Distance(cluster[i], centroid, 2) / size
    }

    return mean
}

In [None]:
// Output our within cluster metrics.
fmt.Printf("\nCluster 1 Metric: %0.2f\n", withinClusterMean(clusterOne, centroids[0]))
fmt.Printf("\nCluster 2 Metric: %0.2f\n", withinClusterMean(clusterTwo, centroids[1]))

# Exercise: k-means on digits

Try the k-means algorithm on some [simple digits data](http://archive.ics.uci.edu/ml/datasets/Pen-Based+Recognition+of+Handwritten+Digits). The digits data consist of 1,797 samples (PNG images of digits) with 64 features, where each of the 64 features is the brightness of one pixel in an 8×8 image. 

- Load and parse the data (this is done for you already),
- then find k-means clusters (hint: there should be 10, one for each digit in the dataset),
- then evaluate the clusters visually and/or quantitatively.

## Data import and parsing

The data is in the file `optImg.zip` in the data folder. Unzip this file. Then define the following functions to import each digit image and vectorize it:

In [None]:
// loadPng opens the file named imageName and decodes it as a PNG image.
//
// It returns a nil image and a non-nil error when the file cannot be
// opened or when its contents cannot be decoded.
func loadPng(imageName string) (image.Image, error) {

    // Report an open failure to the caller instead of decoding a nil file.
    imgfile, err := os.Open(imageName)
    if err != nil {
        return nil, err
    }
    defer imgfile.Close()

    return png.Decode(imgfile)
}

In [None]:
// vectorizeImage flattens the 8x8 pixel region at the origin of img
// into a vector of intensities in row-major order (index = row*8 +
// column). The last element, at index 64, holds number.
func vectorizeImage(img image.Image, number int) [65]int {
    const side = 8

    var vec [65]int
    for idx := 0; idx < side*side; idx++ {
        row, col := idx/side, idx%side

        // RGBA yields 16-bit channel values; alpha is ignored.
        r, g, b, _ := img.At(col, row).RGBA()

        // Weighted sum of the channels, scaled down by 4095 and
        // truncated, which gives 0 to 16 for opaque pixels.
        vec[idx] = int((0.299*float64(r) + 0.587*float64(g) + 0.114*float64(b)) / 4095)
    }
    vec[side*side] = number
    return vec
}

In [None]:
// Here we need to create the dataset. Each image should be a
// row in the data set, and there should be 8x8=64 columns
// representing each pixel of the image, followed by one column
// holding the digit parsed from the file name.

// List the files in the image directory.
files, err := ioutil.ReadDir("../data/optImg/")
if err != nil {
    fmt.Println(err)
}

// matrix holds one 65-value row per image.
var matrix [][65]int

// Fill the matrix with the vectorized image data.
for _, f := range files {

    // Only parse PNG images.
    if !strings.HasSuffix(f.Name(), ".png") {
        continue
    }

    // Load the PNG; skip the file if it cannot be decoded.
    img, err := loadPng("../data/optImg/" + f.Name())
    if err != nil {
        fmt.Println(err)
        continue
    }

    // The digit is the number between the first "-" and the following
    // "." of the file name. Skip files that do not follow that pattern.
    nameParts := strings.Split(f.Name(), "-")
    if len(nameParts) < 2 {
        fmt.Println("no digit label in file name", f.Name())
        continue
    }
    number, err := strconv.Atoi(strings.Split(nameParts[1], ".")[0])
    if err != nil {
        fmt.Println(err)
        continue
    }

    // Vectorize the image and append the result to the matrix.
    matrix = append(matrix, vectorizeImage(img, number))
}

// Look at an example row in the data.
if len(matrix) > 0 {
    fmt.Println(matrix[0])
} else {
    fmt.Println("no images were loaded")
}

## Find k-means clusters

Find 10 clusters (one for each digit in the data set) using k-means. 

In [None]:
// Initialize a slice of gokmeans.Node's to
// hold our input data.
var data []gokmeans.Node

// Loop over the records creating our slice of
// gokmeans.Node's.
for _,record := range matrix {
    
    // Read in our record and check for errors.
    // Initialize a point.
    var point []float64
    
    // Fill in our point.
    for i := 0; i < 64; i++ {
        
        // Parse the float value.
        val:= float64(record[i])
      
        // Append this value to our point.
        point = append(point, val)
    }
    
    // Append our point to the k-means data.
    data = append(data, point)
}

// Generate our clusters with k-means. 10 clusters, 1000 iterations.
success, centroids := gokmeans.Train(data, 10, 1000)
if !success {
   fmt.Println("Could not generate clusters")
}


In [None]:
// Print out one of the centroids to stdout.
fmt.Println(centroids[0])

In [None]:
// now we are going to build a confusion matrix.
var confusion [10][10]int
for index,record := range matrix {
    // Read in our record and check for errors.
    // Initialize a point.
    var point []float64
    // Fill in our point.
    for i := 0; i < 64; i++ {
        // Parse the float value.
        val:= float64(record[i])
        // Append this value to our point.
         point = append(point, val)
    }
    clusterIndex := gokmeans.Nearest(point, centroids) 
    expectedIndex := record[64]
    confusion[clusterIndex][expectedIndex]++
}

In [None]:
// Print the confusion matrix: one line per cluster index, one
// column per expected digit.
fmt.Println("\t Expected ")
fmt.Printf("\t0\t1\t2\t3\t4\t5\t6\t7\t8\t9 \n")
for i := range confusion {

    // Collect the row label followed by the ten counts of the row.
    row := []interface{}{i}
    for _, count := range confusion[i] {
        row = append(row, count)
    }
    fmt.Printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", row...)
}
