# Types of drivers in USA

### Import libraries

In [1]:
import numpy as np
import pandas as p
import matplotlib.pyplot as plt

### Load data from csv file

In [2]:
dataset = p.read_csv('speeding_feature_data.csv')

### Extract features from data
First see how the data looks like

In [3]:
dataset.head()

Unnamed: 0,Driver_ID,Distance_Feature,Speeding_Feature
0,3423311935,71.24,28
1,3423313212,52.53,25
2,3423313724,64.54,27
3,3423311373,55.69,22
4,3423310999,54.58,25


In [4]:
dataset.tail()

Unnamed: 0,Driver_ID,Distance_Feature,Speeding_Feature
3995,3423310685,160.04,10
3996,3423312600,176.17,5
3997,3423312921,170.91,12
3998,3423313630,176.14,5
3999,3423311533,168.03,9


We saw there are only two features in our dataset

In [5]:
features = ['Distance_Feature','Speeding_Feature']
x = dataset[features]

### Split the data into two parts
First import the train_test_split function from sklearn library

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test = train_test_split(x, test_size=0.5,random_state=5)

### Select a model
Since this is an unsupervised clustering problem, we will use KMeans algorithm to train our model. We want four types of drivers, so we will speicfy n_clusters=4

In [8]:
from sklearn.cluster import KMeans

In [9]:
model = KMeans(n_clusters=4)

### Train the model with fit function
In unsupervised learning, we do not give labels but only features and let the model form clusters of data

In [10]:
model.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

### View labels returned by our model
KMeans will assign labels to the clusters. These labels are stored in its labels_  attribute

In [11]:
labels = model.labels_
labels

array([2, 1, 1, ..., 1, 1, 1], dtype=int32)

### Get unique labels
So we will use numpy.unique() function that will give us only unique labels

In [12]:
np.unique(labels)

array([0, 1, 2, 3], dtype=int32)

### Test the model

In [13]:
y_pred = model.predict(X_test)

In [14]:
y_pred

array([1, 0, 1, ..., 2, 2, 2], dtype=int32)