## Apress - Industrialized Machine Learning Examples

Andreas Francois Vermeulen
2019

### This is an example add-on to a book and needs to be accepted as part of that copyright.

# Chapter-015-08-Build-RAPTOR-Mars-02

## Supervised Learning

![RAPTOR-QUBE](../../images/raptorqube/raptorqube-full.bmp)

In [1]:
import datetime
nowStart = datetime.datetime.now()

In [2]:
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, matthews_corrcoef, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
%matplotlib inline

In [3]:
def make_meshgrid(x, y, h=1):
    x_min, x_max = int(x.min()) - 1, int(x.max()) + 1
    y_min, y_max = int(y.min()) - 1, int(y.max()) + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy

In [4]:
def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out    

In [5]:
pathDL='../../Results/Chapter 15/999-DL/'

![Processor](../../images/Processor.JPG)

![RIF Functional Layer - Transform Step](../../images/RIF-FL-TRF.JPG) ![Curated Zone](../../images/DL-CUZ.JPG)

In [6]:
objectLinkName = os.path.join(pathDL,'300-Curated-Zone', 'Link', 'Object-Object', 'Link-Object-LoadBay-MarsHopper-Ore.csv.gz')

In [7]:
object1DF=pd.read_csv(objectLinkName, header=0, encoding='utf-8', compression='gzip')
object1DF.index.name = 'ID'

In [8]:
print(object1DF.columns)

Index([u'ID', u'ObjectHopper', u'ObjectOre', u'LoadBayID', u'Tonnage'], dtype='object')


In [9]:
print(object1DF.describe())

                 ID     LoadBayID       Tonnage
count  2.919739e+06  2.919739e+06  2.919739e+06
mean   1.459869e+06  2.501647e+00  5.997769e+01
std    8.428562e+05  1.118402e+00  2.339773e+01
min    0.000000e+00  1.000000e+00  2.000000e+01
25%    7.299345e+05  2.000000e+00  4.000000e+01
50%    1.459869e+06  3.000000e+00  6.000000e+01
75%    2.189804e+06  4.000000e+00  8.000000e+01
max    2.919738e+06  4.000000e+00  1.000000e+02


In [10]:
print(object1DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919739 entries, 0 to 2919738
Data columns (total 5 columns):
ID              int64
ObjectHopper    object
ObjectOre       object
LoadBayID       int64
Tonnage         int64
dtypes: int64(3), object(2)
memory usage: 111.4+ MB
None


In [11]:
print(object1DF.head(20))

    ID ObjectHopper  ObjectOre  LoadBayID  Tonnage
ID                                                
0    0  Hopper-0001   aluminum          1       78
1    1  Hopper-0001   aluminum          1       78
2    2  Hopper-0002   aluminum          1       64
3    3  Hopper-0002   aluminum          1       64
4    4  Hopper-0002   aluminum          1       64
5    5  Hopper-0001   aluminum          1       78
6    6  Hopper-0003  magnesium          4       55
7    7  Hopper-0003  magnesium          4       55
8    8  Hopper-0004   aluminum          4       54
9    9  Hopper-0004   aluminum          4       54
10  10  Hopper-0005   aluminum          2       45
11  11  Hopper-0005   aluminum          2       45
12  12  Hopper-0005   aluminum          2       45
13  13  Hopper-0006    calcium          4       41
14  14  Hopper-0006    calcium          4       41
15  15  Hopper-0007    silicon          3       27
16  16  Hopper-0007    silicon          3       27
17  17  Hopper-0007    silicon 

![Processor](../../images/hopper.bmp)

In [12]:
def loadrate(t):
    rate = int(t/60)
    return rate

In [13]:
objectSub1DF=object1DF[['ObjectOre','LoadBayID','Tonnage']].copy(deep=True)
objectSub1DF.drop_duplicates(subset=None, keep='first', inplace=True)
objectSub1DF.columns=['OreName','MineID','Tonnage']
objectSub1DF['TonnageRate'] = objectSub1DF.apply(lambda row: loadrate(row['Tonnage']), axis=1)
objectSub1DF.drop('Tonnage', axis=1, inplace=True)
objectSub1DF.drop_duplicates(subset=None, keep='first', inplace=True)

In [14]:
print(objectSub1DF.head(10))

      OreName  MineID  TonnageRate
ID                                
0    aluminum       1            1
6   magnesium       4            0
8    aluminum       4            0
10   aluminum       2            0
13    calcium       4            0
15    silicon       3            0
22  magnesium       2            0
28       iron       4            1
30       iron       3            1
34    silicon       3            1


In [15]:
objectSatelliteName = os.path.join(pathDL,'300-Curated-Zone', 'Satellite', 'Object', 'Satellite-Object-Ore.csv.gz')

In [16]:
object2DF=pd.read_csv(objectSatelliteName, header=0, encoding='utf-8', compression='gzip')
object2DF.index.name = 'ID'

In [17]:
print(object2DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
ObjectName    6 non-null object
ObjectType    6 non-null object
dtypes: object(2)
memory usage: 168.0+ bytes
None


In [18]:
print(object2DF.head(20))

   ObjectName ObjectType
ID                      
0    aluminum        Ore
1   magnesium        Ore
2     calcium        Ore
3     silicon        Ore
4        iron        Ore
5   potassium        Ore


In [19]:
objectSub2DF=object2DF[['ObjectName']].copy(deep=True)
objectSub2DF.drop_duplicates(subset=None, keep='first', inplace=True)
objectSub2DF.reset_index(inplace=True)
objectSub2DF.columns=['OreID','OreName']
objectSub2DF.index.name = 'ID'

In [20]:
print(objectSub2DF.head(20))

    OreID    OreName
ID                  
0       0   aluminum
1       1  magnesium
2       2    calcium
3       3    silicon
4       4       iron
5       5  potassium


In [21]:
print(objectSub2DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
OreID      6 non-null int64
OreName    6 non-null object
dtypes: int64(1), object(1)
memory usage: 168.0+ bytes
None


In [22]:
objectDF=pd.merge(left=objectSub1DF, right=objectSub2DF, on='OreName')
objectDF.drop('OreName', axis=1, inplace=True)
objectDF.drop_duplicates(subset=None, keep='first', inplace=True)

In [23]:
print(objectDF.columns)

Index([u'MineID', u'TonnageRate', u'OreID'], dtype='object')


## Training Set

In [24]:
#objectTrainDF=objectDF.sample(n=1000, replace=True)
#objectTrainDF.drop_duplicates(subset=None, keep='first', inplace=True)

In [25]:
objectTrainDF=objectDF.sample(frac=0.7, replace=False)
#objectTrainDF.drop_duplicates(subset=None, keep='first', inplace=True)

In [26]:
print(objectTrainDF.shape)

(34, 3)


In [27]:
Xtrain=np.array(objectTrainDF[['OreID','MineID']])
ytrain=np.array(objectTrainDF[['TonnageRate'][0]])

In [28]:
print(Xtrain.shape)

(34L, 2L)


In [29]:
print(ytrain.shape)

(34L,)


## Test Set

In [30]:
objectTestDF=objectDF.sample(frac=0.3, replace=True)
#objectTestDF.drop_duplicates(subset=None, keep='first', inplace=True)

In [31]:
print(objectTestDF.shape)

(14, 3)


In [32]:
Xtest=np.array(objectTestDF[['OreID','MineID']])
ytest=np.array(objectTestDF[['TonnageRate'][0]])

In [33]:
print(Xtest.shape)

(14L, 2L)


In [34]:
print(ytest.shape)

(14L,)


![Processor](../../images/Processor.JPG)

In [35]:
clf = LinearSVC(random_state=0,
                max_iter=20000,
                dual=True)

In [36]:
clf.fit(Xtrain, ytrain)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=20000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [37]:
print('------------------------------------------------')
print('Coefficient:', clf.coef_)
print('------------------------------------------------')
print('Intercept', clf.intercept_)
print('------------------------------------------------')
print('Score', clf.score(Xtrain, ytrain))
print('------------------------------------------------')  

------------------------------------------------
('Coefficient:', array([[-0.0182217 ,  0.11647044]]))
------------------------------------------------
('Intercept', array([-0.10997693]))
------------------------------------------------
('Score', 0.5882352941176471)
------------------------------------------------


In [38]:
IMLresult=np.array(clf.predict(Xtest))

In [39]:
print(IMLresult.shape)

(14L,)


In [40]:
print('------------------------------------------------')
print('Score - Test Data', clf.score(Xtest, ytest))
print('------------------------------------------------')

------------------------------------------------
('Score - Test Data', 0.35714285714285715)
------------------------------------------------


In [41]:
print('Test Data Run')
for i in range(IMLresult.shape[0]):
    if ytest[0]==IMLresult[0]:
        outcome='Hit!'
    else:
        outcome='Miss?'
    print('%04d - %04d ore with %02d mine => %0.4f true rate => %0.4f predicted - %s' % (i+1, Xtest[i][0],Xtest[i][1],ytest[i],IMLresult[i],outcome))

Test Data Run
0001 - 0002 ore with 03 mine => 0.0000 true rate => 1.0000 predicted - Miss?
0002 - 0005 ore with 02 mine => 1.0000 true rate => 1.0000 predicted - Miss?
0003 - 0002 ore with 04 mine => 0.0000 true rate => 1.0000 predicted - Miss?
0004 - 0001 ore with 01 mine => 0.0000 true rate => 0.0000 predicted - Miss?
0005 - 0004 ore with 01 mine => 1.0000 true rate => 0.0000 predicted - Miss?
0006 - 0003 ore with 03 mine => 0.0000 true rate => 1.0000 predicted - Miss?
0007 - 0001 ore with 01 mine => 0.0000 true rate => 0.0000 predicted - Miss?
0008 - 0001 ore with 04 mine => 0.0000 true rate => 1.0000 predicted - Miss?
0009 - 0001 ore with 04 mine => 0.0000 true rate => 1.0000 predicted - Miss?
0010 - 0004 ore with 04 mine => 1.0000 true rate => 1.0000 predicted - Miss?
0011 - 0000 ore with 04 mine => 0.0000 true rate => 1.0000 predicted - Miss?
0012 - 0005 ore with 04 mine => 1.0000 true rate => 1.0000 predicted - Miss?
0013 - 0002 ore with 01 mine => 1.0000 true rate => 0.0000 pre

# Process Done

In [42]:
nowStop = datetime.datetime.now()
runTime=nowStop-nowStart
print('Start:', nowStart.strftime('%Y-%m-%d %H:%M:%S'))
print('Stop: ', nowStop.strftime('%Y-%m-%d %H:%M:%S'))
print('Time: ', runTime)

('Start:', '2019-05-04 22:29:44')
('Stop: ', '2019-05-04 22:30:21')
('Time: ', datetime.timedelta(0, 37, 49000))
