In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


In [21]:
#Loading the data
ob1 = pd.read_excel('EXP-00004-Master.xlsx')
ob2 = pd.read_excel('EXP-00005-Master.xlsx')

In [22]:
#Seeing what each dataset looks like
print('Obsidian 1 summary', ob1.describe())
print('Obsidian 1 summary', ob2.describe())

Obsidian 1 summary            Id  Img Id         Da         Dp     FWidth    FLength  FThickness  \
count   36682   36682  36682.000  36682.000  36682.000  36682.000   36682.000   
unique  23382   13450   2642.000   3246.000   2691.000   4285.000    1412.000   
top         9   11917      0.385      0.429      0.365      0.849       0.132   
freq        5      14    117.000     87.000    107.000     47.000     253.000   

          ELength  EThickness     EWidth  ...  L/W Ratio  W/L Ratio  \
count   36682.000   36682.000  36682.000  ...  36682.000  36682.000   
unique   4235.000    1404.000   2659.000  ...   3259.000    791.000   
top         0.714       0.129      0.322  ...      1.418      0.556   
freq       52.000     250.000    111.000  ...     53.000    117.000   

        W/T Ratio  T/W Ratio  CHull Surface Area      Sieve  Angularity  \
count       36682      36682           36682.000  36682.000       36682   
unique       4433        872            7416.000   2036.000        49

In [23]:
#Setting what stage in the tool-making process each dataset is in
ob1['Stage'] = 4
ob2['Stage'] = 5

#Merging the two datasets together
df = pd.concat([ob1, ob2])

df

Unnamed: 0,Id,Img Id,Da,Dp,FWidth,FLength,FThickness,ELength,EThickness,EWidth,...,W/L Ratio,W/T Ratio,T/W Ratio,CHull Surface Area,Sieve,Angularity,Ellipticity,Fiber Length,Fiber Width,Stage
0,,,mm,mm,mm,mm,mm,mm,mm,mm,...,,,,mm²,mm,,,mm,mm,4
1,5903,14271,0.101,0.255,0.167,0.257,0.167,0.234,0.106,0.106,...,0.652,1,1,0.097,0.167,88.889,2.216,0.38,0.021,4
2,3002,11576,0.072,0.241,0.127,0.203,0.127,0.157,0.081,0.081,...,0.624,1,1,0.068,0.127,87.5,1.942,0.367,0.011,4
3,9893,18211,0.077,0.199,0.13,0.172,0.13,0.114,0.1,0.1,...,0.758,1,1,0.053,0.13,81.667,1.139,0.297,0.016,4
4,3713,12265,0.127,0.381,0.161,0.35,0.161,0.413,0.112,0.112,...,0.459,1,1,0.168,0.161,72.381,3.686,0.563,0.022,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12809,5,3174,4.223,5.244,4.93,6.905,1.058,6.783,0.983,4.779,...,0.714,4.661,0.215,58.79,2.994,17.639,6.903,6.446,0.761,5
12810,3,2862,2.476,3.962,1.902,6.159,0.875,5.921,0.778,1.743,...,0.309,2.174,0.46,20.966,1.389,25.781,7.613,5.781,0.5,5
12811,16,4107,1.545,2.02,1.192,3.517,0.563,3.257,0.542,1.132,...,0.339,2.118,0.472,8.343,0.877,39.063,6.004,3.515,0.764,5
12812,21,4453,0.388,0.432,0.438,0.558,0.165,0.514,0.156,0.399,...,0.784,2.66,0.376,0.491,0.301,48.889,3.29,0.399,0.368,5


In [24]:
#Removing unnecessary column and re-indexing
df.drop(index = 0, inplace = True)
df.reset_index(drop = True, inplace = True)

In [25]:
#Dropping columns that were irrelevant towards our target
df.drop(columns = ['Id','Filter0','Filter1', 'Filter2', 'Filter3', 'Filter4', 'Filter5', 'Filter6', 'hash'], inplace = True)
df

Unnamed: 0,Img Id,Da,Dp,FWidth,FLength,FThickness,ELength,EThickness,EWidth,Volume,...,W/L Ratio,W/T Ratio,T/W Ratio,CHull Surface Area,Sieve,Angularity,Ellipticity,Fiber Length,Fiber Width,Stage
0,14271,0.101,0.255,0.167,0.257,0.167,0.234,0.106,0.106,0.004,...,0.652,1,1,0.097,0.167,88.889,2.216,0.38,0.021,4
1,11576,0.072,0.241,0.127,0.203,0.127,0.157,0.081,0.081,0.002,...,0.624,1,1,0.068,0.127,87.5,1.942,0.367,0.011,4
2,18211,0.077,0.199,0.13,0.172,0.13,0.114,0.1,0.1,0.002,...,0.758,1,1,0.053,0.13,81.667,1.139,0.297,0.016,4
3,12265,0.127,0.381,0.161,0.35,0.161,0.413,0.112,0.112,0.005,...,0.459,1,1,0.168,0.161,72.381,3.686,0.563,0.022,4
4,16105,0.145,0.333,0.179,0.309,0.179,0.244,0.119,0.119,0.005,...,0.58,1,1,0.143,0.179,82.5,2.059,0.489,0.034,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49489,3174,4.223,5.244,4.93,6.905,1.058,6.783,0.983,4.779,18.855,...,0.714,4.661,0.215,58.79,2.994,17.639,6.903,6.446,0.761,5
49490,2862,2.476,3.962,1.902,6.159,0.875,5.921,0.778,1.743,5.369,...,0.309,2.174,0.46,20.966,1.389,25.781,7.613,5.781,0.5,5
49491,4107,1.545,2.02,1.192,3.517,0.563,3.257,0.542,1.132,1.235,...,0.339,2.118,0.472,8.343,0.877,39.063,6.004,3.515,0.764,5
49492,4453,0.388,0.432,0.438,0.558,0.165,0.514,0.156,0.399,0.021,...,0.784,2.66,0.376,0.491,0.301,48.889,3.29,0.399,0.368,5


In [26]:
#Changing all datapoints from "object" to numeric values
df = df.apply(pd.to_numeric)
df.dtypes

Img Id                  int64
Da                    float64
Dp                    float64
FWidth                float64
FLength               float64
FThickness            float64
ELength               float64
EThickness            float64
EWidth                float64
Volume                float64
Area                  float64
Perimeter             float64
CHull  Area           float64
CHull Perimeter       float64
Sphericity            float64
L/T Ratio             float64
T/L Aspect Ratio      float64
Compactness           float64
Roundness             float64
Ellipse Ratio         float64
Circularity           float64
Solidity              float64
Concavity             float64
Convexity             float64
Extent                float64
Transparency          float64
Curvature             float64
Surface Area          float64
L/W Ratio             float64
W/L Ratio             float64
W/T Ratio             float64
T/W Ratio             float64
CHull Surface Area    float64
Sieve     

In [27]:
#Removing rows swith missing values
df.dropna(inplace = True)
df.isna().sum()

Img Id                0
Da                    0
Dp                    0
FWidth                0
FLength               0
FThickness            0
ELength               0
EThickness            0
EWidth                0
Volume                0
Area                  0
Perimeter             0
CHull  Area           0
CHull Perimeter       0
Sphericity            0
L/T Ratio             0
T/L Aspect Ratio      0
Compactness           0
Roundness             0
Ellipse Ratio         0
Circularity           0
Solidity              0
Concavity             0
Convexity             0
Extent                0
Transparency          0
Curvature             0
Surface Area          0
L/W Ratio             0
W/L Ratio             0
W/T Ratio             0
T/W Ratio             0
CHull Surface Area    0
Sieve                 0
Angularity            0
Ellipticity           0
Fiber Length          0
Fiber Width           0
Stage                 0
dtype: int64

In [28]:
#Creating a list of the features and target
FEATURES = df.columns[:-1]

TARGET = df.columns[-1]


#Spliting our data for training and testing
x_train, x_test, y_train, y_test = train_test_split(
    df[FEATURES],
    df[TARGET],
    test_size = 0.2,
    random_state = 10
)

#Creating a decision tree object
dt = tree.DecisionTreeClassifier(
    criterion = 'entropy',
    max_depth = 5,
    random_state = 10
)

#Training our data
dt.fit(x_train, y_train)

#Making Predictions
prediction = dt.predict(x_test)

#Calculating our accuracy
acc = metrics.accuracy_score(y_test, prediction)
print('Accuracy: ', acc)

Accuracy:  0.8505909687847257
