In [None]:
!python --version

Python 3.7.10


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import seaborn as sns
import regex as re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    This function will make a pretty plot of an sklearn Confusion Matrix cm using a Seaborn heatmap visualization.
    Arguments
    ---------
    cf:            confusion matrix to be passed in
    group_names:   List of strings that represent the labels row by row to be shown in each square.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    normalize:     If True, show the proportions for each category. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
                   
    title:         Title for the heatmap. Default is None.
    '''


    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for i in range(cf.size)]

    if group_names and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten()/np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy  = np.trace(cf) / float(np.sum(cf))

        #if it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            #Metrics for Binary Confusion Matrices
            precision = cf[1,1] / sum(cf[:,1])
            recall    = cf[1,1] / sum(cf[1,:])
            f1_score  = 2*precision*recall / (precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize==None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if xyticks==False:
        #Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    
    
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)
    
    if title:
        plt.title(title)

In [None]:
dino_df = pd.read_csv('/content/drive/MyDrive/Dinobb/data.csv')
dino_df.head()

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,named_by,species,link
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,Yates Bonnan Neveling Chinsamy and Blackbeard ...,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,Bonaparte and Novas (1985),comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,Sampson (1995),horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,Perle Norell and Clark (1999),giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,Stovall and Langston (1950),atokensis,https://www.nhm.ac.uk/discover/dino-directory/...


In [None]:
dino_df.shape

(309, 10)

##Data Cleaningn

First we delete the useless features.

In [None]:
dino_df.drop(['taxonomy', 'named_by', 'species', 'link'], axis = 1, inplace = True)
dino_df

Unnamed: 0,name,diet,period,lived_in,type,length
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m
...,...,...,...,...,...,...
304,yuanmousaurus,herbivorous,Mid Jurassic 180-159 million years ago,China,sauropod,17.0m
305,yunnanosaurus,omnivorous,Early Jurassic 205-190 million years ago,China,sauropod,7.0m
306,zalmoxes,herbivorous,Late Cretaceous 69 million years ago,Romania,euornithopod,3.0m
307,zephyrosaurus,herbivorous,Early Cretaceous 120-110 million years ago,USA,euornithopod,1.8m


In [None]:
dino_df.columns

Index(['name', 'diet', 'period', 'lived_in', 'type', 'length'], dtype='object')

Let's deal with missing information:

In [None]:
dino_df.isna().sum()

name         0
diet         0
period       0
lived_in     1
type         0
length      18
dtype: int64

In [None]:
dino_df = dino_df.dropna(axis=0, subset=['lived_in'])

In [None]:
dino_df.isna().sum()

name         0
diet         0
period       0
lived_in     0
type         0
length      18
dtype: int64

In [None]:
dino_df.length = dino_df.length.fillna(0)
dino_df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


name        0
diet        0
period      0
lived_in    0
type        0
length      0
dtype: int64

##Then we solve the string problem in the lenght column:

In [None]:
dino_df.length = dino_df.length.str.replace('m', '')
dino_df.length


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


0       8.0
1       9.0
2       6.0
3       5.0
4      12.0
       ... 
304    17.0
305     7.0
306     3.0
307     1.8
308     NaN
Name: length, Length: 308, dtype: object

In [None]:
print(type(dino_df.length[308]))

<class 'float'>


After all, Nan keep aparing in the DataFrame, let's destroy them.

In [None]:
dino_df.length = dino_df.length.replace(np.nan, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
dino_df.length

0       8.0
1       9.0
2       6.0
3       5.0
4      12.0
       ... 
304    17.0
305     7.0
306     3.0
307     1.8
308       0
Name: length, Length: 308, dtype: object

Then we cast this column:

In [None]:
dino_df.length = pd.to_numeric(dino_df.length)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


And replace 0 with average:

In [None]:
#dino_df=dino_df.replace({'length': {0: dino_df.length.mean()}})
dino_df.length = dino_df.length.replace(0, dino_df.length.mean())
dino_df.length


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


0       8.000000
1       9.000000
2       6.000000
3       5.000000
4      12.000000
         ...    
304    17.000000
305     7.000000
306     3.000000
307     1.800000
308     6.793409
Name: length, Length: 308, dtype: float64

##Let's get the period values.

In [None]:
period = list(dino_df.period)

In [None]:
for element in range(len(period)):
  period[element] = ''.join(re.findall(r'\w+\s\w+\s\d', period[element]))

In [None]:
for element in range(len(period)):
  period[element] = re.sub(r'\s\d', '',period[element])

In [None]:
dino_df.period = period

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
dino_df.period.value_counts()

Late Cretaceous     139
Early Cretaceous     63
Late Jurassic        43
Mid Jurassic         26
Early Jurassic       19
Late Triassic        16
                      2
Name: period, dtype: int64

Here I'll define which ages corresponds to which historic periods: 

In [None]:
"""Late Cretaceous   10 - 15 (2012 - 2007)
Early Cretaceous  16 - 20 (2006 - 2001)
Late Jurassic     21 - 26 (2000 - 1995)
Mid Jurassic      27 - 32 (1994 - 1989)
Early Jurassic    33 - 38 (1988 - 1983)
Late Triassic     39 - 44 (1982 - 1977)"""

'Late Cretaceous   10 - 15 (2012 - 2007)\nEarly Cretaceous  16 - 20 (2006 - 2001)\nLate Jurassic     21 - 26 (2000 - 1995)\nMid Jurassic      27 - 32 (1994 - 1989)\nEarly Jurassic    33 - 38 (1988 - 1983)\nLate Triassic     39 - 44 (1982 - 1977)'

##Diet

In [None]:
dino_df.diet.value_counts()

herbivorous               185
carnivorous                94
omnivorous                 26
unknown                     2
herbivorous/omnivorous      1
Name: diet, dtype: int64

In [None]:
dino_df.diet = dino_df.diet.replace('herbivorous/omnivorous', 'omnivorous')
dino_df.diet.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


herbivorous    185
carnivorous     94
omnivorous      27
unknown          2
Name: diet, dtype: int64

I'll delete the 'unknow diet' rows, for this I need to know hich rows have this diet:

In [None]:
index_names = dino_df[ dino_df['diet'] == 'unknown' ]

index_names

Unnamed: 0,name,diet,period,lived_in,type,length
182,mononykus,unknown,Late Cretaceous,Mongolia,small theropod,1.0
281,therizinosaurus,unknown,Late Cretaceous,Mongolia,large theropod,6.793409


In [None]:
dino_df.drop([182,281], axis=0, inplace= True)
dino_df.diet.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


herbivorous    185
carnivorous     94
omnivorous      27
Name: diet, dtype: int64

## Delete uneccesary class:

In [None]:
dino_df.drop(['type'], axis = 1, inplace = True)
dino_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,name,diet,period,lived_in,length
0,aardonyx,herbivorous,Early Jurassic,South Africa,8.0
1,abelisaurus,carnivorous,Late Cretaceous,Argentina,9.0
2,achelousaurus,herbivorous,Late Cretaceous,USA,6.0
3,achillobator,carnivorous,Late Cretaceous,Mongolia,5.0
4,acrocanthosaurus,carnivorous,Early Cretaceous,USA,12.0


## Get dummies:

In [None]:
dino_df.head()

Unnamed: 0,name,diet,period,lived_in,length
0,aardonyx,herbivorous,Early Jurassic,South Africa,8.0
1,abelisaurus,carnivorous,Late Cretaceous,Argentina,9.0
2,achelousaurus,herbivorous,Late Cretaceous,USA,6.0
3,achillobator,carnivorous,Late Cretaceous,Mongolia,5.0
4,acrocanthosaurus,carnivorous,Early Cretaceous,USA,12.0


In [None]:
dino_df['length'] = MinMaxScaler().fit_transform(dino_df['length'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
dino_df.describe()

Unnamed: 0,length
count,306.0
mean,0.200344
std,0.176684
min,0.0
25%,0.079137
50%,0.165468
75%,0.251799
max,1.0


## ML

In [None]:
x = dino_df.drop(['name'], axis = 1)
y = dino_df.name

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
list(set(x_train.lived_in.to_list()))

['India',
 'Uruguay',
 'Kazakhstan',
 'Spain',
 'Australia',
 'Madagascar',
 'United Kingdom',
 'Zimbabwe',
 'USA',
 'France',
 'Mongolia',
 'Switzerland',
 'South Africa',
 'Malawi',
 'Wales',
 'Tunisia',
 'Niger',
 'Germany',
 'Romania',
 'North Africa',
 'China',
 'Japan',
 'Brazil',
 'Uzbekistan',
 'Egypt',
 'Tanzania',
 'Russia',
 'Canada',
 'Morocco',
 'Argentina']

In [None]:
# Create the encoder.
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(x_train)    # Assume for simplicity all features are categorical.

# Apply the encoder.
x_train = encoder.transform(x_train)
x_test = encoder.transform(x_test)

In [None]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(214, 97)
(214,)
(92, 97)
(92,)


Parameter grid & random search

In [None]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = rf.predict(x_test)


In [None]:
y_pred

array(['ouranosaurus', 'lapparentosaurus', 'chungkingosaurus',
       'heyuannia', 'antarctosaurus', 'euoplocephalus', 'heyuannia',
       'herrerasaurus', 'chaoyangsaurus', 'alectrosaurus', 'paralititan',
       'garudimimus', 'gobisaurus', 'sonidosaurus', 'dromaeosaurus',
       'avimimus', 'camarasaurus', 'shunosaurus', 'harpymimus',
       'equijubus', 'edmontonia', 'alioramus', 'podokesaurus',
       'antarctosaurus', 'velociraptor', 'eoraptor', 'alectrosaurus',
       'avimimus', 'shantungosaurus', 'dryosaurus', 'dicraeosaurus',
       'othnielia', 'amygdalodon', 'neovenator', 'shantungosaurus',
       'hypacrosaurus', 'lophostropheus', 'yingshanosaurus',
       'alamosaurus', 'arrhinoceratops', 'maxakalisaurus',
       'hesperosaurus', 'garudimimus', 'dicraeosaurus', 'chaoyangsaurus',
       'ornithomimus', 'udanoceratops', 'albertosaurus', 'massospondylus',
       'nemegtosaurus', 'eustreptospondylus', 'archaeornithomimus',
       'coloradisaurus', 'spinosaurus', 'garudimimus',

In [None]:
x_train

<214x97 sparse matrix of type '<class 'numpy.float64'>'
	with 856 stored elements in Compressed Sparse Row format>

In [None]:
classes = rf.classes_
classes

array(['aardonyx', 'abelisaurus', 'achelousaurus', 'acrocanthosaurus',
       'alamosaurus', 'albertosaurus', 'alectrosaurus', 'alioramus',
       'allosaurus', 'alvarezsaurus', 'amargasaurus', 'ammosaurus',
       'amygdalodon', 'anchisaurus', 'ankylosaurus', 'antarctosaurus',
       'aralosaurus', 'archaeoceratops', 'archaeopteryx',
       'archaeornithomimus', 'arrhinoceratops', 'atlascopcosaurus',
       'austrosaurus', 'avaceratops', 'avimimus', 'bactrosaurus',
       'bagaceratops', 'bambiraptor', 'barapasaurus', 'barosaurus',
       'becklespinax', 'beipiaosaurus', 'brachiosaurus',
       'brachylophosaurus', 'brachytrachelopan', 'buitreraptor',
       'camarasaurus', 'camptosaurus', 'carcharodontosaurus',
       'carnotaurus', 'caudipteryx', 'ceratosaurus', 'chaoyangsaurus',
       'chasmosaurus', 'chinshakiangosaurus', 'chirostenotes',
       'chungkingosaurus', 'citipati', 'coelophysis', 'coelurus',
       'coloradisaurus', 'compsognathus', 'confuciusornis',
       'deinochei

In [None]:
dino_probas = rf.predict_proba(x_train)
dino_probas.shape


(214, 214)

In [None]:
dino_probas

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.05, 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [None]:
dino_mapping = list(zip(classes, list(dino_probas[0])))

In [None]:
dino_mapping.sort(reverse=True, key=lambda tup: tup[1]) 

In [None]:
dino_mapping[:3]

[('compsognathus', 0.62), ('archaeopteryx', 0.16), ('juravenator', 0.1)]

In [None]:
diet = "herbivorous"
period = "Early Jurassic"
lived_in = "USA"
length = 1.8
variables = [diet, period, lived_in, length]
input_df = pd.DataFrame([variables], columns = ['diet', 'period', 'lived_in', 'length'])
input_df['length'] = (length - 0.5464) / (2.72 - 0.5464)
input_df.head()


Unnamed: 0,diet,period,lived_in,length
0,herbivorous,Early Jurassic,USA,0.576739


In [None]:
# creating instance of one-hot-encoder handle_unknown='ignore'
x_input = encoder.transform(input_df)


In [None]:
x_input

<1x97 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [None]:
my_prediction = rf.predict_proba(x_input)

In [None]:
dino_mapping = list(zip(classes, list(my_prediction[0])))
dino_mapping.sort(reverse=True, key=lambda tup: tup[1]) 
dino_mapping[:3]

[('scutellosaurus', 0.21), ('ammosaurus', 0.19), ('anchisaurus', 0.16)]

In [None]:
import pickle
pickle.dump(rf, open( 'dino_model.mo', "wb" ) )
pickle.dump(encoder, open( 'encoder.mo', "wb" ) )

In [None]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.22.2.post1.
