# Machine Learning Engineer Nanodegree
## Capstone project
Kyle McMillan 

### Stage 1
First stage is to load all the necessary modules and the dataset. The dataset is loaded and the preprocessing is carried out on the data in preperation for the classifier algorithm.

In [74]:
#Load all necessary modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import dateutil.parser
from sklearn.cross_validation import cross_val_score
from sklearn.utils import resample

In [75]:
#Import the dataset
data_file = '~/Documents/Udacity/capstone/earthquakes_20170113.csv'
df = pd.read_csv(data_file, dtype={"publicid": str})

# Drop values less than 1800s due to datetime limitations
df=df.drop(df.index[586892: ])

#Remove event type rows with these values - not related to tectonic earthquakes
df = df[df.eventtype != "snow avalanche"]
df = df[df.eventtype != "outside of network interest"]
df = df[df.eventtype != "landslide"]
df = df[df.eventtype != "sonic boom"]
df = df[df.eventtype != "debris avalanche"]
df = df[df.eventtype != "not locatable"]
df = df[df.eventtype != "quarry blast"]
df = df[df.eventtype != "explosion"]
df = df[df.eventtype != "duplicate"]
df = df[df.eventtype != "volcanic eruption"]
df = df[df.eventtype != "nuclear explosion"]
df = df[df.eventtype != "induced earthquake"]
df = df[df.eventtype != "other"]

df = df.drop(df.columns[[0,1,3,8,9,10,11,12,13,14,15,16,17,18,19,20]], axis=1)

#Remove these magnitudes values - data was not measured accurately
df = df[df.magnitude >= 0]

In [76]:
# Split timestamp date out for conversion from ISO 8610 to seconds
ISO_8610 = df['origintime']
raw_features = df.drop('origintime', axis = 1)

# Converting the ISO 8601 timestamp to seconds
seconds = []
for x in ISO_8610:
    utc_dt = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ')
    seconds.append((utc_dt - datetime(1900, 1, 1)).total_seconds())  
    
raw_features['origintime_s'] = seconds

In [77]:
#Convert the earthquake values into 0 for less than magnitude 5.0
# and 1 for values 5.0 and above.
quakes = []
for x in raw_features.magnitude:
    if x < 5:
        quakes.append(0)
    else:
        quakes.append(1)
raw_features['quakes'] = quakes
raw_features = raw_features.drop('magnitude', axis= 1)

In [78]:
# View the count for each magnitude value before up-sampling
raw_features.quakes.value_counts()

0    566934
1      2246
Name: quakes, dtype: int64

In [79]:
# Select six random events from the dataset
# Three values are events with a 5.0 or greater earthquake and 3 are less than 5.0
samples_1 = pd.DataFrame(raw_features[raw_features.quakes == 1].sample(n=3, random_state=42), 
                         columns = raw_features.keys()).reset_index(drop = True)
samples_0 = pd.DataFrame(raw_features[raw_features.quakes == 0].sample(n=3, random_state=42), 
                         columns = raw_features.keys()).reset_index(drop = True)

total_samples = pd.concat([samples_0, samples_1])

display(total_samples)

Unnamed: 0,longitude,latitude,depth,origintime_s,quakes
0,166.78749,-46.36553,5.0,3448276000.0,0
1,174.54823,-41.28537,30.6807,2736774000.0,0
2,175.287182,-41.204894,21.640625,3673823000.0,0
0,178.00999,-36.71,12.0,1773488000.0,1
1,166.77095,-45.38336,20.1431,3270459000.0,1
2,176.55,-37.23,348.0,2083324000.0,1


### Stage 2
The second stage is to calculate the Naive predictor using the processed data.

In [80]:
# Calculating accuracy, precision and recall
accuracy = np.sum(quakes) / float(len(quakes))
recall = np.sum(quakes) / float(np.sum(quakes) + 0)
precision = np.sum(quakes) / float(np.sum(quakes) + quakes.count(0))

# Calculate F-score using beta = 1.25.
beta = 1.25
fscore = ((1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall))

# Print the results 
print ("Naive predictor: [Accuracy score: {}, F-score: {}]".format(accuracy, fscore))


Naive predictor: [Accuracy score: 0.003946027618679504, F-score: 0.010049732457494956]


### Stage 3
The third stage is to up-sample the data. Because the original dataset is heavily imbalanced,
this actions ensures that when the training and testing sets are split, all of the minority data doesn't end up 
in either only training or only testing.

In [81]:
#Up-sampling of the earthquake data so that dataset 
# is no long imbalanced.

quakes_majority = raw_features[raw_features.quakes==0]
quakes_minority = raw_features[raw_features.quakes==1]

quakes_minority_upsampled = resample(quakes_minority, replace=True, n_samples=566934, random_state=42)
quakes_upsampled = pd.concat([quakes_majority, quakes_minority_upsampled])

In [82]:
# View the count for each magnitude value after up-sampling
quakes_upsampled.quakes.value_counts()

1    566934
0    566934
Name: quakes, dtype: int64

### Stage 4
The data is split into features and lables along with testing and training sets for both.

In [83]:
# Split quake data into features and target label

quake_lables = quakes_upsampled['quakes']
quake_features = quakes_upsampled.drop('quakes', axis = 1)

In [84]:
#split the up-sampled dataset into training(70%) and testing(30%) sets
X_train, X_test, y_train, y_test = train_test_split(quake_features, quake_lables, test_size = 0.3, random_state = 42)


# Show the results of the split
print ("The training set has {} samples.".format(X_train.shape[0]))
print ("The testing set has {} samples.".format(X_test.shape[0]))

The training set has 793707 samples.
The testing set has 340161 samples.


### Stage 5
The Random Forest Classifier is trained and tested

In [85]:
clf = RandomForestClassifier(n_estimators=500, 
                             oob_score=True, 
                             min_samples_leaf=2, 
                             max_features="sqrt", 
                             max_depth=50, 
                             random_state = 42)
results = {}
    
clf = clf.fit(X_train, y_train)
        
#Get the predictions on the test set then get predictions on the training samples
predictions_test = clf.predict(X_test)
predictions_train = clf.predict(X_train)
          
#Compute F-score on the the training samples using f-beta
results['f_train'] = fbeta_score(y_train, predictions_train, 1.25)
#Compute F-score on the test set using f-beta
results['f_test'] = fbeta_score(y_test, predictions_test,1.25)
       

print ("min_samples_leaf = {}".format(x))    
print (results)

min_samples_leaf = 6.8
{'f_train': 0.99988596467190438, 'f_test': 0.99926601422212125}


### Stage 6
This final section will test the classifier.
Six sample points removed above will be inputted into the classifer to dertermine if the classifer can correctly identify the sample

<br>Additionly, a single new point was recorded as a magnitude 5.06, but was recorded approximatly two weeks after the original dataset was downloaded, will be use to test if the classifer can correctly future points.

In [86]:
#Prediciton of the sample sets

#'quakes' column is dropped as that is the answer trying to predict
print(clf.predict(total_samples.drop('quakes', axis= 1)))
print(clf.predict_proba(total_samples.drop('quakes', axis= 1)))

[0 0 0 1 1 1]
[[  1.00000000e+00   0.00000000e+00]
 [  1.00000000e+00   0.00000000e+00]
 [  1.00000000e+00   0.00000000e+00]
 [  8.62419456e-04   9.99137581e-01]
 [  7.28752921e-04   9.99271247e-01]
 [  3.83979189e-04   9.99616021e-01]]


In [87]:
#Import the dataset with a future dated earthquake 
data_file = '~/Documents/Udacity/capstone/test_earthquake.csv'
df_test = pd.read_csv(data_file, dtype={"publicid": str})

df_test = df_test.drop(df_test.columns[[0,1,3,6,8,9,10,11,12,13,14,15,16,17,18,19,20]], axis=1)

ISO_8610 = df_test['origintime']
test_features = df_test.drop('origintime', axis = 1)
seconds = []
for x in ISO_8610:
    utc_dt = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ')
    seconds.append((utc_dt - datetime(1900, 1, 1)).total_seconds())
test_features['origintime_s'] = seconds

test_features['quakes'] = 1

print(test_features)

    longitude   latitude       depth  origintime_s  quakes
0  173.974579 -40.340115  123.553619  3.726009e+09       1


In [88]:
#Prediciton of the future point

print(clf.predict(test_features.drop('quakes', axis= 1)))
print(clf.predict_proba(test_features.drop('quakes', axis= 1)))

[0]
[[ 1.  0.]]
