In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

quake_frame['has_null'] = quake_frame.isna().sum(axis=1)

In [3]:
quake_frame['has_null'].value_counts()

0     1227408
6      578546
1      466870
3      278200
5      174823
4      148396
2      145856
7      115921
8       79304
9       41584
10      15865
11          1
Name: has_null, dtype: int64

## Iterative imputation

Our Random Forest is still for two classes that takes only rows that have no nans in them. This time, we'll use the iterative imputer. Let's see how many we get.  
Then we'll split the data 80/20 and run training.  
Problem is, we'll be cheating if we impute on all the data and then test on imputed data - if we get the imputation wrong, we could not notice how far away our model is from actual performance until it hits real world data. Thus the validation set needs to be imputed data only. We'll do this by imputing on all and then take validation data only from a part of the dataset that had no nulls to begin with.  

In [4]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,3272774.0,35.720738,20.256723,-84.422,34.118,37.576167,42.258667,87.265
longitude,3272774.0,-92.856671,80.553255,-179.999,-122.79583,-118.811167,-115.454167,180.0
depth,3272765.0,22.334946,56.320328,-10.0,3.002,7.155,15.0,735.8
mag,3116325.0,1.878941,1.352506,-9.99,0.97,1.5,2.46,9.1
nst,2391208.0,15.601496,26.606866,0.0,5.0,10.0,18.0,934.0
gap,2434225.0,130.487608,69.710621,0.0,79.0,115.0,168.26,360.0
dmin,1926032.0,0.255999,1.333459,0.0,0.02093,0.05135,0.116,141.16
rms,3061121.0,0.315205,0.399901,-1.0,0.06,0.15,0.48,104.33
horizontalError,1740811.0,1.266841,3.168282,0.0,0.3,0.48,0.93,280.6
depthError,2666089.0,5.64032,1167.801181,-1.0,0.49,0.96,2.76,1773552.5


We better limit that depthError value. It will still be an outlier, but shouldn't make things too crazy.

In [5]:
quake_frame['depthError'].clip(upper=10000, inplace=True)

In order to be able to encode all the categoricals, we'll have to transform only the values that are not missing. I found a label encoder class on StackOverflow that should help sorting this out.  
We'll run that encoding and then replace the strings with pandas NaNs so the column can be considered integer and we can impute on it.  
We're not scaling and one-hot encoding because that would drive the number of columns beyond 600 and I do not have the compute to deal with that. It would be interesting to see a result, though!

In [6]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class LabelEncoderByCol(BaseEstimator, TransformerMixin):
    def __init__(self,col):
        #List of column names in the DataFrame that should be encoded
        self.col = col
        #Dictionary storing a LabelEncoder for each column
        self.le_dic = {}
        for el in self.col:
            self.le_dic[el] = LabelEncoder()

    def fit(self,x,y=None):
        #Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            #Only use the values that are not 'NaN' to fit the Encoder
            a = x[el][x[el]!='NaN']
            self.le_dic[el].fit(a)
        return self

    def transform(self,x,y=None):
        #Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            #Only use the values that are not 'NaN' to fit the Encoder
            a = x[el][x[el]!='NaN']
            #Store an ndarray of the current column
            b = x[el].values
            #Replace the elements in the ndarray that are not 'NaN'
            #using the transformer
            b[b!='NaN'] = self.le_dic[el].transform(a)
            #Overwrite the column in the DataFrame
            x[el]=b
        #return the transformed DataFrame
        return x
    
def create_label_encoder_by_column(data, cols=None):
    if not cols:
        cols = data.columns.to_list()
    ### fit with the desired col, col in position 0 for this example
    fit_list = [data[col].unique() for col in cols]
    enc_list = [LabelEncoder().fit(col) for col in fit_list]
    return enc_list

def transform_all_but_nans(data, cols=None, name='_enc'):
    if not cols:
        cols = data.columns.to_list()
    encs = create_label_encoder_by_column(data, cols)
    for col, enc in zip(cols, encs):
        data[col + name] = data[col].apply(lambda x: enc.transform(x) if x is not None else x)
    return data



In [7]:
col = ['magType', 'net', 'status', 'locationSource', 'magSource']
le = LabelEncoderByCol(col=col)

le.fit(quake_frame)

le.transform(quake_frame)

quake_frame.replace('NaN', pd.NA)

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label,has_null
0,37.003502,-117.996834,0.00,0.00,17,0.0,,,,3,,,,0.0,2,31,54,True,6
1,35.642788,-120.933601,5.00,1.99,17,2.0,,,,3,,,,0.0,2,31,54,False,6
2,34.164520,-118.185036,0.00,0.00,17,,,,,3,,,,0.0,2,31,54,False,7
3,33.836494,-116.781868,0.00,0.00,17,,,,,3,,,,0.0,2,31,54,True,7
4,33.208477,-115.476997,5.00,0.00,17,,,,,3,,,,0.0,2,31,54,True,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3272769,61.417200,-147.564900,13.10,1.20,18,,,,0.66,0,,0.40,,,0,5,10,False,6
3272770,66.227700,-157.202600,0.00,1.80,18,,,,1.01,0,,0.40,,,0,5,10,False,6
3272771,33.234667,-116.771167,12.45,0.65,18,19.0,54.0,0.01048,0.16,3,0.27,0.70,0.158,14.0,0,31,54,False,0
3272772,62.829900,-148.766400,55.50,1.80,18,,,,0.51,0,,1.60,,,0,5,10,False,6


We'll grab all rows that have fewer than 5 nulls to make life easier for our imputer (and prevent it from having to just guess).

In [8]:
quake_frame_imputation = quake_frame.loc[quake_frame['has_null'] < 5,:]

In [9]:
imp = IterativeImputer(max_iter=20, random_state=42)
imp_cols = quake_frame.columns.to_list()

quake_frame_imputation.loc[:, imp_cols] = imp.fit_transform(quake_frame_imputation.loc[:, imp_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [10]:
quake_frame_imputation.head()

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label,has_null
7,46.2765,-118.36,-0.26,2.3,14.0,6.0,303.0,0.52,0.14,19.0,0.41,0.24,0.12,0.0,2.0,190.0,319.0,0.0,0.0
8,46.332833,-118.391167,-0.26,2.6,14.0,6.0,299.0,0.4814,0.15,19.0,0.144,0.09,0.21,0.0,2.0,190.0,319.0,0.0,0.0
9,32.707167,-115.417,6.0,2.75,17.0,4.0,214.0,0.6036,0.58,3.0,4.6,31.61,0.084,6.0,2.0,31.0,54.0,0.0,0.0
11,37.433333,-118.7435,6.0,3.69,18.0,10.0,211.0,0.549249,0.58,3.0,3.24,31.61,0.224,7.0,2.0,31.0,54.0,0.0,1.0
12,46.7495,-119.3715,1.869,1.5,14.0,6.0,93.0,0.1063,0.15,19.0,0.578,1.91,0.12,0.0,2.0,190.0,319.0,0.0,0.0


In [11]:
quake_frame_imputation.loc[:, col] = quake_frame_imputation[col].round(decimals=0)
quake_frame_imputation.loc[:, col] = quake_frame_imputation[col].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quake_frame_imputation.loc[:, col] = quake_frame_imputation[col].round(decimals=0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quake_frame_imputation.loc[:, col] = quake_frame_imputation[col].astype(int)


In [12]:
quake_frame_imputation.magType.unique()

array([ 14.,  17.,  18.,  24.,   8.,  13.,  15.,  23.,  16.,   1.,  12.,
        10.,  11.,  -2.,   7.,   5.,  -4.,   6.,   4.,   9.,  21.,  19.,
         2., -23.,  26.,  25.,  27.,  29.,  28.,   3.,   0.,  22.,  20.])

In [13]:
sum(quake_frame_imputation.simple_label)/len(quake_frame_imputation.index)

0.03235541948092627

Okay, now let's see how this works out. We'll grab the first 20% of the dataframe when order by has_nulls to get original data (this should be good enough for our purposes as a comparison, properly, it would have to be randomised and then selected to avoid any effects from the ordering).

In [14]:
quake_frame_imputation = quake_frame_imputation.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
quake_frame_imputation.sort_values(by=['has_null'], inplace=True, ignore_index=True)

In [16]:
quake_frame_imputation.columns

Index(['latitude', 'longitude', 'depth', 'mag', 'magType', 'nst', 'gap',
       'dmin', 'rms', 'net', 'horizontalError', 'depthError', 'magError',
       'magNst', 'status', 'locationSource', 'magSource', 'simple_label',
       'has_null'],
      dtype='object')

In [17]:
x_cols = ['latitude',
 'longitude',
 'depth',
 'mag',
 'nst',
 'gap',
 'dmin',
 'rms',
 'horizontalError',
 'depthError',
 'magError',
 'magNst',
 'magType',
 'net',
 'status',
 'locationSource',
 'magSource']

y_col = ['simple_label']

In [18]:
# Separate train and valid sets and shuffle training set
valid_length = int(np.round(len(quake_frame_imputation.index) * 0.2))
quake_frame_imp_valid = quake_frame_imputation.loc[:valid_length, :]
quake_frame_imp_train = quake_frame_imputation.loc[valid_length:, :]
quake_frame_imp_train = quake_frame_imp_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
valid_X = quake_frame_imp_valid.loc[:, x_cols]
valid_y = quake_frame_imp_valid.loc[:, y_col]

train_X = quake_frame_imp_train.loc[:, x_cols]
train_y = quake_frame_imp_train.loc[:, y_col]

In [20]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [21]:
rfc.fit(train_X, np.ravel(train_y))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [22]:
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

In [23]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [24]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9648824887922837
Recall:  0.8823529411764706
ROC score:  0.9405852729155186
F1 score:  0.9217741150514259
Accuracy score:  0.994681777975811
