## Build Model with Reduced Feature Set

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [5]:
# Load the scaled earthquake data restricted to the reduced feature set
reduced_data_path = 'Resources/earthquake_data_reduced.csv'
df = pd.read_csv(reduced_data_path)

# Preview the first rows, then list every column with its dtype and non-null count
display(df.head())
df.info()

Unnamed: 0,longitude,latitude,depth,magnitude,min_station_distance,nodal_plane_1_dip,nodal_plane_1_strike,nodal_plane_2_dip,nodal_plane_2_rake,nodal_plane_2_strike,...,t_axis_length,t_axis_plunge,percent_double_couple,scalar_moment,tensor_mpp,tensor_mrp,tensor_mrr,tensor_mrt,tensor_mtt,mmi_class
0,-0.858789,0.217395,-0.138401,1.259068,-0.621166,0.764993,-0.187953,-3.27598,1.009308,1.075972,...,-0.05986,3.288087,-0.131701,-0.058242,-0.053888,-0.019118,-0.003759,-0.020843,0.055255,2
1,-1.586411,1.453026,2.957343,0.278647,0.182939,-0.322804,-0.143028,0.718901,-0.131508,-1.479373,...,-0.066207,0.813702,-0.016844,-0.065891,-0.050703,-0.004675,-0.031174,-0.034918,0.059791,0
2,-0.451425,1.705352,1.29798,0.8669,2.000126,-0.866703,-0.187953,-1.531736,-0.587834,1.217936,...,-0.062811,-0.542928,-0.045558,-0.062687,-0.048152,0.025176,-0.045815,-0.028006,0.06136,0
3,-0.891745,0.63995,-0.776489,0.808074,-0.566141,-0.594754,0.620696,0.775167,-1.22479,1.132758,...,-0.06276,-0.274021,-0.138879,-0.063113,-0.047294,-0.015576,-0.037577,-0.028875,0.058169,0
4,0.049063,0.91787,-0.567035,0.8669,1.537912,-0.594754,1.56412,0.775167,-0.179042,0.309369,...,-0.062428,-0.128764,-0.009665,-0.062141,-0.048454,0.008477,-0.03479,-0.019123,0.05855,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   longitude              957 non-null    float64
 1   latitude               957 non-null    float64
 2   depth                  957 non-null    float64
 3   magnitude              957 non-null    float64
 4   min_station_distance   957 non-null    float64
 5   nodal_plane_1_dip      957 non-null    float64
 6   nodal_plane_1_strike   957 non-null    float64
 7   nodal_plane_2_dip      957 non-null    float64
 8   nodal_plane_2_rake     957 non-null    float64
 9   nodal_plane_2_strike   957 non-null    float64
 10  n_axis_length          957 non-null    float64
 11  n_axis_plunge          957 non-null    float64
 12  p_axis_azimuth         957 non-null    float64
 13  p_axis_plunge          957 non-null    float64
 14  t_axis_azimuth         957 non-null    float64
 15  t_axis

In [6]:
# Separate the feature matrix X from the target y.
# `columns=` already names the axis being dropped from, so no `axis` argument is passed.
X = df.drop(columns=['mmi_class'])

# The target stays a 1-D Series holding the mmi_class labels
y = df['mmi_class']

# Quick look at the first rows of both objects
display(X.head())
display(y.head())

Unnamed: 0,longitude,latitude,depth,magnitude,min_station_distance,nodal_plane_1_dip,nodal_plane_1_strike,nodal_plane_2_dip,nodal_plane_2_rake,nodal_plane_2_strike,...,t_axis_azimuth,t_axis_length,t_axis_plunge,percent_double_couple,scalar_moment,tensor_mpp,tensor_mrp,tensor_mrr,tensor_mrt,tensor_mtt
0,-0.858789,0.217395,-0.138401,1.259068,-0.621166,0.764993,-0.187953,-3.27598,1.009308,1.075972,...,-1.568777,-0.05986,3.288087,-0.131701,-0.058242,-0.053888,-0.019118,-0.003759,-0.020843,0.055255
1,-1.586411,1.453026,2.957343,0.278647,0.182939,-0.322804,-0.143028,0.718901,-0.131508,-1.479373,...,-0.890918,-0.066207,0.813702,-0.016844,-0.065891,-0.050703,-0.004675,-0.031174,-0.034918,0.059791
2,-0.451425,1.705352,1.29798,0.8669,2.000126,-0.866703,-0.187953,-1.531736,-0.587834,1.217936,...,0.611749,-0.062811,-0.542928,-0.045558,-0.062687,-0.048152,0.025176,-0.045815,-0.028006,0.06136
3,-0.891745,0.63995,-0.776489,0.808074,-0.566141,-0.594754,0.620696,0.775167,-1.22479,1.132758,...,-0.88543,-0.06276,-0.274021,-0.138879,-0.063113,-0.047294,-0.015576,-0.037577,-0.028875,0.058169
4,0.049063,0.91787,-0.567035,0.8669,1.537912,-0.594754,1.56412,0.775167,-0.179042,0.309369,...,1.247287,-0.062428,-0.128764,-0.009665,-0.062141,-0.048454,0.008477,-0.03479,-0.019123,0.05855


0    2
1    0
2    0
3    0
4    1
Name: mmi_class, dtype: int64

In [7]:
# Hold out a test set. 0.25 is the fraction train_test_split falls back to when
# no size is given; it is written out here so the split size is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

### Apply Randomized Search of Best Hyperparameters

In [8]:
# Candidate values the randomized search draws its hyperparameter combinations from
param_grid = dict(
    n_estimators=range(100, 1000, 10),
    max_features=['sqrt', 'log2'],
    max_depth=range(1, 40, 2),
    min_samples_split=[2, 3, 4, 5],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base Random Forest classifier to be tuned (seeded via random_state)
rfc = RandomForestClassifier(random_state=0)

In [9]:
# Build the randomized hyperparameter search around the base classifier.
# n_iter=10 and cv=5 are the library defaults, written out so the call matches the
# logged "5 folds for each of 10 candidates".
random_rfc = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    random_state=0,
    verbose=3,
)

# Run the search on the training split only
random_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.597 total time=   1.4s
[CV 2/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.542 total time=   1.2s
[CV 3/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.643 total time=   1.4s
[CV 4/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.657 total time=   1.2s
[CV 5/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.678 total time=   1.2s
[CV 1/5] END bootstrap=False, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=170;, score=0.514 total time=   0.1s
[CV 2/5] END boo

In [10]:
# Show the hyperparameter combination selected by the randomized search
best_random_params = random_rfc.best_params_
print(best_random_params)

{'n_estimators': 210, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}


In [11]:
# Predict on both splits with the tuned model from the randomized search
random_rfc_train_predict = random_rfc.predict(X_train)
random_rfc_test_predict = random_rfc.predict(X_test)

# Report balanced accuracy for each split. The labels say "Balanced accuracy" so these
# values are not mistaken for the plain "accuracy" row of the classification report.
print(f"Balanced accuracy - Train: {balanced_accuracy_score(y_train, random_rfc_train_predict):.3f}")
print(f"Balanced accuracy - Test: {balanced_accuracy_score(y_test, random_rfc_test_predict):.3f}")

# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test, random_rfc_test_predict))

Accuracy - Train: 1.000
Accuracy - Test: 0.576
              precision    recall  f1-score   support

           0       0.65      0.71      0.68        89
           1       0.43      0.45      0.44        83
           2       0.68      0.57      0.62        68

    accuracy                           0.58       240
   macro avg       0.59      0.58      0.58       240
weighted avg       0.58      0.58      0.58       240



### Refine Hyperparameter Set Using Grid Search

In [12]:
# Narrow grid centred on the best hyperparameters found by the randomized search
param_grid = dict(
    n_estimators=range(200, 220),
    max_features=['sqrt'],
    max_depth=range(33, 37),
    min_samples_split=[4],
    min_samples_leaf=[2],
    bootstrap=[False],
)

# Fresh Random Forest classifier for the grid search (seeded via random_state)
rfc2 = RandomForestClassifier(random_state=0)

In [13]:
# Use an exhaustive grid search this time, over the narrowed grid.
# cv=5 is the library default, written out to match the logged "Fitting 5 folds".
grid_rfc2 = GridSearchCV(
    estimator=rfc2,
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

# Fit the grid search estimator on the training data
grid_rfc2.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END bootstrap=False, max_depth=33, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200;, score=0.597 total time=   0.5s
[CV 2/5] END bootstrap=False, max_depth=33, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200;, score=0.576 total time=   0.3s
[CV 3/5] END bootstrap=False, max_depth=33, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200;, score=0.643 total time=   0.4s
[CV 4/5] END bootstrap=False, max_depth=33, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200;, score=0.685 total time=   0.3s
[CV 5/5] END bootstrap=False, max_depth=33, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200;, score=0.678 total time=   0.3s
[CV 1/5] END bootstrap=False, max_depth=33, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=201;, score=0.597 total time=   0.3s
[CV 2/5] END b

In [14]:
# Show the hyperparameter combination selected by the grid search
best_grid_params = grid_rfc2.best_params_
print(best_grid_params)

{'bootstrap': False, 'max_depth': 33, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 203}


In [15]:
# Predict on both splits with the tuned model from the grid search
grid_rfc2_train_predict = grid_rfc2.predict(X_train)
grid_rfc2_test_predict = grid_rfc2.predict(X_test)

# Report balanced accuracy for each split. The labels say "Balanced accuracy" so these
# values are not mistaken for the plain "accuracy" row of the classification report.
print(f"Balanced accuracy - Train: {balanced_accuracy_score(y_train, grid_rfc2_train_predict):.3f}")
print(f"Balanced accuracy - Test: {balanced_accuracy_score(y_test, grid_rfc2_test_predict):.3f}")

# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test, grid_rfc2_test_predict))

Accuracy - Train: 1.000
Accuracy - Test: 0.588
              precision    recall  f1-score   support

           0       0.66      0.72      0.69        89
           1       0.45      0.47      0.46        83
           2       0.70      0.57      0.63        68

    accuracy                           0.59       240
   macro avg       0.60      0.59      0.59       240
weighted avg       0.60      0.59      0.59       240



**Conclusion:** This model performs worse than the model built on all of the features. Not only is the balanced accuracy on the test data lower, but the reduced feature set also does not reduce the overfitting: balanced accuracy on the training data is still 1.000.