### Download dataset from https://www.kaggle.com/usgs/earthquake-database . Clean the data and predict magnitude using any tree-based method. Quality metric: 2-1 croosvalidated mean squared error. Visualise your results

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,Imputer
from sklearn.metrics import mean_squared_error

df = pd.read_csv('./database.csv')
df.head(10)

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,MagnitudeType,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic
5,01/10/1965,13:36:32,-13.405,166.629,Earthquake,35.0,,,6.7,MW,...,,,,,,ISCGEM860922,ISCGEM,ISCGEM,ISCGEM,Automatic
6,01/12/1965,13:32:25,27.357,87.867,Earthquake,20.0,,,5.9,MW,...,,,,,,ISCGEM861007,ISCGEM,ISCGEM,ISCGEM,Automatic
7,01/15/1965,23:17:42,-13.309,166.212,Earthquake,35.0,,,6.0,MW,...,,,,,,ISCGEM861111,ISCGEM,ISCGEM,ISCGEM,Automatic
8,01/16/1965,11:32:37,-56.452,-27.043,Earthquake,95.0,,,6.0,MW,...,,,,,,ISCGEMSUP861125,ISCGEMSUP,ISCGEM,ISCGEM,Automatic
9,01/17/1965,10:43:17,-24.563,178.487,Earthquake,565.0,,,5.8,MW,...,,,,,,ISCGEM861148,ISCGEM,ISCGEM,ISCGEM,Automatic


In [2]:
df.count()

Date                          23412
Time                          23412
Latitude                      23412
Longitude                     23412
Type                          23412
Depth                         23412
Depth Error                    4461
Depth Seismic Stations         7097
Magnitude                     23412
MagnitudeType                 23409
Magnitude Error                 327
Magnitude Seismic Stations     2564
Azimuthal Gap                  7299
Horizontal Distance            1604
Horizontal Error               1156
Root Mean Square              17352
ID                            23412
Source                        23412
Location Source               23412
Magnitude Source              23412
Status                        23412
dtype: int64

In [3]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
columns_to_impute = [
    'Depth Error',
    'Depth Seismic Stations',
    'Magnitude Error',
    'Magnitude Seismic Stations',
    'Azimuthal Gap',
    'Horizontal Distance',
    'Horizontal Error',
    'Root Mean Square'
]

df[columns_to_impute] = imp.fit_transform(df[columns_to_impute])

lat = df['Latitude']
lon = df['Longitude']

df['x'] = np.cos(lat) * np.cos(lon)
df['y'] = np.cos(lat) * np.sin(lon)
df['z'] = np.sin(lat)

df.drop(columns=['Latitude', 'Longitude'])

df.head(10)

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,MagnitudeType,...,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,x,y,z
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,4.993115,275.364098,6.0,MW,...,7.662759,1.022784,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic,0.416163,0.823228,0.386141
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,4.993115,275.364098,5.8,MW,...,7.662759,1.022784,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic,0.033769,-0.286077,0.957611
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,4.993115,275.364098,6.2,MW,...,7.662759,1.022784,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic,0.059536,-0.146336,-0.987442
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,4.993115,275.364098,5.8,MW,...,7.662759,1.022784,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic,0.004041,-0.81719,-0.576354
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,4.993115,275.364098,5.8,MW,...,7.662759,1.022784,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic,0.584544,0.559256,-0.587827
5,01/10/1965,13:36:32,-13.405,166.629,Earthquake,35.0,4.993115,275.364098,6.7,MW,...,7.662759,1.022784,ISCGEM860922,ISCGEM,ISCGEM,ISCGEM,Automatic,-0.663301,-0.083071,-0.743728
6,01/12/1965,13:32:25,27.357,87.867,Earthquake,20.0,4.993115,275.364098,5.9,MW,...,7.662759,1.022784,ISCGEM861007,ISCGEM,ISCGEM,ISCGEM,Automatic,-0.605046,0.059237,0.793984
7,01/15/1965,23:17:42,-13.309,166.212,Earthquake,35.0,4.993115,275.364098,6.0,MW,...,7.662759,1.022784,ISCGEM861111,ISCGEM,ISCGEM,ISCGEM,Automatic,-0.705422,0.21236,-0.676227
8,01/16/1965,11:32:37,-56.452,-27.043,Earthquake,95.0,4.993115,275.364098,6.0,MW,...,7.662759,1.022784,ISCGEMSUP861125,ISCGEMSUP,ISCGEM,ISCGEM,Automatic,-0.331426,-0.938532,0.096517
9,01/17/1965,10:43:17,-24.563,178.487,Earthquake,565.0,4.993115,275.364098,5.8,MW,...,7.662759,1.022784,ISCGEM861148,ISCGEM,ISCGEM,ISCGEM,Automatic,-0.702586,0.464118,0.539414


In [4]:
columns_to_encode = [
    'Type',
    'MagnitudeType',
    'Source',
    'Location Source',
    'Magnitude Source',
    'Status'
]

label_encoders = []

for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].fillna('0'))
    label_encoders.append(le)

df = df.drop(columns=['ID', 'Date', 'Time'])
df.head(10)

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,MagnitudeType,Magnitude Error,Magnitude Seismic Stations,...,Horizontal Distance,Horizontal Error,Root Mean Square,Source,Location Source,Magnitude Source,Status,x,y,z
0,19.246,145.616,0,131.6,4.993115,275.364098,6.0,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,0.416163,0.823228,0.386141
1,1.863,127.352,0,80.0,4.993115,275.364098,5.8,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,0.033769,-0.286077,0.957611
2,-20.579,-173.972,0,20.0,4.993115,275.364098,6.2,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,0.059536,-0.146336,-0.987442
3,-59.076,-23.557,0,15.0,4.993115,275.364098,5.8,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,0.004041,-0.81719,-0.576354
4,11.938,126.427,0,15.0,4.993115,275.364098,5.8,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,0.584544,0.559256,-0.587827
5,-13.405,166.629,0,35.0,4.993115,275.364098,6.7,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,-0.663301,-0.083071,-0.743728
6,27.357,87.867,0,20.0,4.993115,275.364098,5.9,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,-0.605046,0.059237,0.793984
7,-13.309,166.212,0,35.0,4.993115,275.364098,6.0,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,-0.705422,0.21236,-0.676227
8,-56.452,-27.043,0,95.0,4.993115,275.364098,6.0,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,5,20,11,0,-0.331426,-0.938532,0.096517
9,-24.563,178.487,0,565.0,4.993115,275.364098,5.8,6,0.07182,48.944618,...,3.99266,7.662759,1.022784,4,20,11,0,-0.702586,0.464118,0.539414


In [5]:
X = df.loc[:, df.columns != 'Magnitude'].values
y = df['Magnitude'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

model = xgb.XGBRegressor()
model.fit(X=X_train, y=y_train)

mean_squared_error(y_test, model.predict(X_test))

0.12766451828925734