In [106]:
# title: title name given to the earthquake
# magnitude: The magnitude of the earthquake
# date_time: date and time
# cdi: The maximum reported intensity for the event range
# mmi: The maximum estimated instrumental intensity for the event
# alert: The alert level - “green”, “yellow”, “orange”, and “red”
# tsunami: "1" for events in oceanic regions and "0" otherwise
# sig: A number describing how significant the event is. Larger numbers indicate a more significant event. This value is determined on a number of factors, including: magnitude, maximum MMI, felt reports, and estimated impact
# net: The ID of a data contributor. Identifies the network considered to be the preferred source of information for this event.
# nst: The total number of seismic stations used to determine earthquake location.
# dmin: Horizontal distance from the epicenter to the nearest station
# gap: The largest azimuthal gap between azimuthally adjacent stations (in degrees). In general, the smaller this number, the more reliable is the calculated horizontal position of the earthquake. Earthquake locations in which the azimuthal gap exceeds 180 degrees typically have large location and depth uncertainties
# magType: The method or algorithm used to calculate the preferred magnitude for the event
# depth: The depth where the earthquake begins to rupture
# latitude / longitude: coordinate system by means of which the position or location of any place on Earth's surface can be determined and described
# location: location within the country
# continent: continent of the earthquake hit country
# country: affected country

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
# from datetime import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import LabelEncoder
from imblearn.pipeline import Pipeline
import xgboost as xgb

In [108]:
df=pd.read_csv('earthquake_data.csv')
df.head()

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 7.0 - 18 km SW of Malango, Solomon Islands",7.0,22-11-2022 02:03,8,7,green,1,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596,"Malango, Solomon Islands",Oceania,Solomon Islands
1,"M 6.9 - 204 km SW of Bengkulu, Indonesia",6.9,18-11-2022 13:37,4,4,green,0,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738,"Bengkulu, Indonesia",,
2,M 7.0 -,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,,Oceania,Fiji
3,"M 7.3 - 205 km ESE of Neiafu, Tonga",7.3,11-11-2022 10:48,5,5,green,1,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129,"Neiafu, Tonga",,
4,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,,,


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      782 non-null    object 
 1   magnitude  782 non-null    float64
 2   date_time  782 non-null    object 
 3   cdi        782 non-null    int64  
 4   mmi        782 non-null    int64  
 5   alert      415 non-null    object 
 6   tsunami    782 non-null    int64  
 7   sig        782 non-null    int64  
 8   net        782 non-null    object 
 9   nst        782 non-null    int64  
 10  dmin       782 non-null    float64
 11  gap        782 non-null    float64
 12  magType    782 non-null    object 
 13  depth      782 non-null    float64
 14  latitude   782 non-null    float64
 15  longitude  782 non-null    float64
 16  location   777 non-null    object 
 17  continent  206 non-null    object 
 18  country    484 non-null    object 
dtypes: float64(6), int64(5), object(8)
memory usage: 1

In [110]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
# from sklearn.pipeline import Pipeline

In [111]:
obj=df.select_dtypes(include=['object'])
obj.head()

Unnamed: 0,title,date_time,alert,net,magType,location,continent,country
0,"M 7.0 - 18 km SW of Malango, Solomon Islands",22-11-2022 02:03,green,us,mww,"Malango, Solomon Islands",Oceania,Solomon Islands
1,"M 6.9 - 204 km SW of Bengkulu, Indonesia",18-11-2022 13:37,green,us,mww,"Bengkulu, Indonesia",,
2,M 7.0 -,12-11-2022 07:09,green,us,mww,,Oceania,Fiji
3,"M 7.3 - 205 km ESE of Neiafu, Tonga",11-11-2022 10:48,green,us,mww,"Neiafu, Tonga",,
4,M 6.6 -,09-11-2022 10:14,green,us,mww,,,


In [112]:
obj.columns

Index(['title', 'date_time', 'alert', 'net', 'magType', 'location',
       'continent', 'country'],
      dtype='object')

In [113]:
int_=df.select_dtypes(include=['int','float'])

In [114]:
int_.columns

Index(['magnitude', 'cdi', 'mmi', 'tsunami', 'sig', 'nst', 'dmin', 'gap',
       'depth', 'latitude', 'longitude'],
      dtype='object')

In [115]:
int_.describe()

Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,depth,latitude,longitude
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,6.941125,4.33376,5.964194,0.388747,870.108696,230.250639,1.325757,25.03899,75.883199,3.5381,52.609199
std,0.445514,3.169939,1.462724,0.487778,322.465367,250.188177,2.218805,24.225067,137.277078,27.303429,117.898886
min,6.5,0.0,1.0,0.0,650.0,0.0,0.0,0.0,2.7,-61.8484,-179.968
25%,6.6,0.0,5.0,0.0,691.0,0.0,0.0,14.625,14.0,-14.5956,-71.66805
50%,6.8,5.0,6.0,0.0,754.0,140.0,0.0,20.0,26.295,-2.5725,109.426
75%,7.1,7.0,7.0,1.0,909.75,445.0,1.863,30.0,49.75,24.6545,148.941
max,9.1,9.0,9.0,1.0,2910.0,934.0,17.654,239.0,670.81,71.6312,179.662


In [116]:
df.columns

Index(['title', 'magnitude', 'date_time', 'cdi', 'mmi', 'alert', 'tsunami',
       'sig', 'net', 'nst', 'dmin', 'gap', 'magType', 'depth', 'latitude',
       'longitude', 'location', 'continent', 'country'],
      dtype='object')

In [117]:
dict(df.iloc[2]).values()

dict_values(['M 7.0 - ', 7.0, '12-11-2022 07:09', 3, 3, 'green', 1, 755, 'us', 147, 3.125, 18.0, 'mww', 579.0, -20.0508, -178.346, nan, 'Oceania', 'Fiji'])

In [118]:
x=df.drop('tsunami',axis=1)
y=df['tsunami']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

# def extract_year_from_datetime(X):
#     X = pd.to_datetime(X,infer_datetime_format=True)
#     X = X.dt.year 
#     return X
# extract_year_from_datetime(df['date_time'])

# obj_trans = ColumnTransformer(
#     transformers=[
#         ('lable encode',LabelEncoder(),['magType']),('extract_year_from_datetime', FunctionTransformer(extract_year_from_datetime), [2])])
# int_trans = ColumnTransformer(
#     transformers=[
#         ('Scale', StandardScaler(),['magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap', 'depth',
#        'latitude', 'longitude'])],remainder='passthrough')
# pipeline = Pipeline([
#     ('obj_trans', obj_trans),
#     ('int_trans', int_trans),
#     ('xg',xgb.XGBRegressor())
# ])
# pipeline.fit(x_train,y_train)

In [119]:
# [1,3,4,7,9,10,11,13,14,15]

In [120]:
mapping = {
    'mww': 5,
    'mwc': 4,
    'mwb': 3,
    'mw': 2,
    'Mi': 1,
    'ms': 1,
    'mb': 1,
    'md': 1,
    'ml': 1
}
def map_function(column):
    for key, value in mapping.items():
        column = column.replace(key, str(value))
    return column.astype('int')

def drop_columns(df, columns):
    return df.drop(columns=columns)

obj_trans = ColumnTransformer([
        ('map_transformer', FunctionTransformer(map_function),['magType']),
('drop_transformer', FunctionTransformer(drop_columns, kw_args={'columns': ['title', 'date_time', 'alert', 'net', 'location','continent', 'country']}), ['title', 'date_time', 'alert', 'net', 'location','continent', 'country']),
],remainder='passthrough')

# int_trans = ColumnTransformer([('Scale', StandardScaler(),
#                                 ['magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap',
#        'depth', 'latitude', 'longitude']) ])

pipeline = Pipeline([
    ('obj_trans', obj_trans),
#     ('int_trans', int_trans),
    ('sample',SMOTE()),
    ('xg',xgb.XGBClassifier(learning_rate= 0.01, max_depth= 5, n_estimators= 300))])

pipeline.fit(x_train,y_train)


In [121]:
p=pipeline.predict(x_test)
print(classification_report(y_test,p))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       104
           1       0.87      0.91      0.89        53

    accuracy                           0.92       157
   macro avg       0.91      0.92      0.92       157
weighted avg       0.92      0.92      0.92       157



In [122]:
import pickle
pipeline_with_functions = {
    'pipeline': pipeline,
    'mapping': mapping,
    'map_function': map_function,
    'drop_columns': drop_columns
}

with open('pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline_with_functions, f)

In [123]:

# pickle.dump(pipeline,open('pipe.pkl','wb'))

In [124]:
df.columns

Index(['title', 'magnitude', 'date_time', 'cdi', 'mmi', 'alert', 'tsunami',
       'sig', 'net', 'nst', 'dmin', 'gap', 'magType', 'depth', 'latitude',
       'longitude', 'location', 'continent', 'country'],
      dtype='object')

In [125]:
# model = pickle.load(open('pipe.pkl','rb'))
# dd= pd.DataFrame([['M 7.0 - 18 km SW of Malango, Solomon Islands', 7.0, '22-11-2022 02:03', 8, 7, 'green', 768, 'us', 117, 0.509, 17.0, 'mww', 14.0, -9.7963, 159.596, 'Malango, Solomon Islands', 'Oceania', 'Solomon Islands']], columns=['title', 'magnitude', 'date_time', 'cdi', 'mmi', 'alert','sig', 'net', 'nst', 'dmin', 'gap', 'magType', 'depth', 'latitude','longitude', 'location', 'continent', 'country'])
# model.predict(dd)

In [126]:
# testing=x_test.iloc[0:5][['magnitude', 'cdi', 'mmi','sig', 'nst', 'dmin', 'gap', 'magType', 'depth', 'latitude','longitude']]
# testing.to_excel("testsamples.xlsx", index=False)