DATASET1 XGBOOST

In [1]:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
%run MLProject.ipynb
df = pd.read_csv('usgs_main.csv')
df1 = df.copy()
df1['time'] = pd.to_datetime(df1['time'])
df1 = df1.sort_values('time')
df1 = df1.dropna(subset=['latitude','longitude','depth','mag','time'])

dfweek_xgb = df1.set_index('time').resample('W').apply({
    'mag':'mean',
    'latitude':'mean',
    'longitude':'mean',
    'depth':'mean',    #magType type vesairenin düzenlenmesinde yapay zekadan yararlandım
    'magType': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[0],
    'type': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[0],
    'status': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[0],
    'net': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[0]
})

dfweek_xgb = dfweek_xgb.reset_index(drop=True)
dfweek_xgb.index = dfweek_xgb.index + 1
dfweek_xgb.index.name = 'timeindex'

dfweek_xgb['futuremag'] = dfweek_xgb['mag'].shift(-1)
dfweek_xgb['futuredepth'] = dfweek_xgb['depth'].shift(-1)
dfweek_xgb['futurelat'] = dfweek_xgb['latitude'].shift(-1)
dfweek_xgb['futurelon'] = dfweek_xgb['longitude'].shift(-1)
dfweek_xgb = dfweek_xgb.dropna(subset=['futuremag','futuredepth','futurelat','futurelon'])

xgb_hiperparametreler = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2],
    'shiftnum': [2, 3, 4]
}

train_xgb = dfweek_xgb[:int((4*len(dfweek_xgb))/5)]
test_xgb = dfweek_xgb[int(4*len(dfweek_xgb)/5):]

splitter_xgb = TimeSeriesSplit(n_splits=3)
grid_xgb = GridSearchCV(
    estimator=XGBoostEarthquakePredictor(random_state=42),
    param_grid=xgb_hiperparametreler,
    cv=splitter_xgb,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_xgb.fit(train_xgb)
best_model_xgb = grid_xgb.best_estimator_

test_predictions_xgb = best_model_xgb.predict(test_xgb)
test_actual_xgb = test_xgb[['futuremag', 'futuredepth', 'futurelat', 'futurelon']].dropna()
min_len_xgb = min(len(test_predictions_xgb), len(test_actual_xgb))

target_names = ['futuremag', 'futuredepth', 'futurelat', 'futurelon']
target_labels = ['Magnitude', 'Depth', 'Latitude', 'Longitude']

print("Dataset 1 XGBoost Sonuçları:")  #buradaki sonuç yazdırma kısmında yapay zekadan benim için mse mae r2 testlerini yazdırmasını istedim
for i, (name, label) in enumerate(zip(target_names, target_labels)):
    if i < test_predictions_xgb.shape[1] and i < test_actual_xgb.shape[1]:
        target_mse = mean_squared_error(
            test_actual_xgb.iloc[:min_len_xgb, i], 
            test_predictions_xgb[:min_len_xgb, i]
        )
        target_mae = mean_absolute_error(
            test_actual_xgb.iloc[:min_len_xgb, i], 
            test_predictions_xgb[:min_len_xgb, i]
        )
        target_r2 = r2_score(
            test_actual_xgb.iloc[:min_len_xgb, i], 
            test_predictions_xgb[:min_len_xgb, i]
        )
        
        print(f"   {label:12}: MSE={target_mse:.3f}, MAE={target_mae:.3f}, R²={target_r2:.3f}")


Fitting 3 folds for each of 26244 candidates, totalling 78732 fits
Dataset 1 XGBoost Sonuçları:
   Magnitude   : MSE=0.058, MAE=0.217, R²=-5.052
   Depth       : MSE=14.691, MAE=3.054, R²=-1.431
   Latitude    : MSE=1.559, MAE=0.982, R²=-0.012
   Longitude   : MSE=18.600, MAE=3.145, R²=-0.013




In [None]:
import joblib
import json   #joblib ve json ile işlem için yapay zeka yardımına başvurdum
joblib.dump(best_model_xgb, 'models/xgboost_dataset1.pkl')
xgb_best_params = {
    'dataset1_params': grid_xgb.best_params_
}
print(grid_xgb.best_params_)

{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'shiftnum': 2, 'subsample': 0.8}


XGBOOST DATASET 2

In [3]:
pn = pd.read_csv('Significant Earthquake Dataset 1900-2023.csv')
df2 = pn.copy()
df2 = df2.rename(columns={
    'Time':'time', 'Mag':'mag', 'Depth':'depth', 
    'Latitude':'latitude', 'Longitude':'longitude'
})

df2['time'] = pd.to_datetime(df2['time'])
df2 = df2.sort_values('time')
df2 = df2.dropna(subset=['latitude','longitude','depth','mag','time'])


dfyear_xgb = df2.set_index('time').resample('YE').apply({
    'mag':'mean',
    'latitude':'mean',
    'longitude':'mean',
    'depth':'mean',  #Yapay zekadan 1. datasette one hot encode verileri için yardım almıştım,burada da aynı yöntemi kullandım
    'MagType': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 and len(x) > 0 else 'Unknown',
    'Type': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 and len(x) > 0 else 'Unknown',
    'status': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 and len(x) > 0 else 'Unknown',
    'net': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 and len(x) > 0 else 'Unknown'
})

# Label uyuşmazlığı hatası alınca ekledim
dfyear_xgb = dfyear_xgb.rename(columns={'MagType':'magType','Type':'type'})

dfyear_xgb = dfyear_xgb.reset_index(drop=True)
dfyear_xgb.index = dfyear_xgb.index + 1
dfyear_xgb.index.name = 'timeindex'

dfyear_xgb['futuremag'] = dfyear_xgb['mag'].shift(-1)
dfyear_xgb['futuredepth'] = dfyear_xgb['depth'].shift(-1)
dfyear_xgb['futurelat'] = dfyear_xgb['latitude'].shift(-1)
dfyear_xgb['futurelon'] = dfyear_xgb['longitude'].shift(-1)
dfyear_xgb = dfyear_xgb.dropna(subset=['futuremag','futuredepth','futurelat','futurelon'])

xgb_hiperparametreler_2 = {
    'n_estimators': [200,320,400],
    'max_depth': [4,6,8,10],
    'learning_rate': [0.05,0.11,0.15],
    'subsample': [0.7,0.8,0.9],
    'colsample_bytree': [0.7,0.8,0.9],
    'min_child_weight': [1,2,3],
    'reg_alpha': [0,0.1,0.3],
    'reg_lambda': [1,1.3,1.5],
    'shiftnum': [3,4,5]
}

train_xgb_2 = dfyear_xgb[:int((4*len(dfyear_xgb))/5)]
test_xgb_2 = dfyear_xgb[int(4*len(dfyear_xgb)/5):]

splitter_xgb_2 = TimeSeriesSplit(n_splits=3)
grid_xgb_2 = GridSearchCV(
    estimator=XGBoostEarthquakePredictor(random_state=42),
    param_grid=xgb_hiperparametreler_2,
    cv=splitter_xgb_2,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_xgb_2.fit(train_xgb_2)

best_model_xgb_2 = grid_xgb_2.best_estimator_

test_predictions_xgb_2 = best_model_xgb_2.predict(test_xgb_2)
test_actual_xgb_2 = test_xgb_2[['futuremag', 'futuredepth', 'futurelat', 'futurelon']].dropna()
min_len_xgb_2 = min(len(test_predictions_xgb_2), len(test_actual_xgb_2))


target_names = ['futuremag', 'futuredepth', 'futurelat', 'futurelon']
target_labels = ['Magnitude', 'Depth', 'Latitude', 'Longitude']

#Yine dataset 1 deki gibi sonuç yazdığımız kısım,aynı şekilde kullandım,sadece değişken isimleri farklı
for i, (name, label) in enumerate(zip(target_names, target_labels)):
    if i < test_predictions_xgb_2.shape[1] and i < test_actual_xgb_2.shape[1]:
        target_mse = mean_squared_error(
            test_actual_xgb_2.iloc[:min_len_xgb_2, i], 
            test_predictions_xgb_2[:min_len_xgb_2, i]
        )
        target_mae = mean_absolute_error(
            test_actual_xgb_2.iloc[:min_len_xgb_2, i], 
            test_predictions_xgb_2[:min_len_xgb_2, i]
        )
        target_r2 = r2_score(
            test_actual_xgb_2.iloc[:min_len_xgb_2, i], 
            test_predictions_xgb_2[:min_len_xgb_2, i]
        )
        
        print(f"   {label:12}: MSE={target_mse:.3f}, MAE={target_mae:.3f}, R²={target_r2:.3f}")


Fitting 3 folds for each of 26244 candidates, totalling 78732 fits




   Magnitude   : MSE=0.001, MAE=0.022, R²=-1.111
   Depth       : MSE=155.373, MAE=10.910, R²=-0.377
   Latitude    : MSE=16.833, MAE=3.190, R²=-0.443
   Longitude   : MSE=202.669, MAE=11.954, R²=-0.145


In [4]:
joblib.dump(best_model_xgb_2, 'models/xgboost_dataset2.pkl')
xgb_best_params['dataset2_params'] = grid_xgb_2.best_params_
print(grid_xgb_2.best_params_)
with open('models/xgboost_best_params.json', 'w') as f:
    json.dump(xgb_best_params, f, indent=2)

{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1, 'shiftnum': 3, 'subsample': 0.7}
