In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

# Load the dataset
file_path = "earthquake_1995-2023.csv"
df = pd.read_csv(file_path)

# Selecting features and target variable
target = "magnitude"
features = ["latitude", "longitude", "depth", "cdi", "mmi", "tsunami", "sig", "dmin", "gap", "nst", "magType"]

# Drop rows with missing target
df = df.dropna(subset=[target])

# Fill missing numerical values with median
for col in ["cdi", "mmi", "sig", "dmin", "gap", "nst"]:
    df[col] = df[col].fillna(df[col].median())

# One-hot encode categorical features
categorical_features = ["magType"]
numerical_features = list(set(features) - set(categorical_features))

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Split the dataset
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


Mean Absolute Error: 0.1033160000000015


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset
file_path = "earthquake_1995-2023.csv"
df = pd.read_csv(file_path)

# Selecting features and target variable
target = "magnitude"
features = ["latitude", "longitude", "depth", "cdi", "mmi", "tsunami", "sig", "dmin", "gap", "nst", "magType"]

# Drop rows with missing target
df = df.dropna(subset=[target])

# Fill missing numerical values with median
for col in ["cdi", "mmi", "sig", "dmin", "gap", "nst"]:
    df[col] = df[col].fillna(df[col].median())

# One-hot encode categorical features
categorical_features = ["magType"]
numerical_features = list(set(features) - set(categorical_features))

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Split the dataset
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

# Display actual vs predicted values
results = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print(results.head(20))


Mean Absolute Error: 0.1033160000000015
R-squared Score: 0.7820859449354391
    Actual  Predicted
0      7.8      7.448
1      6.8      6.798
2      6.6      6.599
3      6.6      6.597
4      6.8      6.866
5      7.6      7.183
6      7.9      7.418
7      6.7      6.747
8      6.9      6.900
9      6.6      6.599
10     6.9      6.899
11     7.3      7.132
12     6.8      6.952
13     7.9      7.882
14     6.9      6.897
15     6.6      6.600
16     7.2      7.153
17     6.6      6.600
18     6.5      6.500
19     7.0      6.812


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset
file_path = "earthquake_1995-2023.csv"
df = pd.read_csv(file_path)

# Selecting features and target variable
target = "magnitude"
features = ["latitude", "longitude", "depth", "cdi", "mmi", "tsunami", "sig", "dmin", "gap", "nst", "magType"]

# Drop rows with missing target
df = df.dropna(subset=[target])

# Fill missing numerical values with median
for col in ["cdi", "mmi", "sig", "dmin", "gap", "nst"]:
    df[col] = df[col].fillna(df[col].median())

# One-hot encode categorical features
categorical_features = ["magType"]
numerical_features = list(set(features) - set(categorical_features))

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Split the dataset
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define different models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Linear Regression": LinearRegression(),
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5),
    "Support Vector Machine": SVR()
}

# Train and evaluate each model
results = {}
for name, regressor in models.items():
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MAE": mae, "R2 Score": r2}
    print(f"{name} - Mean Absolute Error: {mae}, R-squared Score: {r2}")

# Print actual vs predicted values for the best model (Random Forest as default)
best_model_name = max(results, key=lambda x: results[x]["R2 Score"])
best_model = models[best_model_name]
best_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", best_model)
])
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)
print(f"Best Model: {best_model_name}")

final_results = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print(final_results.head(20))


Random Forest - Mean Absolute Error: 0.1033160000000015, R-squared Score: 0.7820859449354391
Gradient Boosting - Mean Absolute Error: 0.11528671279321731, R-squared Score: 0.7632242485596796
Linear Regression - Mean Absolute Error: 0.2885600527291944, R-squared Score: 0.212210710487308
K-Nearest Neighbors - Mean Absolute Error: 0.28356000000000003, R-squared Score: 0.26261590569135196
Support Vector Machine - Mean Absolute Error: 0.16319767777216068, R-squared Score: 0.6541413368002252
Best Model: Random Forest
    Actual  Predicted
0      7.8      7.448
1      6.8      6.798
2      6.6      6.599
3      6.6      6.597
4      6.8      6.866
5      7.6      7.183
6      7.9      7.418
7      6.7      6.747
8      6.9      6.900
9      6.6      6.599
10     6.9      6.899
11     7.3      7.132
12     6.8      6.952
13     7.9      7.882
14     6.9      6.897
15     6.6      6.600
16     7.2      7.153
17     6.6      6.600
18     6.5      6.500
19     7.0      6.812


In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score

# Load the dataset
file_path = "earthquake_1995-2023.csv"
df = pd.read_csv(file_path)

# Selecting features and target variable
target = "magnitude"
features = ["latitude", "longitude", "depth", "cdi", "mmi", "tsunami", "sig", "dmin", "gap", "nst", "magType"]

# Drop rows with missing target
df = df.dropna(subset=[target])

# Fill missing numerical values with median
for col in ["cdi", "mmi", "sig", "dmin", "gap", "nst"]:
    df[col] = df[col].fillna(df[col].median())

# One-hot encode categorical features
categorical_features = ["magType"]
numerical_features = list(set(features) - set(categorical_features))

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Split the dataset
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define different models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Linear Regression": LinearRegression(),
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5),
    "Support Vector Machine": SVR()
}

# Train and evaluate each model
results = {}
for name, regressor in models.items():
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MAE": mae, "R2 Score": r2}
    print(f"{name} - Mean Absolute Error: {mae}, R-squared Score: {r2}")

# Select the best model
best_model_name = max(results, key=lambda x: results[x]["R2 Score"])
best_model = models[best_model_name]
best_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", best_model)
])
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)
print(f"Best Model: {best_model_name}")

final_results = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print(final_results.head(20))

# Save the best model
joblib.dump(best_pipeline, "earthquake_model.pkl")

Random Forest - Mean Absolute Error: 0.10328900000000169, R-squared Score: 0.7862192530188004
Gradient Boosting - Mean Absolute Error: 0.11502069325002197, R-squared Score: 0.76437132473005
Linear Regression - Mean Absolute Error: 0.28856005272919427, R-squared Score: 0.21221071048730833
K-Nearest Neighbors - Mean Absolute Error: 0.28356000000000003, R-squared Score: 0.26261590569135196
Support Vector Machine - Mean Absolute Error: 0.16319767777216068, R-squared Score: 0.6541413368002256
Best Model: Random Forest
    Actual  Predicted
0      7.8     7.4610
1      6.8     6.7980
2      6.6     6.5980
3      6.6     6.5960
4      6.8     6.8760
5      7.6     7.2200
6      7.9     7.4170
7      6.7     6.7460
8      6.9     6.8990
9      6.6     6.6000
10     6.9     6.8970
11     7.3     7.1100
12     6.8     6.9640
13     7.9     7.8575
14     6.9     6.8960
15     6.6     6.6000
16     7.2     7.1530
17     6.6     6.6000
18     6.5     6.5000
19     7.0     6.8180


['earthquake_model.pkl']

In [5]:
import pickle
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor  # Example model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Load dataset
df = pd.read_csv("earthquake_1995-2023.csv")

# Data Preprocessing (Modify as per your dataset)
X = df[['latitude', 'longitude', 'depth', 'cdi', 'mmi', 'tsunami', 'sig', 'dmin', 'gap', 'nst']]
y = df['magnitude']  # Target column

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Save the trained model as a pipeline
with open("earthquake_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("Model saved successfully as earthquake_model.pkl")


Model saved successfully as earthquake_model.pkl
