In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet

import sys
from memory_profiler import profile

# Show every column when a DataFrame is displayed (no column truncation).
# Uses the documented top-level `pd.set_option` rather than reaching it through
# the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

import warnings

# Suppressing all warnings within the context
with warnings.catch_warnings():
    warnings.simplefilter("ignore")



In [2]:
# Directory (relative to the notebook) that holds image files.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
image_path = "../datasets/images"

In [3]:
# Read the prepared dataset from disk and preview its first five rows.
final_df = pd.read_csv(
    filepath_or_buffer='../datasets/final_dataset/cyclic_scaled_data.csv',
)
final_df.head(n=5)

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,day_name,month_sin,month_cos,hour_sin,hour_cos,minutes_sin,minutes_cos
0,0.578374,0.576498,-0.804832,4.1,0.0,1.205836,0.254693,-1.166843,2024,5.0,-0.5,-0.866025,-0.707107,0.707107,-0.913545,-0.406737
1,-0.092464,-0.276457,-0.242839,2.5,0.0,0.654375,-0.849625,-0.68027,2024,5.0,-0.5,-0.866025,-0.707107,0.707107,0.866025,0.5
2,1.319964,-0.969834,1.611773,3.6,0.0,0.917097,1.719603,-0.915982,2024,5.0,-0.5,-0.866025,-0.965926,0.258819,-0.669131,0.743145
3,0.440801,0.967439,-0.613616,4.0,0.0,1.251187,-0.00824,-1.205488,2024,5.0,-0.5,-0.866025,-0.965926,0.258819,0.951057,-0.309017
4,0.353824,-0.675245,-0.67262,2.72,0.0,0.781542,-0.173512,-0.79527,2024,5.0,-0.5,-0.866025,-0.965926,0.258819,0.743145,0.669131


In [4]:
# Show the (rows, columns) shape of the loaded frame as the cell output.
final_df.shape

(314351, 16)

#### Selected Features After Feature Engineering

In [5]:
# Columns retained after feature engineering; any other column of the loaded
# frame is dropped by the selection below.
selected_independent_feature = [
    'latitude', 'longitude', 'depth',
    'magnitude', 'magnitude_type',
    'distance', 'gravity', 'force',
    'year', 'month_sin', 'month_cos',
]

# Report how many columns were selected.
print(len(selected_independent_feature))

# Narrow the frame to the selected columns (rebinds `final_df`) and preview it.
final_df = final_df[selected_independent_feature]
final_df.head()

11


Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,month_sin,month_cos
0,0.578374,0.576498,-0.804832,4.1,0.0,1.205836,0.254693,-1.166843,2024,-0.5,-0.866025
1,-0.092464,-0.276457,-0.242839,2.5,0.0,0.654375,-0.849625,-0.68027,2024,-0.5,-0.866025
2,1.319964,-0.969834,1.611773,3.6,0.0,0.917097,1.719603,-0.915982,2024,-0.5,-0.866025
3,0.440801,0.967439,-0.613616,4.0,0.0,1.251187,-0.00824,-1.205488,2024,-0.5,-0.866025
4,0.353824,-0.675245,-0.67262,2.72,0.0,0.781542,-0.173512,-0.79527,2024,-0.5,-0.866025


In [6]:
# final_df.to_csv("../datasets/final_dataset/total_data.csv", index=False)

### Select the Sample Data

    - Since the population size is 314,351 rows, calculate the sizes for the 25%, 50%, 65% and 80% samples:

In [7]:
def _sample_fraction(df, fraction, random_state=42):
    """Draw a seeded random subset covering ``fraction`` of the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    fraction : float
        Share of rows to keep. The row count is ``int(fraction * len(df))``,
        i.e. truncated toward zero.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    tuple
        ``(n_rows, sample)``: the computed row count and the sampled frame.
    """
    n_rows = int(fraction * len(df))
    return n_rows, df.sample(n=n_rows, random_state=random_state)


# Determine the sample sizes
total_size = len(final_df)  # Total number of rows in the dataset
print("Total size: ",total_size)

# Select the samples: one helper call per fraction instead of four copy-pasted
# size/sample computations. All original variable names are kept.
sample_size_25, sample_25 = _sample_fraction(final_df, 0.25)  # 25% of the total data
sample_size_50, sample_50 = _sample_fraction(final_df, 0.50)  # 50% of the total data
sample_size_65, sample_65 = _sample_fraction(final_df, 0.65)  # 65% of the total data
sample_size_80, sample_80 = _sample_fraction(final_df, 0.80)  # 80% of the total data

print("sample_25: ", sample_25.shape)
print("sample_50: ", sample_50.shape)
print("sample_65: ", sample_65.shape)
print("sample_80: ", sample_80.shape)

Total size:  314351
sample_25:  (78587, 11)
sample_50:  (157175, 11)
sample_65:  (204328, 11)
sample_80:  (251480, 11)


In [8]:
# Save Sample data to file
# sample_25.to_csv("../datasets/final_dataset/sample_25.csv", index=False)
# sample_50.to_csv("../datasets/final_dataset/sample_50.csv", index=False)
# sample_65.to_csv("../datasets/final_dataset/sample_65.csv", index=False)
# sample_80.to_csv("../datasets/final_dataset/sample_80.csv", index=False)

In [9]:
# Choose which of the pre-drawn samples is bound to `df_sample`.
# Exactly one assignment should be active: to switch to a larger sample,
# uncomment the wanted line below and comment out the current one.
df_sample = sample_25
# df_sample = sample_50
# df_sample = sample_65
# df_sample = sample_80

# Report the shape of the chosen sample.
print("New Data Sample", df_sample.shape)

New Data Sample (78587, 11)
