In [1]:
from pathlib import Path

import pandas as pd

# Raw data

In [2]:
# Load the raw earthquake catalogue; the first CSV column is used as the row index.
data = pd.read_csv("data_generative.csv", index_col=0)

In [3]:
# Preview the raw table (pandas truncates long frames in the rendered output).
data

Unnamed: 0,Magnitude,Place,Time,Longitude,Latitude,Depth
0,4.8,"108 km ESE of Tatsugō, Japan",1704033458234,130.6658,28.1762,24.420
1,4.3,"116 km SE of Kuril’sk, Russia",1703996366515,148.9932,44.5454,35.000
2,4.2,"Maug Islands region, Northern Mariana Islands",1703991740201,145.6748,20.0351,123.949
3,4.2,"64 km NE of Otrada, Russia",1703981451692,146.4847,44.4452,155.813
4,4.4,"108 km SE of Kuril’sk, Russia",1703980548472,148.9343,44.6077,35.000
...,...,...,...,...,...,...
8637,4.7,"101 km ENE of Shikotan, Russia",1514897395840,147.9473,44.0175,69.860
8638,4.5,"7 km NW of Ichihara, Japan",1514890119800,140.0338,35.5756,42.220
8639,4.4,"Volcano Islands, Japan region",1514861626720,142.9983,23.1461,91.110
8640,4.2,"87 km ESE of Yujing, Taiwan",1514834523580,121.2799,22.9136,10.000


# Reshaping data

Below we build one row for each non-overlapping sequence of 6 consecutive earthquakes (ordered by time): the times, longitudes, latitudes and magnitudes of the first 5 earthquakes, plus the time of the sixth earthquake (the target column).

In [4]:
# Order the events chronologically, then convert the millisecond timestamps
# to whole seconds (integer division by 1000).
# NOTE: this rebinds `data`, so re-running the cell divides `Time` by 1000 again;
# execute it once per fresh load of the CSV.
data = data.sort_values(by="Time").assign(
    Time=lambda frame: pd.to_numeric(frame["Time"]) // 1000
)

In [5]:
# Split the time-sorted catalogue into consecutive, non-overlapping windows of
# GROUP_SIZE earthquakes and flatten each complete window into a one-row frame:
#   Time{i}, Longitude{i}, Latitude{i}, Magnitude{i}  for the first GROUP_SIZE - 1 events
#   Time{GROUP_SIZE - 1}                              for the last event (target column)
GROUP_SIZE = 6
FEATURE_COLUMNS = ("Longitude", "Latitude", "Magnitude")

groups = [data.iloc[i:i + GROUP_SIZE] for i in range(0, len(data), GROUP_SIZE)]
dfs = []

for group in groups:
    # The trailing window is shorter whenever len(data) is not a multiple of
    # GROUP_SIZE. It has no target event, so skip it here instead of emitting
    # a row padded with NaN.
    if len(group) < GROUP_SIZE:
        continue

    record = {}
    for idx, (_, row) in enumerate(group.iterrows()):
        # Stored as float so the column dtype matches the float timestamps of
        # the previously rendered/exported table.
        record[f"Time{idx}"] = float(row["Time"])
        if idx < GROUP_SIZE - 1:
            for column in FEATURE_COLUMNS:
                record[f"{column}{idx}"] = row[column]

    # Build the row in one shot from the ordered dict rather than enlarging an
    # empty DataFrame one cell at a time with .loc (slow, dtype-unstable).
    dfs.append(pd.DataFrame([record]))

In [None]:
# Stack the one-row frames under a fresh 0..n-1 index, then discard every row
# that contains a missing value (e.g. an incomplete trailing window).
df = pd.concat(dfs, ignore_index=True).dropna()

In [None]:
# Preview the reshaped table: one row per window of earthquakes.
df

Unnamed: 0,Time0,Longitude0,Latitude0,Magnitude0,Time1,Longitude1,Latitude1,Magnitude1,Time2,Longitude2,...,Magnitude2,Time3,Longitude3,Latitude3,Magnitude3,Time4,Longitude4,Latitude4,Magnitude4,Time5
0,1.514816e+12,141.8955,38.2761,4.3,1.514835e+12,121.2799,22.9136,4.2,1.514862e+12,142.9983,...,4.4,1.514890e+12,140.0338,35.5756,4.5,1.514897e+12,147.9473,44.0175,4.7,1.514921e+12
1,1.514928e+12,142.0927,37.3027,4.3,1.514935e+12,140.5556,29.5079,4.5,1.514982e+12,140.4409,...,4.7,1.514982e+12,140.3036,29.5489,4.6,1.514992e+12,140.3128,29.5928,4.3,1.515008e+12
2,1.515015e+12,140.5205,29.6374,5.5,1.515015e+12,140.5166,29.6113,5.1,1.515030e+12,143.0351,...,4.3,1.515047e+12,140.1703,35.5957,4.4,1.515068e+12,124.6215,45.4416,4.3,1.515072e+12
3,1.515076e+12,140.5203,29.5673,4.6,1.515076e+12,140.2602,29.6604,4.6,1.515118e+12,140.7786,...,4.4,1.515124e+12,140.5766,29.6055,4.6,1.515150e+12,141.3471,35.2909,4.3,1.515168e+12
4,1.515181e+12,139.2678,34.8460,4.5,1.515185e+12,139.2303,34.8211,4.7,1.515235e+12,142.3725,...,4.7,1.515246e+12,142.5355,40.7851,4.6,1.515317e+12,141.2623,36.3315,4.2,1.515369e+12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,1.703578e+12,128.5175,25.9254,4.4,1.703603e+12,139.7096,31.5109,4.8,1.703618e+12,131.8030,...,4.6,1.703630e+12,128.6231,26.0895,4.0,1.703631e+12,142.2525,41.9589,4.7,1.703634e+12
1436,1.703648e+12,145.2254,20.2049,4.2,1.703755e+12,149.0388,44.5960,6.5,1.703756e+12,149.0713,...,5.0,1.703758e+12,149.0796,44.4420,4.5,1.703764e+12,148.8750,44.5081,4.6,1.703767e+12
1437,1.703769e+12,142.4133,40.5601,4.8,1.703775e+12,128.9747,27.8668,4.5,1.703776e+12,149.0767,...,4.6,1.703778e+12,148.8878,44.4857,4.3,1.703782e+12,149.0500,44.4510,4.4,1.703812e+12
1438,1.703820e+12,149.1838,44.4931,4.3,1.703825e+12,141.7204,32.3046,4.9,1.703829e+12,141.8624,...,4.2,1.703859e+12,145.2178,20.4001,4.2,1.703913e+12,143.1610,27.7127,4.5,1.703916e+12


In [None]:
# Persist the full reshaped table; with to_csv's default index=True the row
# index is written as the first (unnamed) column.
df.to_csv("data_regression.csv")

# Saving to data folder

In [None]:
# Export contiguous, non-overlapping row ranges of `df` as train/test files.
# Each entry is (destination, first row position, one-past-last row position).
SPLITS = [
    ("data/public/train.csv", 0, 800),
    ("data/public/test.csv", 800, 1000),
    ("data/train.csv", 1000, 1352),
    ("data/test.csv", 1352, 1440),
]

for destination, start, stop in SPLITS:
    target = Path(destination)
    # to_csv does not create missing folders, so make sure the destination
    # directory exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Positional slicing (iloc) is unaffected by index gaps left by dropna.
    df.iloc[start:stop].to_csv(target, index=False)