In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import zscore

# --- Configuration ---------------------------------------------------------
# Input CSV, the device ID to keep, and the output path derived from that ID.
csv_path = "./data/Landslide_dataSet1.csv"
device = "101"
save_path = f"./data/dev{device}_resample30s_set1.csv"
#save_path = f"data/devAll2_prepared.csv"

# --- Load ------------------------------------------------------------------
df = pd.read_csv(csv_path)

# Strip header whitespace BEFORE any column is addressed by name; otherwise a
# padded header (e.g. " soil") makes the lookups below raise KeyError.
df.columns = df.columns.str.strip()

# Coerce the sensor/position columns to numeric; cells that cannot be parsed
# become NaN instead of raising.
for col in ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

def clean_and_format_timestamp(val):
    """Normalise one raw timestamp cell to an ISO-8601 string.

    The literal marker ``(ICT)`` is removed and surrounding whitespace is
    stripped before the value is parsed with ``pd.to_datetime``.

    Parameters
    ----------
    val : object
        Raw cell value; it is converted with ``str()`` before cleaning.

    Returns
    -------
    str or None
        ``isoformat()`` of the parsed timestamp, or ``None`` when the value
        is null, is empty after cleaning, or cannot be parsed.
    """
    if pd.isnull(val):
        return None

    text = str(val).replace('(ICT)', '').strip()
    try:
        parsed = pd.to_datetime(text)
    except Exception:
        # Deliberate best-effort: any value pandas cannot parse is treated as
        # missing rather than aborting the whole column.
        return None

    # An empty string parses to NaT without raising, and NaT.isoformat() is
    # the literal string 'NaT'; report it as missing like every other bad
    # value instead of leaking that sentinel string to the caller.
    if pd.isnull(parsed):
        return None
    return parsed.isoformat()

# Strip header whitespace before any column is addressed by name, so that a
# padded header cannot make the lookups below raise KeyError.
df.columns = df.columns.str.strip()

# Clean every raw timestamp cell, then parse the column to datetime64;
# anything that still fails to parse becomes NaT.
# NOTE(review): the cleaned values are ISO strings that get parsed a second
# time here, and pandas infers a single format for the whole column. Confirm
# that no valid rows are coerced to NaT by a format mismatch (e.g. values
# with and without fractional seconds); format='ISO8601' may be needed.
df['timestamp'] = pd.to_datetime(
    df['timestamp'].apply(clean_and_format_timestamp),
    errors='coerce',
)

# A row is only usable when it has both a timestamp and a geo value.
df = df.dropna(subset=['timestamp', 'geo'])

# Drop bookkeeping columns when present (errors='ignore' tolerates absence).
df = df.drop(
    columns=['unixt', 'gateway time', 'old time', 'risk_level', 'confidence'],
    errors='ignore',
)

# Row filters, combined into one mask. Each condition is evaluated per row,
# so this selects exactly the rows that applying them one after another would.
# NOTE(review): the upper bound 8000.0 on 'lat' is far outside any valid
# latitude - confirm the intended limit.
keep = (
    (df['devID'] != 218499049.0)
    & (df['lat'] > 1.0)
    & (df['lat'] < 8000.0)
    & (df['lng'] > 15.0)
    & (df['soil'] != 0.0)
    & (df['rain'] != -1.0)
    & (df['devID'] == int(device))   # keep only the configured device
)
df = df[keep]

  df = pd.read_csv(csv_path)


In [9]:
# Analysis window; both bounds are inclusive.
start_date = "2025-05-02 12:00:00"
end_date = "2025-05-04 12:00:00"

in_window = df['timestamp'].between(start_date, end_date)
df = df.loc[in_window]

# Show the first and last timestamp that survived the filter.
for bound in (df['timestamp'].min(), df['timestamp'].max()):
    print(bound)

2025-05-02 12:00:04.158718
2025-05-04 11:59:37.162185


In [10]:
# Descriptive statistics of `df` as it stands after the filters applied so
# far; displayed as the cell output.
df.describe()

Unnamed: 0,timestamp,devID,soil,rain,temp,humi,geo,lat,lng
count,12955,12955.0,12955.0,12955.0,12955.0,12955.0,12955.0,12955.0,12955.0
mean,2025-05-03 09:59:02.216766976,101.0,43.131648,0.913728,29.814618,76.849038,10.837309,8.637986,99.899004
min,2025-05-02 12:00:04.158718,101.0,27.299999,0.0,25.048067,59.45739,0.515554,8.637954,99.898972
25%,2025-05-02 22:09:03.004602112,101.0,27.9,0.0,27.462044,71.697029,7.343577,8.637979,99.898994
50%,2025-05-03 08:49:04.715566080,101.0,49.0,0.0,28.960096,79.551613,9.774537,8.637985,99.899002
75%,2025-05-03 21:24:04.784700416,101.0,50.700001,0.0,32.690544,80.966888,14.575957,8.637992,99.89901
max,2025-05-04 11:59:37.162185,101.0,55.799999,41.6306,35.665295,87.270775,32.059425,8.638022,99.899033
std,,0.0,10.596553,3.97264,2.758014,6.405804,5.474844,1.2e-05,1.2e-05


In [11]:

# Resampling bin width: 30 seconds.
window = '30s'
features = ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']

# Make sure the timestamp column really is datetime64 BEFORE ordering by it,
# so the sort is chronological rather than lexicographic.
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

df = df.sort_values('timestamp').reset_index(drop=True)

# Time-indexed view used as the base for resampling.
df_resample_base = df.set_index('timestamp')

# Per-bin mean and standard deviation of every feature.
# The std columns are not kept in the final frame, but they take part in the
# dropna() below: std is NaN for a bin with fewer than two non-null values in
# a column, so such bins are removed together with the empty ones.
# min/max are not aggregated: they were never used, and they are NaN only
# when the mean is NaN too, so leaving them out cannot change which bins
# survive dropna().
df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std'])

# Flatten the (feature, statistic) column MultiIndex to e.g. 'soil_mean'.
df_rolling.columns = ['_'.join(col) for col in df_rolling.columns]
df_rolling['devID'] = int(device)
df_rolling = df_rolling.dropna()
df_rolling = df_rolling.reset_index()

# Hour of day of each bin, then keep only the columns that get exported.
df_rolling['hour'] = df_rolling['timestamp'].dt.hour
df_rolling = df_rolling[['timestamp', 'devID', 'soil_mean', 'rain_mean', 'temp_mean', 'humi_mean', 'geo_mean', 'hour']]
df_rolling

Unnamed: 0,timestamp,devID,soil_mean,rain_mean,temp_mean,humi_mean,geo_mean,hour
0,2025-05-02 12:00:00,101,28.766666,0.0,35.233590,61.564409,14.308251,12
1,2025-05-02 12:00:30,101,28.799999,0.0,35.331501,62.408737,15.891333,12
2,2025-05-02 12:01:00,101,28.799999,0.0,35.379566,61.602236,14.088287,12
3,2025-05-02 12:01:30,101,28.766666,0.0,35.430302,63.564610,15.119175,12
4,2025-05-02 12:02:00,101,28.766666,0.0,35.508638,64.492868,14.641307,12
...,...,...,...,...,...,...,...,...
4881,2025-05-04 11:55:00,101,48.366667,0.0,33.772026,69.169754,10.193673,11
4882,2025-05-04 11:56:30,101,48.400002,0.0,33.766685,69.380520,14.183249,11
4883,2025-05-04 11:57:00,101,48.400002,0.0,33.756004,67.130768,8.703913,11
4884,2025-05-04 11:57:30,101,48.350000,0.0,33.789387,68.244675,8.998999,11


In [12]:
# Print the column dtypes, non-null counts and memory usage of `df_rolling`.
df_rolling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4886 entries, 0 to 4885
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  4886 non-null   datetime64[ns]
 1   devID      4886 non-null   int64         
 2   soil_mean  4886 non-null   float64       
 3   rain_mean  4886 non-null   float64       
 4   temp_mean  4886 non-null   float64       
 5   humi_mean  4886 non-null   float64       
 6   geo_mean   4886 non-null   float64       
 7   hour       4886 non-null   int32         
dtypes: datetime64[ns](1), float64(5), int32(1), int64(1)
memory usage: 286.4 KB


In [13]:
# Count missing values per column and keep only the columns that have any.
nan_counts = df_rolling.isnull().sum(axis=0)
nan_columns = nan_counts.loc[nan_counts.gt(0)]

# An empty Series in the printout means no column contains NaN.
print("📌 คอลัมน์ที่มีค่า NaN:")
print(nan_columns)

📌 คอลัมน์ที่มีค่า NaN:
Series([], dtype: int64)


In [14]:

# Write the resampled frame to disk without the positional index column.
df_rolling.to_csv(save_path, index=False)

print("✅ บันทึก เรียบร้อยแล้ว", save_path)

✅ บันทึก เรียบร้อยแล้ว ./data/dev101_resample30s_set1.csv
