In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import zscore

# --- Configuration -----------------------------------------------------------
# Raw sensor export to read.
csv_path = "./data/Landslide_dataSet1.csv"
# Device to keep; stored as a string and converted with int(device) where it
# is compared against the numeric devID column.
device = "101"
# Destination of the per-device, resampled output.
save_path = f"./data/dev{device}_resample1T.csv"
#save_path = f"data/devAll2_prepared.csv"

# --- Load --------------------------------------------------------------------
df = pd.read_csv(csv_path)

# Strip padding from the header names *before* any column is looked up by
# name; otherwise a header such as " soil" would raise KeyError below.
df.columns = df.columns.str.strip()

# Force the sensor / position columns to numeric dtype. Values that cannot be
# parsed become NaN (errors='coerce') instead of aborting the load.
for col in ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Earlier regex-based handling of the "(ICT)" suffix, kept for reference only.
#df = df[~df['timestamp'].str.contains(r'\(ICT\)', na=False)]
#df['timestamp'] = df['timestamp'].str.replace(r'\(ICT\)', '', regex=True).str.strip()

def clean_and_format_timestamp(val):
    """Normalise one raw timestamp value to an ISO-8601 string.

    The literal suffix ``(ICT)`` is removed and surrounding whitespace is
    stripped before the text is parsed with ``pd.to_datetime``.

    Parameters
    ----------
    val : object
        A single raw cell value from the timestamp column.

    Returns
    -------
    str or None
        ``isoformat()`` of the parsed timestamp, or ``None`` when the value is
        missing, blank, or cannot be parsed.
    """
    if pd.isnull(val):
        return None

    text = str(val).replace('(ICT)', '').strip()
    try:
        parsed = pd.to_datetime(text)
    except Exception:
        # Deliberately best-effort: one malformed cell must not abort the
        # whole column; the caller drops rows whose timestamp is missing.
        return None

    # Blank text parses to NaT without raising, and NaT.isoformat() would
    # return the literal string 'NaT' rather than signalling "missing".
    if pd.isnull(parsed):
        return None
    return parsed.isoformat()

# Normalise every raw timestamp to an ISO-8601 string (None when unusable),
# then convert the column to datetime; anything unparseable becomes NaT.
df['timestamp'] = df['timestamp'].apply(clean_and_format_timestamp)
df.columns = df.columns.str.strip()
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# A row is only usable when both its timestamp and its geo reading are present.
df = df.dropna(subset=['timestamp', 'geo'])

# Bookkeeping / label columns that are not needed downstream
# (errors='ignore' tolerates files that lack some of them).
unused_columns = ['unixt', 'gateway time', 'old time', 'risk_level', 'confidence']
df = df.drop(columns=unused_columns, errors='ignore')

# Row filters, combined into a single mask.
# NOTE(review): the numeric thresholds look like sentinel / invalid-value
# guards - confirm their meaning with the data owner.
# NOTE(review): lng has no upper bound here, and the describe() output for this
# device shows a lng max of 995.549988 - confirm whether a cap is wanted.
keep_row = (
    (df['devID'] != 218499049.0)
    & (df['lat'] > 1.0)
    & (df['lat'] < 8000.0)
    & (df['lng'] > 15.0)
    & (df['soil'] != 0.0)
    & (df['rain'] != -1.0)
    & (df['devID'] == int(device))   # keep only the configured device
)
df = df[keep_row]

  df = pd.read_csv(csv_path)


In [3]:
# Optional: restrict the data to a specific time window (currently disabled).
#start_date = "2025-07-20 16:30:00"
#end_date = "2025-07-20 18:30:00"
#
#df = df[(df['timestamp'] >= start_date) & (df['timestamp'] <= end_date)]
#print(df['timestamp'].min())
#print(df['timestamp'].max())

In [4]:
# Summary statistics of the cleaned frame after it was filtered to one device.
df.describe()

Unnamed: 0,timestamp,devID,soil,rain,temp,humi,geo,lat,lng
count,186716,186716.0,186716.0,186716.0,186716.0,186716.0,186716.0,186716.0,186716.0
mean,2025-06-11 23:05:53.934464256,101.0,43.662804,0.178172,29.789759,75.000908,11.191679,8.638012,99.903766
min,2025-05-02 11:42:06.635276,101.0,21.9,0.0,24.279013,46.977417,0.0,8.63794,99.166664
25%,2025-05-30 23:59:11.089130496,101.0,30.700001,0.0,27.42466,70.10437,4.957915,8.638005,99.898964
50%,2025-06-14 05:29:32.704999936,101.0,46.400002,0.0,28.551537,78.691383,11.354394,8.638015,99.898972
75%,2025-06-27 17:53:07.887500032,101.0,52.599998,0.0,32.279316,80.631187,15.290937,8.638023,99.898979
max,2025-07-22 19:39:41.789000,101.0,66.599998,41.6306,38.645382,87.270775,32.062466,8.638072,995.549988
std,,0.0,12.374522,1.517653,3.072548,8.343398,6.830587,1.8e-05,2.072756


In [5]:

# Resampling window: one-minute bins. 'min' is the current spelling of the
# minute offset alias; the old 'T' alias is deprecated and emits a FutureWarning.
window = '1min'
features = ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']

df = df.sort_values('timestamp').reset_index(drop=True)

# Ensure timestamp is datetime and set as index for the base DataFrame used for resampling
if 'timestamp' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = pd.to_datetime(df['timestamp'])

df_resample_base = df.set_index('timestamp')

# Per-window statistics for every feature; the result has two-level columns
# (feature, statistic) that are flattened to names such as 'soil_mean'.
df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std', 'min', 'max'])
df_rolling.columns = ['_'.join(col) for col in df_rolling.columns]
df_rolling['devID'] = int(device)

# Drop windows with any missing statistic. This removes empty windows and also
# windows holding a single sample, because their std is NaN.
# NOTE(review): only the *_mean columns are kept below - confirm that dropping
# single-sample windows is intended.
df_rolling = df_rolling.dropna()
df_rolling = df_rolling.reset_index()

# Hour of day (0-23) of each window start.
df_rolling['hour'] = df_rolling['timestamp'].dt.hour
df_rolling = df_rolling[['timestamp', 'devID', 'soil_mean', 'rain_mean', 'temp_mean', 'humi_mean', 'geo_mean', 'hour']]
df_rolling

  df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std', 'min', 'max'])


Unnamed: 0,timestamp,devID,soil_mean,rain_mean,temp_mean,humi_mean,geo_mean,hour
0,2025-05-02 11:42:00,101,28.839999,0.0,34.233463,63.098955,14.622758,11
1,2025-05-02 11:43:00,101,28.816666,0.0,34.047519,63.319192,10.538904,11
2,2025-05-02 11:44:00,101,28.799999,0.0,34.081788,63.662011,9.419168,11
3,2025-05-02 11:45:00,101,28.799999,0.0,34.297550,63.891277,9.548765,11
4,2025-05-02 11:46:00,101,28.799999,0.0,34.542156,63.798203,10.704843,11
...,...,...,...,...,...,...,...,...
42023,2025-07-22 19:35:00,101,25.600000,0.0,28.643663,74.166607,17.514741,19
42024,2025-07-22 19:36:00,101,25.600000,0.0,28.643216,74.338114,12.892563,19
42025,2025-07-22 19:37:00,101,25.600000,0.0,28.633781,74.290681,10.977404,19
42026,2025-07-22 19:38:00,101,25.600000,0.0,28.649004,74.129890,8.432374,19


In [6]:
# Column dtypes, non-null counts and memory usage of the resampled frame.
df_rolling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42028 entries, 0 to 42027
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  42028 non-null  datetime64[ns]
 1   devID      42028 non-null  int64         
 2   soil_mean  42028 non-null  float64       
 3   rain_mean  42028 non-null  float64       
 4   temp_mean  42028 non-null  float64       
 5   humi_mean  42028 non-null  float64       
 6   geo_mean   42028 non-null  float64       
 7   hour       42028 non-null  int32         
dtypes: datetime64[ns](1), float64(5), int32(1), int64(1)
memory usage: 2.4 MB


In [7]:
# Count missing values per column of the resampled frame and keep only the
# columns that actually contain any.
nan_counts = df_rolling.isnull().sum()
nan_columns = nan_counts.loc[nan_counts.gt(0)]

# Print the header and the affected columns on separate lines
# (an empty Series means nothing is missing).
print("📌 คอลัมน์ที่มีค่า NaN:", nan_columns, sep="\n")

📌 คอลัมน์ที่มีค่า NaN:
Series([], dtype: int64)


In [8]:

# Write the resampled frame to save_path, omitting the positional index.
df_rolling.to_csv(save_path, index=False)

# Confirm where the file was written.
print("✅ บันทึก เรียบร้อยแล้ว", save_path)

✅ บันทึก เรียบร้อยแล้ว ./data/dev101_resample1T.csv
