In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import zscore

csv_path = "./data/Landslide3.csv"
device = "101"
save_path = f"./data/dev{device}_resample10S.csv"
#save_path = f"data/devAll2_prepared.csv"

df = pd.read_csv(csv_path)

for col in ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df

#df = df[~df['timestamp'].str.contains(r'\(ICT\)', na=False)]
#df['timestamp'] = df['timestamp'].str.replace(r'\(ICT\)', '', regex=True).str.strip()

def clean_and_format_timestamp(val):
    if pd.isnull(val):
        return None
    val = str(val).replace('(ICT)', '').strip()
    try:
        dt = pd.to_datetime(val)
        return dt.isoformat()
    except Exception:
        return None  

df['timestamp'] = df['timestamp'].apply(clean_and_format_timestamp)

df.columns = df.columns.str.strip()
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df = df.dropna(subset=['timestamp', 'geo'])  # ต้องมี timestamp และ geo


df = df.drop(columns=['unixt', 'gateway time', 'old time', 'risk_level', 'confidence'], errors='ignore')
df = df[df['devID'] != 218499049.0]
df = df[(df['lat'] > 1.0) & (df['lat'] < 8000.0)]
df = df[df['lng'] > 15.0]
df = df[df['soil'] != 0.0]
df = df[df['rain'] != -1.0]

df = df[df['devID'] == int(device)]

  df = pd.read_csv(csv_path)


In [63]:
start_date = "2025-07-20 16:30:00"
end_date = "2025-07-20 18:30:00"

df = df[(df['timestamp'] >= start_date) & (df['timestamp'] <= end_date)]
print(df['timestamp'].min())
print(df['timestamp'].max())

2025-07-20 16:30:01.590000
2025-07-20 18:29:22.929000


In [64]:
df.describe()

Unnamed: 0,timestamp,devID,soil,rain,temp,humi,geo,lat,lng
count,289,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0
mean,2025-07-20 17:29:41.123065856,101.0,33.266782,0.0,26.52813,78.262385,12.508778,8.638012,99.898959
min,2025-07-20 16:30:01.590000,101.0,32.799999,0.0,25.571447,72.278778,0.534691,8.637995,99.898933
25%,2025-07-20 16:58:09.808999936,101.0,33.0,0.0,26.158922,76.675285,8.717299,8.638003,99.898941
50%,2025-07-20 17:32:41.072000,101.0,33.099998,0.0,26.660946,77.6614,12.460935,8.638013,99.898964
75%,2025-07-20 17:58:39.852999936,101.0,33.5,0.0,26.869228,79.896851,15.756296,8.638019,99.898972
max,2025-07-20 18:29:22.929000,101.0,34.099998,0.0,27.454033,83.240486,32.048397,8.638028,99.898979
std,,0.0,0.304601,0.0,0.419896,2.08207,7.244321,9e-06,1.3e-05


In [None]:

window = '1T'  # 10 วินาที (10 seconds) - Corrected comment
features = ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']

df = df.sort_values('timestamp').reset_index(drop=True)

# Ensure timestamp is datetime and set as index for the base DataFrame used for resampling
if 'timestamp' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = pd.to_datetime(df['timestamp'])

df_resample_base = df.set_index('timestamp')

df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std', 'min', 'max'])
# --- END CORRECTED LINES ---

df_rolling.columns = ['_'.join(col) for col in df_rolling.columns]
df_rolling['devID'] = int(device)
df_rolling = df_rolling.dropna()

df_rolling = df_rolling.reset_index()
df_rolling = df_rolling[['timestamp', 'devID', 'soil_mean', 'rain_mean', 'temp_mean', 'humi_mean', 'geo_mean']]
df_rolling

  df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std', 'min', 'max'])


Unnamed: 0,timestamp,devID,soil_mean,rain_mean,temp_mean,humi_mean,geo_mean


In [66]:
df_rolling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  0 non-null      datetime64[ns]
 1   devID      0 non-null      int64         
 2   soil_mean  0 non-null      float64       
 3   rain_mean  0 non-null      float64       
 4   temp_mean  0 non-null      float64       
 5   humi_mean  0 non-null      float64       
 6   geo_mean   0 non-null      float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 124.0 bytes


In [67]:
nan_counts = df_rolling.isna().sum()

nan_columns = nan_counts[nan_counts > 0]

print("📌 คอลัมน์ที่มีค่า NaN:")
print(nan_columns)

📌 คอลัมน์ที่มีค่า NaN:
Series([], dtype: int64)


In [68]:

df_rolling.to_csv(f"{save_path}", index=False)

print("✅ บันทึก เรียบร้อยแล้ว", save_path)

✅ บันทึก เรียบร้อยแล้ว ./data/dev101_resample10S.csv
