In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import zscore

csv_path = "./data/Landslide_dataSet2.csv"
device = "101"
save_path = f"./data/dev{device}_resample1T_set2.csv"
#save_path = f"data/devAll2_prepared.csv"

df = pd.read_csv(csv_path)

for col in ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df

#df = df[~df['timestamp'].str.contains(r'\(ICT\)', na=False)]
#df['timestamp'] = df['timestamp'].str.replace(r'\(ICT\)', '', regex=True).str.strip()

def clean_and_format_timestamp(val):
    if pd.isnull(val):
        return None
    val = str(val).replace('(ICT)', '').strip()
    try:
        dt = pd.to_datetime(val)
        return dt.isoformat()
    except Exception:
        return None  

df['timestamp'] = df['timestamp'].apply(clean_and_format_timestamp)

df.columns = df.columns.str.strip()
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df = df.dropna(subset=['timestamp', 'geo'])  # ต้องมี timestamp และ geo


df = df.drop(columns=['unixt', 'gateway time', 'old time', 'risk_level', 'confidence'], errors='ignore')
df = df[df['devID'] != 218499049.0]
df = df[(df['lat'] > 1.0) & (df['lat'] < 8000.0)]
df = df[df['lng'] > 15.0]
df = df[df['soil'] != 0.0]
df = df[df['rain'] != -1.0]

df = df[df['devID'] == int(device)]

In [2]:
#start_date = "2025-07-20 16:30:00"
#end_date = "2025-07-20 18:30:00"
#
#df = df[(df['timestamp'] >= start_date) & (df['timestamp'] <= end_date)]
#print(df['timestamp'].min())
#print(df['timestamp'].max())

In [3]:
df.describe()

Unnamed: 0,timestamp,devID,soil,rain,temp,humi,geo,lat,lng
count,48900,48900.0,48900.0,48900.0,48900.0,48900.0,48900.0,48900.0,48900.0
mean,2025-08-04 06:05:47.606019328,101.0,23.117567,0.065885,30.517649,73.588875,11.287078,8.638025,99.917283
min,2025-07-30 03:28:43.522000,101.0,20.5,0.0,25.934618,46.754253,0.504865,8.637975,99.599998
25%,2025-08-02 15:27:05.390000128,101.0,21.0,0.0,27.721066,68.219879,5.244239,8.638014,99.898964
50%,2025-08-04 09:03:10.540499968,101.0,21.9,0.0,29.45945,77.927483,12.048141,8.638025,99.898972
75%,2025-08-05 23:50:17.276249856,101.0,25.5,0.0,33.304722,80.31266,15.524412,8.638036,99.898979
max,2025-08-07 15:27:13.502000,101.0,37.5,15.367,38.156708,85.592285,32.060558,8.638075,995.616638
std,,0.0,2.563139,0.854072,3.23616,9.541226,5.903769,1.6e-05,4.050576


In [4]:

window = '1T'  # 10 วินาที (10 seconds) - Corrected comment
features = ['soil', 'rain', 'temp', 'humi', 'geo', 'lat', 'lng']

df = df.sort_values('timestamp').reset_index(drop=True)

# Ensure timestamp is datetime and set as index for the base DataFrame used for resampling
if 'timestamp' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = pd.to_datetime(df['timestamp'])

df_resample_base = df.set_index('timestamp')

df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std', 'min', 'max'])
# --- END CORRECTED LINES ---

df_rolling.columns = ['_'.join(col) for col in df_rolling.columns]
df_rolling['devID'] = int(device)
df_rolling = df_rolling.dropna()
df_rolling = df_rolling.reset_index()

df_rolling['hour'] = df_rolling['timestamp'].dt.hour
df_rolling = df_rolling[['timestamp', 'devID', 'soil_mean', 'rain_mean', 'temp_mean', 'humi_mean', 'geo_mean', 'hour']]
df_rolling

  df_rolling = df_resample_base[features].resample(window).agg(['mean', 'std', 'min', 'max'])


Unnamed: 0,timestamp,devID,soil_mean,rain_mean,temp_mean,humi_mean,geo_mean,hour
0,2025-07-30 03:28:00,101,24.414285,0.0,35.535593,54.899300,15.213561,3
1,2025-07-30 03:29:00,101,24.400000,0.0,34.787201,56.469720,13.640666,3
2,2025-07-30 03:30:00,101,24.400000,0.0,34.894142,55.658301,11.142177,3
3,2025-07-30 03:31:00,101,24.411764,0.0,34.682299,56.552902,13.287289,3
4,2025-07-30 03:32:00,101,24.400000,0.0,34.754712,56.353661,14.540755,3
...,...,...,...,...,...,...,...,...
9497,2025-08-07 15:23:00,101,20.700001,0.0,29.345694,81.485698,15.057373,15
9498,2025-08-07 15:24:00,101,20.700001,0.0,29.566932,80.309319,9.233775,15
9499,2025-08-07 15:25:00,101,20.700001,0.0,29.307776,81.511635,13.560988,15
9500,2025-08-07 15:26:00,101,20.700001,0.0,29.329270,81.408924,12.249063,15


In [5]:
df_rolling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9502 entries, 0 to 9501
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  9502 non-null   datetime64[ns]
 1   devID      9502 non-null   int64         
 2   soil_mean  9502 non-null   float64       
 3   rain_mean  9502 non-null   float64       
 4   temp_mean  9502 non-null   float64       
 5   humi_mean  9502 non-null   float64       
 6   geo_mean   9502 non-null   float64       
 7   hour       9502 non-null   int32         
dtypes: datetime64[ns](1), float64(5), int32(1), int64(1)
memory usage: 556.9 KB


In [6]:
nan_counts = df_rolling.isna().sum()

nan_columns = nan_counts[nan_counts > 0]

print("📌 คอลัมน์ที่มีค่า NaN:")
print(nan_columns)

📌 คอลัมน์ที่มีค่า NaN:
Series([], dtype: int64)


In [7]:

df_rolling.to_csv(f"{save_path}", index=False)

print("✅ บันทึก เรียบร้อยแล้ว", save_path)

✅ บันทึก เรียบร้อยแล้ว ./data/dev101_resample1T_set2.csv
