In [36]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [45]:
# Define column names to assign
column_names = ['location', 'time', 'deg_min', 'deg_max', 'hum_min', 'hum_max', 'humidity', 'temperature', 'weather_code', 'wind_direction', 'wind_speed']

# List of the dataset filenames
dataset_files = [
    'kecamatanforecast-jakarta.csv',
    'kecamatanforecast-jambi.csv',
    'kecamatanforecast-jawatengah.csv',
    'kecamatanforecast-jawatimur.csv',
    'kecamatanforecast-sumut.csv'
]

# Read each dataset, assign column names, and drop unwanted columns
datasets = []
for file in dataset_files:
    # Read the CSV file with ";" as the separator
    df = pd.read_csv(file, sep=';', header=None)

    # Assign column names
    df.columns = column_names

    # Split the 'time' column into 'date' and 'time'
    df[['date', 'time']] = df['time'].str.split(' ', expand=True)

    # Reorder the columns to have 'date' first, followed by 'time'
    df = df[['date', 'time'] + [col for col in df.columns if col not in ['date', 'time']]]

    # Drop unnecessary columns
    df = df.drop(columns=['deg_min', 'deg_max', 'hum_min', 'hum_max', 'wind_direction'])

    # Append the cleaned dataset to the list
    datasets.append(df)

# Combine all datasets into one
combined_df = pd.concat(datasets, ignore_index=True)

# Show the updated dataset (optional)
print(combined_df.head())

# Save the combined dataframe to a new file
combined_df.to_csv('cleemate-dataset.csv', sep=';', index=False)


         date      time  location  humidity  temperature  weather_code  \
0  2024-11-25  00:00:00    501191        88           26             4   
1  2024-11-25  01:00:00    501191        81           27             4   
2  2024-11-25  02:00:00    501191        73           29             4   
3  2024-11-25  03:00:00    501191        66           31             4   
4  2024-11-25  04:00:00    501191        63           31             4   

   wind_speed  
0           1  
1           1  
2           1  
3           1  
4           1  


In [46]:
# Load the kecamatanforecast-jawatimur-fix dataset (semicolon-separated)
df_forecast = pd.read_csv('cleemate-dataset.csv', sep=';')

# Load the kecamatan_geofeatures dataset (semicolon-separated)
df_geofeatures = pd.read_csv('kecamatan_geofeatures.csv', sep=';')

# Merge the datasets on the 'location' column to get the corresponding 'kota' and 'provinsi' values
merged_df = pd.merge(df_forecast, df_geofeatures[['location', 'kota', 'provinsi']], on='location', how='left')

# Replace 'location' column with 'kota' column
merged_df['location'] = merged_df['kota']

# Drop the 'kota' column, as we no longer need it
merged_df = merged_df.drop(columns=['kota'])

# Rearrange columns so that 'provinsi' comes after 'location'
columns = ['location', 'provinsi'] + [col for col in merged_df.columns if col not in ['location', 'provinsi']]
merged_df = merged_df[columns]

# Show the updated dataframe (first few rows)
print(merged_df.head())

# Optionally, save the updated dataframe to a new CSV file
merged_df.to_csv('cleemate-dataset-fix.csv', sep=';', index=False)


             location     provinsi        date      time  humidity  \
0  Kota Jakarta Timur  DKI Jakarta  2024-11-25  00:00:00        88   
1  Kota Jakarta Timur  DKI Jakarta  2024-11-25  01:00:00        81   
2  Kota Jakarta Timur  DKI Jakarta  2024-11-25  02:00:00        73   
3  Kota Jakarta Timur  DKI Jakarta  2024-11-25  03:00:00        66   
4  Kota Jakarta Timur  DKI Jakarta  2024-11-25  04:00:00        63   

   temperature  weather_code  wind_speed  
0           26             4           1  
1           27             4           1  
2           29             4           1  
3           31             4           1  
4           31             4           1  


In [47]:
# Load the kecamatanforecast-jawatimur-fix dataset (semicolon-separated)
df_forecast = pd.read_csv('cleemate-dataset-fix.csv', sep=';')

# Load the weather dataset (semicolon-separated)
df_weather = pd.read_csv('weather.csv', sep=';')

# Merge the datasets on the 'weather_code' column to get the corresponding 'weather' values
merged_df = pd.merge(df_forecast, df_weather[['weather_code', 'weather']], on='weather_code', how='left')

# Optionally, you can rename the 'weather' column to a more descriptive name, but do not replace 'weather_code'
# If you want to keep 'weather_code' and 'weather' columns, no need to replace 'weather_code'
# Just ensure the 'weather' column is included in the merged dataset.

# Show the updated dataframe (first few rows)
print(merged_df.head())

# Optionally, save the updated dataframe to a new CSV file
merged_df.to_csv('cleemate-dataset-fix.csv', sep=';', index=False)


             location     provinsi        date      time  humidity  \
0  Kota Jakarta Timur  DKI Jakarta  2024-11-25  00:00:00        88   
1  Kota Jakarta Timur  DKI Jakarta  2024-11-25  01:00:00        81   
2  Kota Jakarta Timur  DKI Jakarta  2024-11-25  02:00:00        73   
3  Kota Jakarta Timur  DKI Jakarta  2024-11-25  03:00:00        66   
4  Kota Jakarta Timur  DKI Jakarta  2024-11-25  04:00:00        63   

   temperature  weather_code  wind_speed        weather  
0           26             4           1  Berawan Tebal  
1           27             4           1  Berawan Tebal  
2           29             4           1  Berawan Tebal  
3           31             4           1  Berawan Tebal  
4           31             4           1  Berawan Tebal  


In [48]:
# Load the dataset (semicolon-separated)
df_forecast = pd.read_csv('cleemate-dataset-fix.csv', sep=';')

# Create necessary columns first (before imputation)
df_forecast['temp_change'] = df_forecast['temperature'].diff().fillna(0)
df_forecast['wind_speed_change'] = df_forecast['wind_speed'].diff().fillna(0)
df_forecast['humidity_change'] = df_forecast['humidity'].diff().fillna(0)

# Create lag features for 1-day lookback (shift the values by 1)
df_forecast['temp_previous_day'] = df_forecast['temperature'].shift(1)
df_forecast['wind_speed_previous_day'] = df_forecast['wind_speed'].shift(1)
df_forecast['humidity_previous_day'] = df_forecast['humidity'].shift(1)

# Rolling averages over the past 5 columns for temperature, wind speed, and humidity
df_forecast['rolling_temp'] = df_forecast['temperature'].rolling(window=5).mean()
df_forecast['rolling_wind'] = df_forecast['wind_speed'].rolling(window=5).mean()
df_forecast['rolling_humidity'] = df_forecast['humidity'].rolling(window=5).mean()

# Handle missing values by imputing (using median or mean)
imputer = SimpleImputer(strategy='median')
df_forecast[['humidity', 'temperature', 'wind_speed', 'temp_change', 'wind_speed_change',
             'humidity_change', 'temp_previous_day', 'wind_speed_previous_day',
             'humidity_previous_day', 'rolling_temp', 'rolling_wind', 'rolling_humidity']] = imputer.fit_transform(
    df_forecast[['humidity', 'temperature', 'wind_speed', 'temp_change', 'wind_speed_change',
                 'humidity_change', 'temp_previous_day', 'wind_speed_previous_day',
                 'humidity_previous_day', 'rolling_temp', 'rolling_wind', 'rolling_humidity']])

# Convert 'weather_code' into one-hot encoded features
weather_encoder = OneHotEncoder(sparse_output=False)
weather_code_encoded = weather_encoder.fit_transform(df_forecast[['weather_code']])

# Create a DataFrame from the one-hot encoded features and join with the original dataframe
weather_code_df = pd.DataFrame(weather_code_encoded, columns=weather_encoder.get_feature_names_out(['weather_code']))
df_forecast = pd.concat([df_forecast, weather_code_df], axis=1)

# Define extreme fluctuation (label = 1 for extreme change, 0 otherwise)
df_forecast['extreme_fluctuation'] = ((df_forecast['temp_change'].abs() > 5) |
                                      (df_forecast['wind_speed_change'].abs() > 10) |
                                      (df_forecast['humidity_change'].abs() > 10)).astype(int)

# Drop rows with missing data due to lag or rolling operations
df_forecast = df_forecast.dropna(subset=['extreme_fluctuation'])

# Features (X) and Target (y)
X = df_forecast[['humidity', 'temperature', 'wind_speed', 'temp_change', 'wind_speed_change',
                 'humidity_change', 'temp_previous_day', 'wind_speed_previous_day',
                 'humidity_previous_day', 'rolling_temp', 'rolling_wind', 'rolling_humidity'] + list(weather_code_df.columns)]
y = df_forecast['extreme_fluctuation']

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Handle class imbalance using SMOTE (oversampling the minority class)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [50]:
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_resampled.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification (0 or 1)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=50, batch_size=64, validation_data=(X_test, y_test))



Epoch 1/50
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.9461 - loss: 0.1254 - val_accuracy: 0.9849 - val_loss: 0.0325
Epoch 2/50
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9887 - loss: 0.0304 - val_accuracy: 0.9958 - val_loss: 0.0117
Epoch 3/50
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9925 - loss: 0.0196 - val_accuracy: 0.9938 - val_loss: 0.0140
Epoch 4/50
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9941 - loss: 0.0159 - val_accuracy: 0.9975 - val_loss: 0.0066
Epoch 5/50
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9952 - loss: 0.0128 - val_accuracy: 0.9958 - val_loss: 0.0112
Epoch 6/50
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - accuracy: 0.9959 - loss: 0.0113 - val_accuracy: 0.9962 - val_loss: 0.0103
Epoch 7/50

In [51]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

y_pred = (model.predict(X_test) > 0.5).astype(int)

print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.9993 - loss: 0.0013
Test Accuracy: 99.94%
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Precision: 0.9961923411662315
Recall: 0.9994542676271556
F1 Score: 0.997820638552904
[[56008    35]
 [    5  9157]]


In [53]:
# Save the model to the runtime (local file system)
model.save('/content/cleemate-model.h5')

