In [24]:
import pandas as pd

# Load the dataset
file_path = 'weatherHistory.csv'  # Pastikan ini adalah jalur file yang benar
df = pd.read_csv(file_path)

# Convert 'Formatted Date' to datetime
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'])

# Sort data by 'Formatted Date' to ensure it's in chronological order
df = df.sort_values(by='Formatted Date')

# Set 10% of the latest data as test data
test_size = 0.1
n_test = int(len(df) * test_size)

# Split the data into train and test sets
train_data = df.iloc[:-n_test]
test_data = df.iloc[-n_test:]

# Check the shape of train and test sets
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Save the train and test data to separate CSV files
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


In [25]:
# Load the dataset
file_path = 'weatherHistory.csv'  # Pastikan jalur ini sesuai dengan lokasi file Anda
df = pd.read_csv(file_path)

# Menampilkan kategori unik di kolom 'Summary' dan 'Precip Type'
summary_categories = df['Summary'].unique()
precip_type_categories = df['Precip Type'].unique()

print("Kategori unik di 'Summary':")
print(summary_categories)

print("\nKategori unik di 'Precip Type':")
print(precip_type_categories)


Kategori unik di 'Summary':
['Partly Cloudy' 'Mostly Cloudy' 'Overcast' 'Foggy'
 'Breezy and Mostly Cloudy' 'Clear' 'Breezy and Partly Cloudy'
 'Breezy and Overcast' 'Humid and Mostly Cloudy' 'Humid and Partly Cloudy'
 'Windy and Foggy' 'Windy and Overcast' 'Breezy and Foggy'
 'Windy and Partly Cloudy' 'Breezy' 'Dry and Partly Cloudy'
 'Windy and Mostly Cloudy' 'Dangerously Windy and Partly Cloudy' 'Dry'
 'Windy' 'Humid and Overcast' 'Light Rain' 'Drizzle' 'Windy and Dry'
 'Dry and Mostly Cloudy' 'Breezy and Dry' 'Rain']

Kategori unik di 'Precip Type':
['rain' 'snow' nan]


In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [27]:
# Membaca data training
data = pd.read_csv('train_data.csv')

# Membaca data testing
test_data = pd.read_csv('test_data.csv')


In [28]:
# Mengonversi kolom 'Formatted Date' ke UTC
data['Formatted Date'] = pd.to_datetime(data['Formatted Date'], utc=True)

# Atau menghapus zona waktu
data['Formatted Date'] = pd.to_datetime(data['Formatted Date']).dt.tz_localize(None)

# Mengekstrak fitur waktu
data['Year'] = data['Formatted Date'].dt.year
data['Month'] = data['Formatted Date'].dt.month
data['Day'] = data['Formatted Date'].dt.day
data['Hour'] = data['Formatted Date'].dt.hour


In [29]:
# Fit label encoder pada data training
label_encoder = LabelEncoder()
label_encoder.fit(data['Summary'])  # Data training

# Tangani label baru di test data
test_data['Summary'] = test_data['Summary'].apply(lambda x: x if x in label_encoder.classes_ else 'Unknown')

# Tambahkan label 'Unknown' ke LabelEncoder (jika perlu)
label_encoder.classes_ = np.append(label_encoder.classes_, 'Unknown')

# Transform test data dengan label encoder yang telah di-fit
test_data['Summary'] = label_encoder.transform(test_data['Summary'])


In [30]:
# Fitur (X) dan Label (y)
X_train = data.drop('Summary', axis=1)
y_train = data['Summary']

X_test = test_data.drop('Summary', axis=1)
y_test = test_data['Summary']

In [31]:
from sklearn.preprocessing import LabelEncoder

# Label encode 'Precip Type'
label_encoder = LabelEncoder()

# Pastikan semua kolom kategorikal sudah diencode
data['Precip Type'] = label_encoder.fit_transform(data['Precip Type'])
test_data['Precip Type'] = label_encoder.transform(test_data['Precip Type'])


In [32]:
# Menghapus kolom datetime jika ada
X_train = X_train.drop(['Formatted Date'], axis=1, errors='ignore')
X_test = X_test.drop(['Formatted Date'], axis=1, errors='ignore')


In [33]:
# One-hot encoding Daily Summary
X_train = pd.get_dummies(X_train, columns=['Daily Summary'])
X_test = pd.get_dummies(X_test, columns=['Daily Summary'])


In [47]:
# Mengecek kolom yang ada di X_train
print(X_train)

# Mengecek kolom yang ada di X_test
print(X_test)


       Precip Type  Temperature (C)  Apparent Temperature (C)  Humidity  \
0                0         0.577778                 -4.050000      0.89   
1                0         1.161111                 -3.238889      0.85   
2                0         1.666667                 -3.155556      0.82   
3                0         1.711111                 -2.194444      0.82   
4                0         1.183333                 -2.744444      0.86   
...            ...              ...                       ...       ...   
86803            0         2.688889                 -0.827778      0.86   
86804            0         2.127778                 -0.655556      0.91   
86805            0         2.127778                 -1.000000      0.89   
86806            0         1.688889                 -1.388889      0.92   
86807            0         1.600000                 -2.261111      0.91   

       Wind Speed (km/h)  Wind Bearing (degrees)  Visibility (km)  Loud Cover  \
0                1

In [50]:
# Mengecek dan menghapus data yang memiliki label baru di y_test
valid_indices = [i for i in range(len(y_test)) if y_test[i] in y_train]
X_test_filtered = X_test.iloc[valid_indices]
y_test_filtered = y_test[valid_indices]

# Melatih model (dengan iterasi yang lebih tinggi)
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

# Evaluasi model dengan data uji yang sudah difilter
y_pred = model.predict(X_test_filtered)
print(classification_report(y_test_filtered, y_pred))
print(confusion_matrix(y_test_filtered, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00        49
           5       0.00      0.00      0.00        15
           6       0.00      0.00      0.00       626
           8       0.00      0.00      0.00        34
          12       0.00      0.00      0.00       739
          13       0.00      0.00      0.00         8
          14       0.00      0.00      0.00         2
          16       0.00      0.52      0.01        46
          17       0.00      0.00      0.00      3507
          18       0.00      0.00      0.00      1687
          19       0.00      0.00      0.00      2911
          25       0.00      0.00      0.00         0

    accuracy                           0.00      9635
   macro avg       0.00      0.03      0.00      9635
weighted avg       0.00   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
