#### Preparasi Data dan Pembangunan Model

In [109]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Compass abbreviation (two characters wide, as stored in the CSV) -> angle in degrees.
# NOTE(review): these angles run counter-clockwise starting from East (E=0, N=90).
# Confirm that the numeric `ddd_x` column uses the same convention before the two
# direction features are compared or combined.
winDict = {'E ' : 0, 'NE' : 45, 'N ' : 90, 'NW' : 135, 'W ' : 180, 'SW' : 225, 'S ' : 270, 'SE' : 315}

# Data preparation
# Data source : https://www.kaggle.com/datasets/greegtitan/indonesia-climate?select=climate_data.csv
climate_raw = pd.read_csv("climate_data/climate_data.csv")

# Keep a single station (id 96295) and discard rows whose wind-direction code is 'C ',
# which has no entry in winDict (presumably "calm" -- TODO confirm against the dataset docs).
keep_rows = (climate_raw['station_id'] == 96295) & (climate_raw['ddd_car'] != 'C ')
dtf = climate_raw[keep_rows].dropna()       # Drop data points that have a NaN in any column

# Fail early, with a readable message, if a direction code cannot be converted to degrees.
unknown_codes = dtf.loc[~dtf['ddd_car'].isin(list(winDict)), 'ddd_car'].unique()
if len(unknown_codes) > 0:
    raise ValueError(f"Unmapped wind direction codes in 'ddd_car': {list(unknown_codes)}")

# Series.map builds a numeric column directly, instead of relying on the implicit
# object -> int downcast performed by DataFrame.replace (deprecated in recent pandas).
dtf = dtf.assign(
    ddd_car=dtf['ddd_car'].map(winDict),    # Wind direction code -> degrees
    isRain=dtf['RR'] > 0.0,                 # Boolean target: True when RR is positive
)

features = dtf[['Tn', 'Tx', 'Tavg', 'RH_avg', 'ss', 'ff_x', 'ff_avg', 'ddd_x', 'ddd_car']].values
targets = dtf['isRain'].values

# Split the dataset into a training part and a testing part (fixed seed for reproducibility)
training_ftr, test_ftr, training_tgt, test_tgt = train_test_split(features, targets, random_state=229)

print(f"Jumlah total data : {dtf.shape[0]}")
print(dtf.head())

Jumlah total data : 2172
              date    Tn    Tx  Tavg  RH_avg    RR   ss  ff_x  ddd_x  ff_avg  \
150608  03-01-2010  24.0  32.2  27.5    81.0  28.2  6.1   4.0  230.0     2.0   
150609  04-01-2010  25.0  33.6  28.3    76.0   0.0  3.2   4.0  240.0     1.0   
150610  05-01-2010  24.0  31.8  26.5    84.0   0.0  0.0   4.0  250.0     0.0   
150611  06-01-2010  23.0  31.4  26.4    83.0   5.0  0.6   3.0  230.0     0.0   
150612  07-01-2010  24.0  32.6  27.8    71.0   0.0  3.1   4.0  150.0     2.0   

        ddd_car  station_id  isRain  
150608      180       96295    True  
150609       90       96295   False  
150610      180       96295   False  
150611      225       96295    True  
150612       90       96295   False  


In [123]:
# Fit a logistic-regression classifier on the training split
print("Initializing...")
model = LogisticRegression(max_iter=800).fit(training_ftr, training_tgt)
print("Done!")

# Example prediction: take the first test sample and read column 1 of predict_proba,
# i.e. the probability assigned to the second entry of model.classes_.
first_sample = test_ftr[0]
sample_probability = model.predict_proba([first_sample])[:, 1]

print("\nContoh Hasil Prediksi :")
print("Data     : ", first_sample)
print("Prediksi : ", sample_probability)

Initializing...
Done!

Contoh Hasil Prediksi :
Data     :  [ 23.   32.4  26.6  80.    6.9   6.    2.  120.  315. ]
Prediksi :  [0.28291101]


#### Testing dan Evaluasi

In [124]:
# Predict a class label for every sample of the held-out test split
prediction_results = model.predict(test_ftr)

# Summary metrics, each comparing the true labels with the predicted labels
print("accuracy  : ", accuracy_score(test_tgt, prediction_results))
print("precision : ", precision_score(test_tgt, prediction_results))
print("recall    : ", recall_score(test_tgt, prediction_results))
print("f1 score  : ", f1_score(test_tgt, prediction_results))

# Rows are true labels and columns are predicted labels. Passing labels=[True, False]
# puts the positive class first, so the layout is [[TP, FN], [FP, TN]]. This replaces
# a manual in-place reversal of the rows and columns of the default matrix.
con_mat = confusion_matrix(test_tgt, prediction_results, labels=[True, False])

print("\nconfusion matrix :")
print(con_mat)

accuracy  :  0.7476979742173112
precision :  0.7589285714285714
recall    :  0.819935691318328
f1 score  :  0.7882534775888718

confusion matrix :
[[255  56]
 [ 81 151]]


In [125]:
# Show the fitted weight array followed by the intercept of the trained model
print("\nModel coefficients :")
weights, bias = model.coef_, model.intercept_
print(weights, bias)


Model coefficients :
[[ 0.0548035  -0.13336999  0.28970931  0.18437431 -0.01488661  0.0110005
  -0.32888764  0.00223563 -0.0040738 ]] [-18.61611065]
