In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Classifier 1: Logistic Model

file_path = "data_final.csv"
df = pd.read_csv(file_path)

# Ensure the variable "label" is binary (1: Cloud, 0: no cloud)
df['Label'] = df['Label'].apply(lambda x: 0 if x == -1 else 1)

X = df.drop(columns=['Label'])
Y = df['Label']

# Split dataset, 80% of them used for training and 20% of them used for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build logistic regression model, make prediction, and check accuracy
log_model = LogisticRegression()
log_model.fit(X_train_scaled, Y_train)
Y_pred_log = log_model.predict(X_test_scaled)
accuracy_log = accuracy_score(Y_test, Y_pred_log)

print(f"Logistic Model Accuracy: {accuracy_log:.2f}")
print("Classification Report:\n", classification_report(Y_test, Y_pred_log))


Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     25494
           1       0.95      0.93      0.94     16043

    accuracy                           0.96     41537
   macro avg       0.95      0.95      0.95     41537
weighted avg       0.96      0.96      0.96     41537



In [9]:
# Classifier 2: Decision Tree

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Build decision tree classifier, make prediction, and check accuracy
decitree = DecisionTreeClassifier(criterion='gini', random_state=42)
decitree.fit(X_train_scaled, Y_train)
Y_pred_dec = decitree.predict(X_test_scaled)
accuracy_dec = accuracy_score(Y_test, Y_pred_dec)

print(f"Decision Tree Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(Y_test, Y_pred_dec))

# Tuning Hyperparameter and search for the best parameter for the decision tree
param_grid = {
    'max_depth': [2, 5, 10, None],   
    'min_samples_split': [2, 5, 10] 
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, Y_train)

best_score = grid_search.best_score_

print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best Accuracy Score: {best_score:.2f}")


Decision Tree Model Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     25494
           1       0.97      0.97      0.97     16043

    accuracy                           0.98     41537
   macro avg       0.98      0.98      0.98     41537
weighted avg       0.98      0.98      0.98     41537

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2}
Best Accuracy Score: 0.9767551049809378


In [16]:
# Classifier 3: Random Forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)  
rf.fit(X_train_scaled, Y_train)

Y_pred_rf = rf.predict(X_test_scaled)
Y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(Y_test, Y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(Y_test, Y_pred_rf))

Model Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.97     25494
           1       0.92      0.99      0.95     16043

    accuracy                           0.96     41537
   macro avg       0.95      0.96      0.96     41537
weighted avg       0.96      0.96      0.96     41537

