In [6]:
import pandas as pd

# Read the comma-separated dataset into a DataFrame and print its first rows
# as a quick sanity check of the columns that were parsed.
csv_path = '../data/air.csv'
data = pd.read_csv(csv_path, sep=',')
print(data.head())

     datetime  datetimeEpoch  tempmax  tempmin  temp  feelslikemax  \
0  2024-09-07   1.725692e+09    106.1     91.0  98.5         104.0   
1  2024-09-08   1.725779e+09    103.9     87.0  95.4         100.5   
2  2024-09-09   1.725865e+09    105.0     83.9  94.7          99.9   
3  2024-09-10   1.725952e+09    106.1     81.2  93.9         100.6   
4  2024-09-11   1.726038e+09    106.1     82.1  94.0         101.0   

   feelslikemin  feelslike   dew  humidity  ...     City  Temp_Range  \
0          88.1       95.9  51.5      21.0  ...  Phoenix        15.1   
1          84.7       92.3  48.7      21.5  ...  Phoenix        16.9   
2          81.6       90.6  41.7      16.9  ...  Phoenix        21.1   
3          79.5       89.8  39.1      15.7  ...  Phoenix        24.9   
4          80.0       90.0  40.1      15.9  ...  Phoenix        24.0   

   Heat_Index Severity_Score  Condition_Code  Month  Season  Day_of_Week  \
0   95.918703         4.4300             NaN    9.0    Fall     Saturd

In [14]:
# Step 1: Prepare the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# `data` is expected to be the DataFrame already loaded in an earlier cell.
# Binary target: 1 when the row's `conditions` value is exactly 'Clear',
# 0 for every other value.
data['Is_clear'] = data['conditions'].eq('Clear').astype(int)

# Predictor columns fed to the models (edit this list to try other features).
features = [
    'tempmax',
    'tempmin',
    'humidity',
    'Heat_Index',
]
X = data.loc[:, features]
y = data['Is_clear']

# Step 2: Hold out 20% of the rows for testing; the fixed seed keeps the
# partition identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 3: Train classifiers
# Each estimator is fitted on the training split and then used to label the
# held-out rows. `fit` returns the estimator itself, so construction and
# fitting are chained into one expression.

# Logistic Regression (library defaults)
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Random Forest Classifier
# Seeded so the forest's internal randomness -- and therefore the reported
# scores -- are the same on every run, in line with the seeded train/test split.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Support Vector Classifier (SVC, library defaults)
svc_clf = SVC().fit(X_train, y_train)
y_pred_svc = svc_clf.predict(X_test)

# Step 4: Evaluate models
# For each model, print a heading followed by the classification report
# comparing its test-set predictions with the true labels.
model_predictions = (
    ("Logistic Regression Performance", y_pred_log_reg),
    ("Random Forest Performance", y_pred_rf),
    ("SVC Performance", y_pred_svc),
)
for report_heading, test_predictions in model_predictions:
    print(report_heading)
    print(classification_report(y_test, test_predictions))


Logistic Regression Performance
              precision    recall  f1-score   support

           0       0.77      0.70      0.73        87
           1       0.79      0.84      0.81       113

    accuracy                           0.78       200
   macro avg       0.78      0.77      0.77       200
weighted avg       0.78      0.78      0.78       200

Random Forest Performance
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        87
           1       0.88      0.93      0.91       113

    accuracy                           0.89       200
   macro avg       0.89      0.88      0.89       200
weighted avg       0.89      0.89      0.89       200

SVC Performance
              precision    recall  f1-score   support

           0       0.77      0.62      0.69        87
           1       0.75      0.86      0.80       113

    accuracy                           0.76       200
   macro avg       0.76      0.74      0.74       200
