In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Load the dataset
data = pd.read_csv('weatherAUS.csv')

# Handling missing values
# Fill numeric columns with mean values
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Fill non-numeric columns with the most frequent value
non_numeric_cols = data.select_dtypes(include=['object']).columns
for column in non_numeric_cols:
    data[column] = data[column].fillna(data[column].mode()[0])

# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    if column not in ['Date', 'Location', 'RainTomorrow']:
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Encode 'RainTomorrow' as binary
data['RainTomorrow'] = data['RainTomorrow'].apply(lambda x: 1 if x == 'Yes' else 0)

# Feature Engineering: Extract year, month, day from 'Date'
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data = data.drop(columns=['Date'])

# Encode the 'Location' column
location_encoder = LabelEncoder()
data['Location'] = location_encoder.fit_transform(data['Location'])

# Define features and target
X = data.drop(columns=['RainTomorrow'])
y = data['RainTomorrow']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model and hyperparameters
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

# Predict on the entire dataset (optional, if you want predictions for all rows)
all_predictions = best_model.predict(scaler.transform(X))

# Add predictions to the dataset
data['RainTomorrow_Prediction'] = all_predictions

# Save the results to a CSV file
data.to_csv('weather_predictions.csv', index=False)
print("Predictions saved to 'weather_predictions.csv'")

# Display a random sample of the predictions (optional)
print(data.sample(10))


Accuracy: 1.0000
F1-score: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22098
           1       1.00      1.00      1.00      6341

    accuracy                           1.00     28439
   macro avg       1.00      1.00      1.00     28439
weighted avg       1.00      1.00      1.00     28439

Confusion Matrix:
[[22098     0]
 [    0  6341]]
Predictions saved to 'weather_predictions.csv'
        Location  MinTemp    MaxTemp  Rainfall  Evaporation   Sunshine  \
79264         12      7.0  14.400000       4.8     4.800000   6.000000   
24129         30      9.6  17.800000       0.0     5.469824   7.624853   
108677         1      5.7  23.226784      15.0     5.469824   7.624853   
87059          8     22.8  32.300000       0.0     9.400000  11.300000   
129060        15      7.8  23.700000       0.0     1.200000   5.800000   
22269         27     15.6  20.400000       0.0     5.000000   3.100000   
10