In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Read the raw weather observations
file_path = 'weatherAUS.csv'  # must match the name of the uploaded file
data = pd.read_csv(file_path)

# Discard the listed columns, then every row that has no RainTomorrow label
data = (
    data
    .drop(columns=['Date', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'])
    .dropna(subset=['RainTomorrow'])
)

# Numerical gaps: replace NaNs in the float64 columns with the column mean
num_imputer = SimpleImputer(strategy='mean')
float_columns = data.select_dtypes(include=['float64']).columns
data[float_columns] = num_imputer.fit_transform(data[float_columns])

# Categorical gaps: replace NaNs in the object columns with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
object_columns = data.select_dtypes(include=['object']).columns
data[object_columns] = cat_imputer.fit_transform(data[object_columns])

# Turn every remaining object column into integer codes.
# The same LabelEncoder instance is re-fitted per column, so after the loop it
# only remembers the classes of the last column it saw.
label_encoder = LabelEncoder()
columns_to_encode = data.select_dtypes(include=['object']).columns
for col in columns_to_encode:
    data[col] = label_encoder.fit_transform(data[col])

# Features are everything except the label; the label is cast to int
y = data['RainTomorrow'].astype(int)
X = data.drop(columns=['RainTomorrow'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize: statistics are learned on the training part only and then
# applied to the test part. Both become NumPy arrays from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Cell output: shapes of both splits and the class balance of each
(
    X_train.shape,
    X_test.shape,
    y_train.value_counts(),
    y_test.value_counts(),
)


((99535, 17),
 (42658, 17),
 RainTomorrow
 0    77183
 1    22352
 Name: count, dtype: int64,
 RainTomorrow
 0    33133
 1     9525
 Name: count, dtype: int64)

In [2]:
# Report the size of each split and the class balance of its labels
split_summary = (
    ("X_train shape:", X_train.shape),
    ("X_test shape:", X_test.shape),
    ("y_train distribution:\n", y_train.value_counts()),
    ("y_test distribution:\n", y_test.value_counts()),
)
for caption, summary_value in split_summary:
    print(caption, summary_value)


X_train shape: (99535, 17)
X_test shape: (42658, 17)
y_train distribution:
 RainTomorrow
0    77183
1    22352
Name: count, dtype: int64
y_test distribution:
 RainTomorrow
0    33133
1     9525
Name: count, dtype: int64


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Baseline: a random forest fitted on the training split as-is (no resampling,
# no class weights). fit() returns the estimator itself, so it can be chained.
baseline_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1 for AUC
y_pred_baseline = baseline_model.predict(X_test)
y_pred_proba_baseline = baseline_model.predict_proba(X_test)[:, 1]

# Score the predictions against the held-out labels
accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
f1_baseline = f1_score(y_test, y_pred_baseline)
auc_baseline = roc_auc_score(y_test, y_pred_proba_baseline)

print("Baseline Model Performance:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy_baseline),
    ("F1 Score:", f1_baseline),
    ("AUC:", auc_baseline),
):
    print(metric_label, metric_value)


Baseline Model Performance:
Accuracy: 0.8529466922968728
F1 Score: 0.6056453133840448
AUC: 0.876140384815101


In [22]:
!pip install imbalanced-learn




In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.model_selection import train_test_split

# Separate features and target variable (`data` is the imputed, label-encoded
# frame built in the preprocessing cell, which must be run first)
X = data.drop(columns=['RainTomorrow'])
y = data['RainTomorrow'].astype(int)  # integer class labels

# Split the dataset into training and testing sets.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the earlier
# 70/30 standardized split with an 80/20 split of the unscaled features.
# NOTE(review): SMOTE and Tomek links are neighbour-based, so they may be
# sensitive to the missing scaling here -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Function to train and evaluate the model
def train_and_evaluate(X_train, y_train, X_test, y_test, sampler=None, class_weight=None):
    """Fit a random forest on (optionally resampled) training data and score it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels; these are never resampled.
    sampler : optional
        Object exposing ``fit_resample(X, y)``. When given, it is applied to
        the training data only before fitting.
    class_weight : optional
        Forwarded unchanged to ``RandomForestClassifier(class_weight=...)``.

    Returns
    -------
    tuple
        ``(accuracy, f1, auc)`` on the test data. AUC is computed from
        column 1 of ``predict_proba``.
    """
    # Define model with optional class weights
    model = RandomForestClassifier(random_state=42, class_weight=class_weight)

    # Resample the training data only; the test data stays untouched so the
    # metrics reflect the original class distribution.
    if sampler is not None:
        X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    else:
        X_train_res, y_train_res = X_train, y_train

    # Train the model
    model.fit(X_train_res, y_train_res)

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    return accuracy, f1, auc


# One (accuracy, f1, auc) tuple per strategy, keyed by strategy name.
# Must exist before the first assignment below.
results = {}

# Base model without sampling
results['No Sampling'] = train_and_evaluate(X_train, y_train, X_test, y_test)

# Random Oversampling
oversampler = RandomOverSampler(random_state=42)
results['Random Oversampling'] = train_and_evaluate(X_train, y_train, X_test, y_test, sampler=oversampler)

# Random Undersampling
undersampler = RandomUnderSampler(random_state=42)
results['Random Undersampling'] = train_and_evaluate(X_train, y_train, X_test, y_test, sampler=undersampler)

# SMOTE
smote_sampler = SMOTE(random_state=42)
results['SMOTE'] = train_and_evaluate(X_train, y_train, X_test, y_test, sampler=smote_sampler)

# Tomek Links
tomek_sampler = TomekLinks()
results['Tomek Links'] = train_and_evaluate(X_train, y_train, X_test, y_test, sampler=tomek_sampler)

# Class Weighing
results['Class Weighing'] = train_and_evaluate(X_train, y_train, X_test, y_test, class_weight='balanced')

# Display results: one row per strategy, one column per metric
results_df = pd.DataFrame.from_dict(
    results, orient='index', columns=['Accuracy', 'F1 Score', 'AUC']
)
results_df


NameError: name 'data' is not defined