# MILESTONE 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import pickle

In [None]:

# Load the raw FWI dataset and take a first look at it.
df = pd.read_csv("FWI Dataset.csv")
print(df)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df.head())
print(df.tail())

In [None]:
# Replace the Region labels with integer category codes, when the column exists.
if 'Region' in df.columns:
    print("Encoding Region column...")
    region_codes = df['Region'].astype('category').cat.codes
    # cat.codes uses -1 as the marker for missing values. Turn it back into
    # NaN so missing regions still show up in isnull() and can be imputed
    # later, instead of being treated as a real category. When nothing is
    # missing the codes are stored unchanged.
    df['Region'] = region_codes.where(region_codes != -1)

In [None]:
# Keep only the 64-bit integer and float columns for numeric analysis.
wanted_dtypes = ['int64', 'float64']
numeric_df = df.select_dtypes(include=wanted_dtypes)

In [None]:
# Report the size and the raw column labels of the loaded frame.
frame_shape = df.shape
column_labels = df.columns
print("Dataset shape:", frame_shape)
print("Original columns:", column_labels)


In [None]:
# Count missing entries per column before any cleaning is applied.
missing_per_column = df.isnull().sum()
print("Missing values before cleaning:")
print(missing_per_column)

In [None]:
# Show every row that has at least one missing entry.
has_missing = df.isnull().any(axis=1)
print("Rows containing missing values:")
print(df[has_missing])

In [None]:
# Remove leading/trailing whitespace from the column labels.
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels
print("Column names after stripping spaces:")
print(df.columns)

In [None]:
print("Cleaning string columns")
# Trim surrounding whitespace from the values of every object-typed column.
for col in df.columns:
    if df[col].dtype == 'object':
        stripped = df[col].astype(str).str.strip()
        # astype(str) turns NaN into the literal text "nan", which would hide
        # missing values from isnull()/fillna(). Keep the stripped text only
        # where a value was present and restore NaN everywhere else.
        df[col] = stripped.where(df[col].notna())


In [None]:
print("Fixing corrupted numeric entries")
# The repair below keeps only the first space-separated token of each value.
# That suits numeric fields stored as text, but it would truncate label
# columns whose values may contain spaces, so those are skipped.
label_cols = {'Region', 'Classes'}
for col in df.columns:
    if col in label_cols or df[col].dtype != 'object':
        continue
    # Collapse doubled spaces first so the split below sees single separators.
    df[col] = df[col].str.replace("  ", " ")
    # na=False: missing entries count as "no space" instead of propagating NaN
    # into the any() check.
    if df[col].str.contains(" ", na=False).any():
        df[col] = df[col].str.split(" ").str[0]

In [None]:
# Columns that must hold numbers; values that cannot be parsed become NaN.
numeric_cols = ['Temperature','RH','Ws','Rain','FFMC','DMC','DC','ISI','BUI','FWI']
converted = {
    name: pd.to_numeric(df[name], errors='coerce')
    for name in numeric_cols
}
for name, values in converted.items():
    df[name] = values

In [None]:
print("Filling missing values with mode")
# Impute each of the two label columns with its most frequent value.
for label_col in ('Region', 'Classes'):
    most_common = df[label_col].mode()[0]
    df[label_col] = df[label_col].fillna(most_common)


In [None]:
print("Encoding categorical columns")
# Fit the encoder on Region, then store the resulting integer codes in a new
# column so the original Region values stay available.
le_region = LabelEncoder().fit(df['Region'])
df['Region_encoded'] = le_region.transform(df['Region'])

In [None]:
# Separate encoder for the Classes labels; codes go to Classes_encoded.
le_class = LabelEncoder().fit(df['Classes'])
df['Classes_encoded'] = le_class.transform(df['Classes'])

In [None]:
# Re-count missing entries per column now that cleaning has run.
missing_after = df.isnull().sum()
print("Missing values after cleaning:")
print(missing_after)

In [None]:
# Report the (rows, columns) size of the cleaned frame.
final_shape = df.shape
print("Final dataset shape:")
print(final_shape)

In [None]:
print("Label encoding non-numeric columns")
from sklearn.preprocessing import LabelEncoder

# Work on a copy so df keeps its text columns untouched.
df_encoded = df.copy()
label_encoders = {}

# Every object-typed column gets its own encoder, kept for later lookup.
text_columns = [
    name for name in df_encoded.columns
    if df_encoded[name].dtype == 'object'
]
for name in text_columns:
    encoder = LabelEncoder()
    as_text = df_encoded[name].astype(str)
    df_encoded[name] = encoder.fit_transform(as_text)
    label_encoders[name] = encoder

print("Selecting all numeric columns (including encoded)")
numeric_df = df_encoded.select_dtypes(include=['int64', 'float64'])

print("Plotting correlation heatmap for all numeric features")
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations of the numeric columns, annotated to two decimals.
correlations = numeric_df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlations, annot=True, fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


In [None]:
print("Plotting histograms for all numeric columns")
# Rebuild the numeric view from df (not from the encoded copy).
numeric_df = df.select_dtypes(include=['int64', 'float64'])

hist_options = {"figsize": (15, 12), "bins": 30}
numeric_df.hist(**hist_options)
plt.tight_layout()
plt.show()


In [None]:
print("Plotting correlation heatmap")
# Correlations of the current numeric view, drawn with a diverging palette.
corr_matrix = numeric_df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
print("Plotting density distribution plots (single frame)")

cols = numeric_df.columns
n_cols = 3
# Ceiling division: enough rows to place every column on an n_cols-wide grid.
n_rows = -(-len(cols) // n_cols)

plt.figure(figsize=(15, 5 * n_rows))

# One filled KDE curve per numeric column; subplot slots are 1-based.
for offset, name in enumerate(cols):
    plt.subplot(n_rows, n_cols, offset + 1)
    sns.kdeplot(numeric_df[name], fill=True)
    plt.title(name)

plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

print("Plotting boxplots for outlier detection (single frame)")

# Fall back to the saved cleaned file when this cell runs without df defined.
if 'df' not in globals():
    df = pd.read_csv("FWI Cleaned.csv")

numeric_df = df.select_dtypes(include=[np.number])

cols = numeric_df.columns
n_cols = 3
# Round up so a partially filled last row still gets its own grid row.
n_rows = (len(cols) + n_cols - 1) // n_cols

plt.figure(figsize=(15, 5 * n_rows))

# One horizontal boxplot per numeric column; subplot slots are 1-based.
for slot, name in zip(range(1, len(cols) + 1), cols):
    plt.subplot(n_rows, n_cols, slot)
    sns.boxplot(x=numeric_df[name])
    plt.title(name)

plt.tight_layout()
plt.show()


In [None]:
print("Performing outlier treatment using IQR method")

# Integer codes of categorical columns are labels, not measurements. Clipping
# them to IQR fences could merge distinct categories (e.g. when one category
# dominates and the IQR is 0), so these columns are left untouched.
categorical_code_cols = {'Region', 'Region_encoded', 'Classes_encoded'}

for col in numeric_df.columns:
    if col in categorical_code_cols:
        continue
    # Fences at 1.5 * IQR beyond the quartiles; values outside are clipped
    # to the nearest fence rather than dropped.
    Q1 = numeric_df[col].quantile(0.25)
    Q3 = numeric_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower, upper)

print("Outlier treatment completed.")


In [None]:
print("Checking feature consistency")
# Per-column missing counts, followed by the dtype of each column.
remaining_missing = df.isnull().sum()
column_types = df.dtypes
print(remaining_missing)
print(column_types)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

print("Plotting scatterplots for feature relationships")

# Reload the saved cleaned file if df is not defined in this session.
try:
    df
except NameError:
    df = pd.read_csv("FWI Cleaned.csv")

# (feature column, panel title) for each of the three side-by-side panels.
panels = (
    ("Temperature", "Temperature vs FWI"),
    ("Ws", "Wind Speed (Ws) vs FWI"),
    ("RH", "Relative Humidity (RH) vs FWI"),
)

plt.figure(figsize=(15, 4))

for slot, (feature, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    sns.scatterplot(x=df[feature], y=df['FWI'])
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
print("Head and shape of final cleaned dataset:")
# First rows, then the (rows, columns) size.
for view in (df.head(), df.shape):
    print(view)

In [None]:
# Persist the cleaned frame; later milestones read this file back by name.
output_path = "FWI Cleaned.csv"
df.to_csv(output_path, index=False)
# Report the file that was actually written (the old message named a
# different file than the one saved).
print(f"Saved {output_path}")

# MILESTONE 2

In [None]:
# Start the modelling milestone from the cleaned file written earlier.
cleaned_path = "FWI Cleaned.csv"
df = pd.read_csv(cleaned_path)

print("Dataset loaded successfully")
print("Shape:", df.shape)

df.head()

In [None]:
# Keep only the rows where the FWI column has a value.
fwi_present = df["FWI"].notna()
df = df[fwi_present]

print("After removing missing FWI values:")
print("Shape:", df.shape)

In [None]:
# FWI is the regression target; the nine columns below are its predictors.
target = "FWI"
features = "Temperature RH Ws Rain FFMC DMC DC ISI BUI".split()

X, y = df[features], df[target]

print("Selected Features:", features)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

In [None]:
# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed")

In [None]:
# Persist the fitted scaler so later cells can reload it from disk.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("scaler.pkl saved successfully")

In [None]:
# Candidate linear models, all with their default hyperparameters.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso()
}

results = []

# Fit each candidate on the scaled training data and score it on the test split.
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    mae_value = mean_absolute_error(y_test, y_pred)
    rmse_value = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_value = r2_score(y_test, y_pred)

    row = {
        "Model": name,
        "MAE": mae_value,
        "RMSE": rmse_value,
        "R2 Score": r2_value,
    }
    results.append(row)

results_df = pd.DataFrame(results)
results_df

In [None]:
# Best R2 first.
results_df.sort_values("R2 Score", ascending=False)

In [None]:
# Search the Ridge regularisation strength with 5-fold CV, scored by R2.
param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}
ridge = Ridge()

grid = GridSearchCV(ridge, param_grid, cv=5, scoring="r2")
grid.fit(X_train_scaled, y_train)

print("Best Alpha:", grid.best_params_)
print("Best Cross-Validated R2 Score:", grid.best_score_)


In [None]:
# Take the best estimator from the grid search and score it on the test split.
best_ridge = grid.best_estimator_

y_pred = best_ridge.predict(X_test_scaled)

print("Final Ridge Regression Performance:")
final_metrics = (
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R2 Score:", r2_score(y_test, y_pred)),
)
for label, value in final_metrics:
    print(label, value)


In [None]:
# Persist the tuned model next to the scaler.
with open("ridge.pkl", "wb") as model_file:
    pickle.dump(best_ridge, model_file)

print("ridge.pkl saved successfully")

In [None]:
# Written rationale for the model choice, printed as one sentence.
justification = " ".join([
    "Ridge Regression was selected as the final model because it",
    "handles multicollinearity among correlated weather features",
    "and demonstrated better generalization performance during",
    "cross-validation.",
])
print(justification)

In [None]:
# Collect the chosen alpha and the test-split metrics in one record.
best_alpha = grid.best_params_["alpha"]
test_mae = mean_absolute_error(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2_score = r2_score(y_test, y_pred)

final_summary = {
    "Model": "Ridge Regression",
    "Best Alpha": best_alpha,
    "MAE": test_mae,
    "RMSE": test_rmse,
    "R2 Score": test_r2_score,
}

final_summary

In [None]:
import pickle

# Round-trip check: reload both artifacts and predict one test row.
with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

with open("ridge.pkl", "rb") as model_file:
    ridge_model = pickle.load(model_file)

# Double brackets keep the single row as a DataFrame.
sample = X_test.iloc[[0]]
sample_scaled = scaler.transform(sample)
prediction = ridge_model.predict(sample_scaled)

for loaded_object in (scaler, ridge_model):
    print(loaded_object)

print("Predicted FWI:", prediction)


# MILESTONE 3

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Reload the scaler and model that Milestone 2 wrote to disk.
with open("scaler.pkl", "rb") as scaler_handle:
    scaler = pickle.load(scaler_handle)

with open("ridge.pkl", "rb") as ridge_handle:
    ridge_model = pickle.load(ridge_handle)

print("Scaler and Ridge model loaded successfully")

In [None]:
# Read the cleaned data and drop rows that have no FWI value.
df = pd.read_csv("FWI Cleaned.csv").dropna(subset=["FWI"])

print("Dataset loaded for evaluation")
print("Shape:", df.shape)

In [None]:
# Same predictor list and target as used for training in Milestone 2.
target = "FWI"
features = ["Temperature", "RH", "Ws", "Rain", "FFMC", "DMC", "DC", "ISI", "BUI"]

y = df[target]
X = df[features]

print("Features and target defined")

In [None]:
from sklearn.model_selection import train_test_split

# Same test_size and random_state as the Milestone 2 split.
partitions = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = partitions

print("Train-test split completed")
print("Test set size:", X_test.shape)

In [None]:
# Only transform here: the loaded scaler is applied as-is, not refitted.
X_train_scaled = scaler.transform(X_train)

print("Training data scaled using saved scaler")

In [None]:
# R2 on the training split, computed with the loaded model.
y_train_pred = ridge_model.predict(X_train_scaled)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

print("Training R2 Score:", train_r2)

In [None]:
# Score the held-out split. Predictions are computed here from X_test rather
# than read from y_pred, which this milestone only assigns in a later cell;
# relying on it would fail in a fresh session or reuse a stale value.
test_r2 = r2_score(y_test, ridge_model.predict(scaler.transform(X_test)))
print("Testing R2 Score:", test_r2)

In [None]:
# Apply the loaded scaler to the held-out rows (transform only, no refit).
X_test_scaled = scaler.transform(X_test)

print("Test data scaled using saved scaler")

In [None]:
# Test-split predictions used by the metric and plot cells that follow.
y_pred = ridge_model.predict(X_test_scaled)

print("Predictions generated successfully")

In [None]:
# Error metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Evaluation Metrics:")
for label, value in (("MAE :", mae), ("RMSE:", rmse), ("R2  :", r2)):
    print(label, value)

In [None]:
# Residual = actual minus predicted.
residuals = y_test - y_pred
residual_mean = residuals.mean()

print("Residual analysis completed")
print("Residual mean:", residual_mean)

In [None]:
# Histogram of residuals with a KDE overlay; the dashed red line marks zero.
plt.figure(figsize=(6, 4))
sns.histplot(data=residuals, kde=True)
plt.axvline(x=0, color='red', linestyle='--')
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Residual Distribution")
plt.show()

In [None]:
# Scatter of predictions against actual values; the dashed diagonal spans the
# actual-value range and marks where prediction equals actual.
actual_low, actual_high = y_test.min(), y_test.max()
diagonal = [actual_low, actual_high]

plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot(diagonal, diagonal, linestyle="--")

plt.title("Actual vs Predicted FWI")
plt.xlabel("Actual FWI")
plt.ylabel("Predicted FWI")
plt.show()

In [None]:
# Recompute residuals so this cell does not depend on the earlier one.
residuals = y_test - y_pred

# Residuals against predictions, with a dashed reference line at zero.
plt.figure(figsize=(6, 4))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(y=0, linestyle="--")
plt.title("Residuals vs Predicted FWI")
plt.xlabel("Predicted FWI")
plt.ylabel("Residuals (Actual - Predicted)")
plt.show()

In [None]:
# Regularisation strength stored on the loaded model.
final_alpha = ridge_model.alpha
print("Final Ridge alpha used:", final_alpha)

In [None]:
# Two-column table of the test metrics computed above.
metric_names = ["MAE", "RMSE", "R2 Score"]
metric_values = [mae, rmse, r2]
evaluation_summary = pd.DataFrame({"Metric": metric_names, "Value": metric_values})

evaluation_summary

In [None]:
# Closing written summary, printed as a single paragraph.
conclusion = " ".join([
    "The Ridge Regression model demonstrates strong generalization",
    "on unseen data. Residuals are centered around zero, indicating",
    "unbiased predictions. Hyperparameter tuning improved performance",
    "by controlling multicollinearity among weather features.",
])
print(conclusion)