In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:

# Load the raw FWI dataset and take a first look at it.
df = pd.read_csv("FWI Dataset.csv")

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() only adds a stray "None" line.
df.info()
print(df.describe())

# Preview both ends of the table instead of dumping the whole frame.
print(df.head())
print(df.tail())

In [None]:
# Replace the Region column with integer category codes (only when the
# column is present under exactly this name).
if 'Region' in df.columns:
    print("Encoding Region column...")
    region_codes = df['Region'].astype('category').cat.codes
    # cat.codes marks missing values with -1. Turn those back into NaN so a
    # missing region is not treated as a real category and can still be
    # filled later (NaN forces a float dtype when any value is missing).
    df['Region'] = region_codes.where(region_codes != -1)

In [None]:
# Snapshot of the 64-bit integer and float columns of the frame as it
# currently stands (taken before any of the cleaning cells run).
numeric_df = df.select_dtypes(include=[np.int64, np.float64])

In [None]:
# Report the size and the column labels of the frame as loaded.
shape_as_loaded = df.shape
columns_as_loaded = df.columns
print("Dataset shape:", shape_as_loaded)
print("Original columns:", columns_as_loaded)


In [None]:
# Per-column count of missing cells, printed under a heading line.
print("Missing values before cleaning:", df.isna().sum(), sep="\n")

In [None]:
print("Rows containing missing values:")
# Boolean mask: True for every row that has at least one missing cell.
rows_with_gaps = df.isna().any(axis="columns")
print(df.loc[rows_with_gaps])

In [None]:
# Remove leading/trailing whitespace from every column label so that later
# cells can address columns by their plain names.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names
print("Column names after stripping spaces:", df.columns, sep="\n")

In [None]:
print("Cleaning string columns")
# Trim surrounding whitespace from every text (object-dtype) column.
for col in df.columns:
    if df[col].dtype == 'object':
        # astype(str) would turn a missing value into the literal text "nan",
        # which then no longer counts as missing for isnull()/fillna().
        # Remember where the gaps are and restore them after stripping.
        was_missing = df[col].isna()
        df[col] = df[col].astype(str).str.strip().mask(was_missing)


In [None]:
print("Fixing corrupted numeric entries")
# Columns expected to hold exactly one number per cell. The repair is limited
# to these so that multi-word text in other columns (e.g. a class label made
# of two words) is not truncated to its first word.
measurement_cols = ['Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']
for col in measurement_cols:
    # Only columns that were read as text can contain fused entries.
    if col in df.columns and df[col].dtype == 'object':
        # A cell such as "14.6 9" has two values run together; keep the first
        # whitespace-separated token. Missing cells stay missing.
        df[col] = df[col].str.split().str[0]

In [None]:
# Measurement columns that must end up numeric.
numeric_cols = ['Temperature','RH','Ws','Rain','FFMC','DMC','DC','ISI','BUI','FWI']
for name in numeric_cols:
    # errors='coerce' turns anything unparseable into NaN instead of raising.
    df[name] = df[name].pipe(pd.to_numeric, errors='coerce')

In [None]:
print("Filling missing values with mode")
# For each label column, replace missing entries with its most frequent value
# (the first one when several values tie).
for label_col in ('Region', 'Classes'):
    most_frequent = df[label_col].mode()[0]
    df[label_col] = df[label_col].fillna(most_frequent)


In [None]:
print("Encoding categorical columns")
# Fit a dedicated encoder on Region and store the integer labels in a new
# column; the fitted encoder stays available as le_region.
le_region = LabelEncoder().fit(df['Region'])
df['Region_encoded'] = le_region.transform(df['Region'])

In [None]:
# Same treatment for the Classes column: fit an encoder, then write the
# integer labels to Classes_encoded. The fitted encoder is kept as le_class.
le_class = LabelEncoder().fit(df['Classes'])
df['Classes_encoded'] = le_class.transform(df['Classes'])

In [None]:
# Per-column count of cells that are still missing after the cleaning cells.
print("Missing values after cleaning:", df.isna().sum(), sep="\n")

In [None]:
# (rows, columns) of the frame at this point, printed under a heading line.
print("Final dataset shape:", df.shape, sep="\n")

In [None]:
print("Label encoding non-numeric columns")

# Encode on a copy so the text columns of df itself stay readable.
df_encoded = df.copy()
# Column name -> fitted LabelEncoder, kept so codes can be mapped back later.
label_encoders = {}

for col in df_encoded.columns:
    if df_encoded[col].dtype == 'object':
        le = LabelEncoder()
        # astype(str) gives the encoder a uniform type even if a column
        # mixes strings with missing values.
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
        label_encoders[col] = le

print("Selecting all numeric columns (including encoded)")
# include='number' keeps numeric columns of every width. Listing only
# int64/float64 silently drops narrower integer columns such as the int8
# codes produced by .cat.codes.
numeric_df = df_encoded.select_dtypes(include='number')

print("Plotting correlation heatmap for all numeric features")
plt.figure(figsize=(10,8))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


In [None]:
print("Plotting histograms for all numeric columns")
# Rebuild numeric_df from df itself (not the fully encoded copy), keeping the
# 64-bit integer and float columns; later cells reuse this selection.
wanted_dtypes = ['int64', 'float64']
numeric_df = df.select_dtypes(include=wanted_dtypes)

# One histogram panel per selected column.
numeric_df.hist(bins=30, figsize=(15, 12))
plt.tight_layout()
plt.show()


In [None]:
print("Plotting correlation heatmap")
# Pairwise correlations of the selected numeric columns, drawn on an
# explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
correlations = numeric_df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
print("Plotting density distribution plots")
# One filled kernel-density curve per numeric column, each in its own figure.
for feature in numeric_df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.kdeplot(numeric_df[feature], fill=True, ax=ax)
    ax.set_title(f"Density Plot of {feature}")
    plt.show()


In [None]:
print("Plotting boxplots for outlier detection")
# One horizontal boxplot per numeric column; values are read from df.
for feature in numeric_df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f"Boxplot of {feature}")
    plt.show()


In [None]:
print("Performing outlier treatment using IQR method")

# Columns that hold category codes rather than measurements. Clipping them
# to an IQR fence is meaningless and can erase a minority category.
categorical_code_cols = {'Region', 'Region_encoded', 'Classes_encoded'}

for col in numeric_df.columns:
    if col in categorical_code_cols:
        continue

    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    if iqr == 0:
        # Both fences would coincide and flatten the column to one constant.
        continue

    # Standard 1.5 * IQR fences; values outside are pulled back to the fence.
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    df[col] = df[col].clip(lower, upper)

print("Outlier treatment completed.")


In [None]:
# Remaining missing-value counts followed by each column's dtype.
print("Checking feature consistency", df.isna().sum(), df.dtypes, sep="\n")

In [None]:
print("Plotting scatterplots for feature relationships")

# (x-axis column, figure title) for each plot; FWI is always on the y-axis.
scatter_specs = [
    ('Temperature', "Temperature vs FWI"),
    ('Ws', "Wind Speed (Ws) vs FWI"),
    ('RH', "Relative Humidity (RH) vs FWI"),
]

for feature, title in scatter_specs:
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=df[feature], y=df['FWI'])
    plt.title(title)
    plt.show()


In [None]:
# First rows and (rows, columns) of the frame right before it is saved.
print("Head and shape of final cleaned dataset:", df.head(), df.shape, sep="\n")

In [None]:
# Write the cleaned frame to disk without the index column. The confirmation
# message is built from the same path so it always names the file written.
output_path = "FWI Cleaned.csv"
df.to_csv(output_path, index=False)
print(f"Saved {output_path}")