# Final Exploratory Data Analysis (EDA)
Dataset: covtype.csv

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored
# there can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's theme, then switch matplotlib back to its "default" style
# sheet (call order kept exactly as in the original cell).
sns.set()
plt.style.use("default")

# Only the first 100,000 rows are analysed, so let the parser stop there
# instead of loading the whole CSV and slicing it afterwards.
# NOTE(review): this is a head-of-file slice, not a random sample -- confirm
# the file is not ordered in a way that skews the class mix.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/covtype.csv",
    nrows=100000,
)

In [None]:
# Dimensions of the working frame as (rows, columns).
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:

# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()


In [None]:

# Row count for each distinct Cover_Type value.
df["Cover_Type"].value_counts()


In [None]:

# Bar chart of how many rows fall into each Cover_Type class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="Cover_Type", ax=ax)
ax.set_title("Distribution of Cover Types")
plt.show()


In [None]:

# One 30-bin histogram per numeric column, all drawn on a single figure.
num_cols = df.select_dtypes(include=np.number).columns
numeric_part = df[num_cols]
numeric_part.hist(bins=30, figsize=(15, 12))
plt.suptitle("Histogram of Numerical Features")
plt.show()


In [None]:

# Pairwise correlation of every column, rendered as a heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, ax=ax, cmap="coolwarm", linewidths=0.5)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:

# Elevation spread within each Cover_Type class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="Cover_Type", y="Elevation", ax=ax)
ax.set_title("Elevation vs Cover Type")
plt.show()


In [None]:
# Correlation of every column with Cover_Type, strongest positive first.
corr_with_target = df.corr()["Cover_Type"].sort_values(ascending=False)

# The target's correlation with itself is dropped before plotting.
fig, ax = plt.subplots(figsize=(8, 6))
corr_with_target.drop("Cover_Type").plot(kind="bar", ax=ax)
ax.set_title("Correlation of Features with Cover_Type")
ax.set_ylabel("Correlation value")
ax.set_xlabel("Features")
plt.show()

In [None]:
# Single-column heatmap: each row is a column's correlation with Cover_Type.
target_corr = df.corr()[["Cover_Type"]]
fig, ax = plt.subplots(figsize=(6, 10))
sns.heatmap(target_corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation of Features with Cover_Type")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def basic_info(df):
    """Print the frame's structural report followed by its summary statistics.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. Everything is written to standard output.
    """
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print(df.describe())

def check_missing(df):
    """Print the number of missing values found in each column of ``df``."""
    missing_per_column = df.isnull().sum()
    print(missing_per_column)

def plot_target_distribution(df, target):
    """Show a count plot of the ``target`` column of ``df``.

    Args:
        df: DataFrame holding the data.
        target: name of the column whose value counts are plotted.
    """
    axis = sns.countplot(data=df, x=target)
    axis.set_title("Target Distribution")
    plt.show()

def correlation_plot(df):
    """Show a heatmap of the pairwise correlations between numeric columns.

    Args:
        df: DataFrame to analyse. Non-numeric columns are left out of the
            correlation matrix.
    """
    plt.figure(figsize=(10, 6))
    # numeric_only=True keeps this usable on frames that also carry text or
    # categorical columns; recent pandas raises on those otherwise.
    sns.heatmap(df.corr(numeric_only=True), cmap="coolwarm", annot=False)
    plt.title("Correlation Heatmap")
    plt.show()


In [None]:
def numerical_summary(df):
    """Return ``describe()`` statistics for the numeric columns of ``df``.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame of summary statistics, one column per numeric column.
        An empty DataFrame is returned when ``df`` has no numeric columns.
    """
    num_df = df.select_dtypes(include=np.number)
    # describe() raises ValueError on a frame without columns; report
    # "nothing to summarise" as an empty result instead.
    if num_df.columns.empty:
        return pd.DataFrame()
    return num_df.describe()

def categorical_summary(df):
    """Return ``describe()`` statistics for the non-numeric columns of ``df``.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame of summary statistics, one column per non-numeric column.
        An empty DataFrame is returned when every column is numeric.
    """
    cat_df = df.select_dtypes(exclude=np.number)
    # An all-numeric frame leaves no columns here, and describe() raises
    # ValueError on a frame without columns.
    if cat_df.columns.empty:
        return pd.DataFrame()
    return cat_df.describe()

def skewness(df):
    """Return the skew of each numeric column of ``df`` as a Series."""
    numeric_part = df.select_dtypes(include=np.number)
    return numeric_part.skew()
def target_distribution(df, target):
    """Print the value counts of ``df[target]`` and show them as a count plot.

    Args:
        df: DataFrame holding the data.
        target: name of the column to tabulate and plot.
    """
    labels = df[target]
    print(labels.value_counts())

    plt.figure(figsize=(6, 4))
    sns.countplot(x=labels)
    plt.title("Target Distribution")
    plt.show()
def boxplots(df):
    """Show a separate box plot for every numeric column of ``df``.

    One figure is created and shown per column, so wide frames produce many
    figures.
    """
    for column in df.select_dtypes(include=np.number).columns:
        fig, ax = plt.subplots(figsize=(5, 3))
        sns.boxplot(x=df[column], ax=ax)
        ax.set_title(column)
        sns.despine(fig=fig)
        plt.show()
def binary_feature_counts(df, columns, title):
    """Plot and return the per-column sums of ``df[columns]``.

    Args:
        df: DataFrame holding the data.
        columns: column labels to sum.
        title: title for the bar chart.

    Returns:
        Series of column sums, largest first.
    """
    column_totals = df[columns].sum()
    counts = column_totals.sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    counts.plot(kind="bar", ax=ax)
    ax.set_title(title)
    plt.show()
    return counts
#binary_feature_counts(data, wilderness_cols, "Wilderness Area Count")
#binary_feature_counts(data, soil_cols, "Soil Type Count")
def violin_plots(df, features, target):
    """Show one violin plot per feature, grouped by ``target``.

    Args:
        df: DataFrame holding the data.
        features: iterable of column names, one figure each.
        target: name of the column used for the x-axis grouping.
    """
    for feature_name in features:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.violinplot(data=df, x=target, y=feature_name, ax=ax)
        ax.set_title(feature_name)
        plt.show()

def violin(data):
    """Show one violin plot per selected column, grouped by ``Cover_Type``.

    The plotted columns are chosen by position: the first four columns of
    ``data.iloc[:, 10:-1]``. The x-axis is always ``data['Cover_Type']``.
    NOTE(review): presumably these four are the wilderness-area indicators of
    the covtype layout -- confirm against the actual column order.

    Side effect: changes the seaborn style via ``sns.set_style``, which stays
    in force for later plots.

    Args:
        data: DataFrame with a ``Cover_Type`` column and the positional
            layout described above.
    """
    # First four columns of the block that starts at position 10 and stops
    # before the last column.
    wild_data = data.iloc[:, 10:-1].iloc[:, :4]

    sns.set_style("darkgrid", {'grid.color': '.1'})

    target = data['Cover_Type']

    for feature in wild_data.columns:
        plt.subplots(figsize=(13, 9))

        # Distribution of this column within each target class.
        sns.violinplot(data=wild_data, x=target, y=feature)

        plt.xticks(size=15)
        plt.yticks(size=16)
        plt.xlabel('Forest Cover Types', size=17)
        plt.ylabel(feature, size=17)

        plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

def feature_importance(df, target, model=None):
    """Fit a model on ``df`` and rank its features by importance.

    Args:
        df: DataFrame holding the features and the target column.
        target: name of the target column; all other columns are features.
        model: estimator exposing ``fit`` and ``feature_importances_``.
            When omitted, a RandomForestClassifier(n_jobs=-1,
            random_state=42) is used.

    Returns:
        DataFrame with columns "Feature" and "Importance", most important
        feature first.
    """
    estimator = model
    if estimator is None:
        estimator = RandomForestClassifier(n_jobs=-1, random_state=42)

    features = df.drop(columns=target)
    estimator.fit(features, df[target])

    ranking = pd.DataFrame(
        {
            "Feature": features.columns,
            "Importance": estimator.feature_importances_,
        }
    )
    return ranking.sort_values(by="Importance", ascending=False)


In [None]:
data = df

# Only a cell's last expression is echoed by the notebook, so the summaries
# computed mid-cell are printed explicitly.
print(numerical_summary(data))
print(skewness(data))

target_distribution(data, "Cover_Type")

# One figure per numeric column.
boxplots(data)

importance = feature_importance(data, "Cover_Type")
print(importance.head(10))

violin(df)

In [None]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees model, used here only to rank features.
model = ExtraTreesClassifier(random_state=53)

# Select the features by name rather than by position, so the split stays
# correct even if Cover_Type is not the last column.
X = df.drop(columns='Cover_Type')
y = df['Cover_Type']

model.fit(X, y)

# One row per feature, most important first.
ETC_feature_importances = pd.DataFrame(
    model.feature_importances_, index=X.columns, columns=['ETC']
).sort_values('ETC', ascending=False)

# Drop the reference to the fitted model; only the table is kept.
model = None

# Top 10 features.
ETC_feature_importances.head(10)

In [None]:
# importing model for feature importance
from sklearn.ensemble import RandomForestClassifier

# Random-forest model, used here only to rank features.
# fit() hands back the fitted estimator, so construction and training chain.
model = RandomForestClassifier(random_state=53).fit(X, y)

# One row per feature, most important first.
RFC_feature_importances = pd.DataFrame(
    {'RFC': model.feature_importances_}, index=X.columns
).sort_values(by='RFC', ascending=False)

# Drop the reference to the fitted model; only the table is kept.
model = None

# Top 10 features.
RFC_feature_importances.head(10)

In [None]:
# importing model for feature importance
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost model, used here only to rank features.
# fit() hands back the fitted estimator, so construction and training chain.
model = AdaBoostClassifier(random_state=53).fit(X, y)

# One row per feature, most important first.
ADB_feature_importances = pd.DataFrame(
    {'ADB': model.feature_importances_}, index=X.columns
).sort_values(by='ADB', ascending=False)

# Drop the reference to the fitted model; only the table is kept.
model = None

ADB_feature_importances.head(10)

In [None]:
# importing model for feature importance
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting model, used here only to rank features.
# fit() hands back the fitted estimator, so construction and training chain.
model = GradientBoostingClassifier(random_state=53).fit(X, y)

# One row per feature, most important first.
GBC_feature_importances = pd.DataFrame(
    {'GBC': model.feature_importances_}, index=X.columns
).sort_values(by='GBC', ascending=False)

# Drop the reference to the fitted model; only the table is kept.
model = None

# Top 10 features.
GBC_feature_importances.head(10)

In [None]:
## Selected features plus the target column, kept as a DataFrame for modelling.
# Each feature is listed exactly once: a repeated name ('Soil_Type34' appeared
# twice before) creates a duplicate column, and XGBoost refuses to train on a
# frame whose feature names are not unique.
# NOTE(review): this leaves 19 distinct features -- add a 20th here if a
# different column was meant in place of the repeat.
sample = df[[
    'Elevation',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Aspect',
    'Wilderness_Area4',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Hillshade_9am',
    'Slope',
    'Soil_Type22',
    'Soil_Type10',
    'Soil_Type4',
    'Soil_Type34',
    'Wilderness_Area3',
    'Soil_Type12',
    'Soil_Type2',
    'Wilderness_Area1',
    'Cover_Type',
]]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Every column except the last one is a feature; Cover_Type is the label.
X = sample.iloc[:, :-1]
y = sample['Cover_Type']

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

# Learn the scaling statistics from the training part only, then apply them
# to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score,
    confusion_matrix, classification_report
)

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Candidate classifiers keyed by display name; insertion order is the order
# in which they are evaluated later.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=2000, n_jobs=-1)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["KNN"] = KNeighborsClassifier(n_neighbors=5)
models["Naive Bayes"] = GaussianNB()
models["Random Forest"] = RandomForestClassifier(
    n_estimators=100, random_state=42, n_jobs=-1
)
# The class count is taken from the notebook-level label Series `y`.
models["XGBoost"] = XGBClassifier(
    objective="multi:softprob",
    num_class=len(y.unique()),
    eval_metric="mlogloss",
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
)


In [None]:
def evaluate_model(name, model, Xtr, Xte, ytr, yte, plot_cm=False):
    """Fit ``model``, score it on the evaluation split and return a metrics row.

    Args:
        name: label printed above the report and stored in the result.
        model: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        Xtr: training features.
        Xte: evaluation features.
        ytr: training labels.
        yte: evaluation labels.
        plot_cm: when True, also draw the confusion matrix of the evaluation
            split. Off by default, so existing callers see no extra figure.

    Returns:
        ``[name, accuracy, auc, precision, recall, f1, mcc]``. Precision,
        recall and F1 are weighted averages; AUC is one-vs-rest.
    """
    model.fit(Xtr, ytr)
    y_pred = model.predict(Xte)
    y_prob = model.predict_proba(Xte)

    acc = accuracy_score(yte, y_pred)
    prec = precision_score(yte, y_pred, average="weighted")
    rec = recall_score(yte, y_pred, average="weighted")
    f1 = f1_score(yte, y_pred, average="weighted")
    mcc = matthews_corrcoef(yte, y_pred)
    auc = roc_auc_score(yte, y_prob, multi_class="ovr")

    print(f"\n{name}")
    print(classification_report(yte, y_pred))

    if plot_cm:
        # Built from the labels passed in (yte), not a notebook-level
        # variable, so the matrix always matches the split being scored.
        cm = confusion_matrix(yte, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=False, cmap="Blues")
        plt.title(f"Confusion Matrix - {name}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

    return [name, acc, auc, prec, rec, f1, mcc]

In [None]:
# Re-initialize X_train, X_test, y_train, y_test and scaled versions
# to ensure consistent state for each run of this cell.
# This ensures y_train and y_test start as 1-indexed Series from `y`.
# Assuming X and y are already defined from previous cells.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Rebuild the split and the scaled copies so every run of this cell starts
# from the same state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

results = []

for name, model in models.items():
    # XGBoost gets labels shifted down by one, as plain arrays; every other
    # model receives the label Series unchanged.
    if name == "XGBoost":
        train_labels = (y_train - 1).values
        test_labels = (y_test - 1).values
    else:
        train_labels = y_train
        test_labels = y_test

    # Logistic Regression and KNN are fed the standardised features; the
    # remaining models are fed the raw features.
    if name in ["Logistic Regression", "KNN"]:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    results.append(
        evaluate_model(
            name, model, train_features, test_features, train_labels, test_labels
        )
    )

# One row per model, written to Drive for later comparison.
columns = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"]
results_df = pd.DataFrame(results, columns=columns)
results_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/model_comparison_results_sample.csv",
    index=False,
)

In [None]:
# Refit the random forest on the raw training split; fit() hands back the
# same estimator object, so the lookup and the fit can be chained.
rf_model = models["Random Forest"].fit(X_train, y_train)

# Importance per feature, largest first; show the top 15.
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
ranked_importances = importances.sort_values(ascending=False)
ranked_importances.iloc[:15]



## Model Performance Observations

| Model | Observation |
|-------|-------------|
| Logistic Regression | |
| Decision Tree | |
| KNN | |
| Naive Bayes | |
| Random Forest | |
| XGBoost | |


## Key Insights
1. Dataset contains no missing values.
2. Cover_Type is imbalanced.
3. Elevation shows separation among classes.
4. Some distance features are skewed.
5. Correlation exists among several variables.
