In [1]:
!pip install -q imbalanced-learn scikit-learn pandas numpy joblib


In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/train.csv")
df.head()


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [3]:
# Confirm there are no nulls
print(df.isnull().sum())

# Drop the target column to separate features
X = df.drop(columns=["Cover_Type"])  # ✅ make sure no extra columns
y = df["Cover_Type"]

print(f"✅ Number of features in X: {X.shape[1]}")  # Should be 54


Id                                    0
Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [17]:
from imblearn.over_sampling import SMOTE

# Feature Scaling before SMOTE
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# ✅ Save the scaler here, because it's correctly fit on 54 features
import joblib
joblib.dump(scaler, "scaler.pkl")

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_scaled, y_train)


In [18]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train_bal, y_train_bal)

# Save the trained model
joblib.dump(model, "forest_cover_model.pkl")


['forest_cover_model.pkl']

In [19]:
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.8617724867724867

📋 Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.77      0.77       432
           2       0.80      0.66      0.72       432
           3       0.85      0.81      0.83       432
           4       0.94      0.98      0.96       432
           5       0.89      0.95      0.92       432
           6       0.83      0.89      0.86       432
           7       0.93      0.97      0.95       432

    accuracy                           0.86      3024
   macro avg       0.86      0.86      0.86      3024
weighted avg       0.86      0.86      0.86      3024


🧮 Confusion Matrix:
 [[334  60   1   0  10   1  26]
 [ 82 283  11   0  34  18   4]
 [  0   1 352  21   8  50   0]
 [  0   0   4 424   0   4   0]
 [  1   7   8   0 410   6   0]
 [  0   4  36   6   1 385   0]
 [ 14   0   0   0   0   0 418]]


In [21]:
joblib.dump(model,"forest_cover_model.pkl",compress=1)

['forest_cover_model.pkl']