In [1]:
# 1) Install kaggle
!pip install --quiet kaggle


In [2]:
from google.colab import files
uploaded = files.upload()  # choose kaggle.json from your machine
# After upload, move it to the right place and set permissions
!mkdir -p /root/.kaggle
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [3]:
# Download into current runtime directory
!kaggle competitions download -c forest-cover-type-prediction
# List downloaded zip(s)
!ls -lah


Downloading forest-cover-type-prediction.zip to /content
  0% 0.00/25.3M [00:00<?, ?B/s]
100% 25.3M/25.3M [00:00<00:00, 2.14GB/s]
total 26M
drwxr-xr-x 1 root root 4.0K Oct 24 08:52 .
drwxr-xr-x 1 root root 4.0K Oct 24 08:47 ..
drwxr-xr-x 4 root root 4.0K Oct 22 13:38 .config
-rw-r--r-- 1 root root  26M Dec 11  2019 forest-cover-type-prediction.zip
drwxr-xr-x 1 root root 4.0K Oct 22 13:39 sample_data


In [4]:
!unzip -q forest-cover-type-prediction.zip -d forest_data
!ls -lah forest_data


total 93M
drwxr-xr-x 2 root root 4.0K Oct 24 08:53 .
drwxr-xr-x 1 root root 4.0K Oct 24 08:53 ..
-rw-r--r-- 1 root root 4.8M Dec 11  2019 sampleSubmission.csv
-rw-r--r-- 1 root root 1.2M Dec 11  2019 sampleSubmission.csv.zip
-rw-r--r-- 1 root root  161 Dec 11  2019 test3.csv
-rw-r--r-- 1 root root  73M Dec 11  2019 test.csv
-rw-r--r-- 1 root root  12M Dec 11  2019 test.csv.zip
-rw-r--r-- 1 root root 2.0M Dec 11  2019 train.csv
-rw-r--r-- 1 root root 333K Dec 11  2019 train.csv.zip


In [5]:
import pandas as pd

train = pd.read_csv("forest_data/train.csv")
test = pd.read_csv("forest_data/test.csv")
sample = pd.read_csv("forest_data/sampleSubmission.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Sample submission shape:", sample.shape)

train.head()


Train shape: (15120, 56)
Test shape: (565892, 55)
Sample submission shape: (565892, 2)


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [6]:
# ===============================================
# 🧹 DATA PREPROCESSING for Forest Cover Dataset
# ===============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv("forest_data/train.csv")
test = pd.read_csv("forest_data/test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (15120, 56)
Test shape: (565892, 55)


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [7]:
# ------------------------------------------------
# 1️⃣ Inspect the dataset
# ------------------------------------------------

print("Missing values per column:\n", train.isnull().sum().head())
print("\nData info:")
train.info()


Missing values per column:
 Id                                  0
Elevation                           0
Aspect                              0
Slope                               0
Horizontal_Distance_To_Hydrology    0
dtype: int64

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   Id                                  15120 non-null  int64
 1   Elevation                           15120 non-null  int64
 2   Aspect                              15120 non-null  int64
 3   Slope                               15120 non-null  int64
 4   Horizontal_Distance_To_Hydrology    15120 non-null  int64
 5   Vertical_Distance_To_Hydrology      15120 non-null  int64
 6   Horizontal_Distance_To_Roadways     15120 non-null  int64
 7   Hillshade_9am                       15120 non-null  int64
 8   Hillshade_Noo

In [8]:
# ------------------------------------------------
# 3️⃣ Separate features and target
# ------------------------------------------------

X = train.drop(columns=["Cover_Type"])
y = train["Cover_Type"]

print("Feature shape:", X.shape)
print("Target distribution:\n", y.value_counts())


Feature shape: (15120, 55)
Target distribution:
 Cover_Type
5    2160
2    2160
1    2160
7    2160
3    2160
6    2160
4    2160
Name: count, dtype: int64


In [9]:
# ------------------------------------------------
# 4️⃣ Identify continuous (numeric) columns
# ------------------------------------------------

continuous_cols = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points'
]

print("Continuous features:\n", continuous_cols)


Continuous features:
 ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']


In [10]:
# ------------------------------------------------
# 5️⃣ Scale continuous features
# ------------------------------------------------

scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[continuous_cols] = scaler.fit_transform(X[continuous_cols])

print("✅ Continuous columns scaled successfully.")


✅ Continuous columns scaled successfully.


In [11]:
# ------------------------------------------------
# 6️⃣ Train-test split for validation
# ------------------------------------------------

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)


Training set shape: (12096, 55)
Validation set shape: (3024, 55)


In [12]:
# ------------------------------------------------
# 7️⃣ Apply same scaling to test data
# ------------------------------------------------

test_scaled = test.copy()
test_scaled[continuous_cols] = scaler.transform(test[continuous_cols])

print("✅ Test data scaled and ready for prediction.")


✅ Test data scaled and ready for prediction.


In [13]:
# ------------------------------------------------
# ✅ Final preprocessed data ready for modeling
# ------------------------------------------------

print("✅ Preprocessing complete!")
print("Train columns:", X_train.columns.tolist()[:10], "...")
print("Train data ready for model training.")


✅ Preprocessing complete!
Train columns: ['Id', 'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'] ...
Train data ready for model training.
