ဒီ data set က ကော်လိုရာဒိုက Roosevelt National Forest ရဲ့ 30x30 မီတာ ပတ်လည် cell တွေမှာ ဘယ်လို သစ်တော အမျိုးအစားတွေ ရှိလဲ ဆိုတာ ခန့်မှန်းဖို့ ဖြစ်တယ်။ target class 7 မျိုးရှိပြီး အောက်ပါ သစ်တော အမျိုးအစားတွေဖြစ်ပါတယ်။

Class	Cover Type
1	Spruce/Fir
2	Lodgepole Pine
3	Ponderosa Pine
4	Cottonwood/Willow
5	Aspen
6	Douglas-fir
7	Krummholz

Feature အနေနဲ့ ၁၂ ခု ရှိပြီး အောက်ဆုံးနှစ်ခု (Wilderness_Area နဲ့ Soil_Type) က one hot encoding လုပ်ထားလို့ column ၄၄ ခု ဖြစ်သွားတယ်။ ဒါကြောင့် စုစုပေါင်း column ၅၄ ခု ရှိတယ်။

Features (54 ခု)

Quantitative Features (10 ခု)
Feature	Description
Elevation	ပင်လယ်ရေမျက်နှာပြင်အထက် အမြင့် (meters)
Aspect	Azimuth direction (0-360°)
Slope	Steepness (degrees)
Horizontal_Distance_To_Hydrology	ရေအနီးဆုံး အကွာအဝေး
Vertical_Distance_To_Hydrology	ရေနဲ့ ဒေါင်လိုက် အမြင့်ခြား
Horizontal_Distance_To_Roadways	လမ်းအနီးဆုံး အကွာအဝေး
Hillshade_9am, Hillshade_Noon, Hillshade_3pm	အရိပ် index (0-255)
Horizontal_Distance_To_Fire_Points	မီးလောင်ရာ အနီးဆုံး အကွာအဝေး

Binary Features (44 ခု)
Wilderness_Area (4 columns) - One-hot encoded
Soil_Type (40 columns) - One-hot encoded

In [None]:
# =============================================================================
# Forest Cover Type Prediction - Kaggle Competition
# =============================================================================
# ဒီ code က Kaggle competition အတွက် submission file ထုတ်ပေးပါမယ်
# Dataset: https://www.kaggle.com/c/forest-cover-type-prediction
# =============================================================================

# %% [1] Library imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# %% [2] Data Loading
# Inside a Kaggle notebook the competition files live under
# '/kaggle/input/forest-cover-type-prediction/'. When running locally, point
# the paths below at the directory the CSVs were downloaded to.

# Kaggle environment
kaggle_input_dir = '/kaggle/input/forest-cover-type-prediction'
train_path = f'{kaggle_input_dir}/train.csv'
test_path = f'{kaggle_input_dir}/test.csv'

# Read both CSV files into DataFrames
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Expected: (15120, 56) - 54 features + Id + Cover_Type
print(f"Training data shape: {train_df.shape}")
# Expected: (565892, 55) - 54 features + Id
print(f"Test data shape: {test_df.shape}")

# %% [3] Data Exploration
# Quick look at the training data: first rows, column names and how the
# target classes are distributed.
print(f"\n=== Training Data Info ===\n{train_df.head()}")

column_names = train_df.columns.tolist()
print(f"\n=== Column Names ===\n{column_names}")

# Row count per Cover_Type class, ordered by class label
class_counts = train_df['Cover_Type'].value_counts().sort_index()
print(f"\n=== Target Distribution ===\n{class_counts}")

# %% [4] Data Preprocessing
# Split the training frame into features and target, and build the matching
# feature matrix for the test set.

# Training data
X = train_df.drop(columns=['Id', 'Cover_Type'])   # drop the Id and the target
y = train_df['Cover_Type']                        # target column

# Test data (for the submission)
test_ids = test_df['Id']                          # kept to label the submission rows

# Select the test features by the training column names instead of taking
# "everything except Id". This guarantees the same columns in the same order
# as X, and fails right here with a clear message if test.csv lacks any of
# them, rather than later inside the scaler or the model.
missing_cols = X.columns.difference(test_df.columns)
if not missing_cols.empty:
    raise KeyError(
        f"Test data is missing feature columns: {missing_cols.tolist()}"
    )
X_test_final = test_df[X.columns]

print("\nFeatures shape:", X.shape)      # expected (15120, 54)
print("Target shape:", y.shape)          # expected (15120,)
print("Test features shape:", X_test_final.shape)  # expected (565892, 54)

# %% [5] Feature Scaling
# Standardise the quantitative features. This is optional for Random Forest
# but helps if another model is tried on the same data.
# NOTE(review): the scaler is fitted on the whole training set before the
# train/validation split, so validation and CV rows contribute to the scaling
# statistics. If a scale-sensitive model is swapped in, fit the scaler inside
# a Pipeline instead.

scaler = StandardScaler()

# Quantitative columns are everything that is not a one-hot indicator.
# Selecting by name prefix rather than by position (the first 10 columns)
# keeps the right columns scaled even if the CSV column order changes.
# Assumes the one-hot columns are named Wilderness_Area* / Soil_Type*.
one_hot_prefixes = ('Wilderness_Area', 'Soil_Type')
numerical_cols = [col for col in X.columns if not col.startswith(one_hot_prefixes)]
print("\nNumerical columns:", numerical_cols)

# Work on copies so the unscaled frames stay available
X_scaled = X.copy()
X_test_scaled = X_test_final.copy()

# Fit on the training features only, then apply the same transform to test
X_scaled[numerical_cols] = scaler.fit_transform(X[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test_final[numerical_cols])

# %% [6] Train/Validation Split
# Hold out part of the training data for local validation.

split_options = {
    'test_size': 0.2,      # 20% of the rows go to validation
    'random_state': 42,    # fixed seed for reproducibility
    'stratify': y,         # keep the class proportions in both parts
}
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, **split_options)

print(f"\nTrain size: {X_train.shape[0]}")  # 12096
print(f"Validation size: {X_val.shape[0]}") # 3024

# %% [7] Model Training - Random Forest
# Random Forest classifier (an ensemble of decision trees), fitted on the
# training part of the split.

print("\n=== Training Random Forest ===")

rf_params = {
    'n_estimators': 200,       # number of trees
    'max_depth': 30,           # maximum depth of each tree
    'min_samples_split': 2,    # minimum samples needed to split a node
    'min_samples_leaf': 1,     # minimum samples required at a leaf
    'max_features': 'sqrt',    # features considered at each split
    'n_jobs': -1,              # use every CPU core
    'random_state': 42,
    'verbose': 1,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit the model
rf_model.fit(X_train, y_train)

# %% [8] Model Evaluation
# Score the fitted Random Forest on the held-out validation rows.

# Predicted classes for the validation set
y_val_pred = rf_model.predict(X_val)

# Overall accuracy, shown both as a fraction and as a percentage
val_accuracy = accuracy_score(y_val, y_val_pred)
accuracy_pct = val_accuracy * 100
print("\n=== Validation Results ===")
print(f"Validation Accuracy: {val_accuracy:.4f} ({accuracy_pct:.2f}%)")

# Per-class precision / recall / F1
print("\n=== Classification Report ===")
val_report = classification_report(y_val, y_val_pred)
print(val_report)

# %% [9] Cross Validation
# 5-fold cross validation of the Random Forest on the full (scaled) training set.

print("\n=== Cross Validation (5-Fold) ===")
cv_scores = cross_val_score(
    rf_model,
    X_scaled,
    y,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"CV Scores: {cv_scores}")

# Report the mean with a +/- band of two standard deviations
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2
print(f"Mean CV Score: {cv_mean:.4f} (+/- {cv_spread:.4f})")

# %% [10] Final Model Training
# Retrain on the complete training set; this is the model used for the submission.
# NOTE(review): these settings (300 trees, no depth limit) differ from the
# rf_model configuration that was validated earlier in this file, so the
# validation and CV scores are only an approximation for this model.

print("\n=== Training Final Model on Full Data ===")

final_params = {
    'n_estimators': 300,       # more trees than the validation model
    'max_depth': None,         # no depth limit
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 1,
}
final_model = RandomForestClassifier(**final_params)

# Fit on every training row
final_model.fit(X_scaled, y)

# %% [11] Feature Importance
# Rank the features by the importance the final model assigned to them.

print("\n=== Top 10 Important Features ===")
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': final_model.feature_importances_,
})
# Most important first
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print(feature_importance.head(10))

# %% [12] Make Predictions on Test Data
# Predict the cover type for every row of the (scaled) test features.

print("\n=== Making Predictions on Test Data ===")
test_predictions = final_model.predict(X_test_scaled)

print(f"Predictions shape: {test_predictions.shape}")

# Number of predicted rows per class, ordered by class label
prediction_counts = pd.Series(test_predictions).value_counts().sort_index()
print(f"Prediction distribution:\n{prediction_counts}")

# %% [13] Create Submission File
# Kaggle submission format: two columns, Id and Cover_Type.

submission = pd.DataFrame({'Id': test_ids}).assign(Cover_Type=test_predictions)

# Write the CSV without the DataFrame index
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

print("\n=== Submission File Created ===")
print(submission.head(10))
print(f"\nSubmission shape: {submission.shape}")
print("\n✅ 'submission.csv' file ကို Kaggle မှာ submit လုပ်ပါ!")

# %% [14] Optional: Gradient Boosting Model (an extra model to experiment with)
# Disabled by default. Set RUN_GRADIENT_BOOSTING = True to train a gradient
# boosting classifier on the same train/validation split as the Random Forest
# and print its validation accuracy.
# The code is guarded by a flag instead of being wrapped in a triple-quoted
# string: a bare string is still an evaluated expression (a notebook echoes it
# as the cell output) and cannot be enabled by simply "uncommenting".

RUN_GRADIENT_BOOSTING = False

if RUN_GRADIENT_BOOSTING:
    print("\n=== Training Gradient Boosting ===")

    gb_model = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
        verbose=1
    )

    gb_model.fit(X_train, y_train)
    y_val_pred_gb = gb_model.predict(X_val)
    gb_accuracy = accuracy_score(y_val, y_val_pred_gb)
    print(f"Gradient Boosting Validation Accuracy: {gb_accuracy:.4f}")