<a href="https://colab.research.google.com/github/Charee-Villapong/forest-type-classification/blob/main/%E6%9E%97%E5%9E%8B%E5%88%86%E9%A1%9E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Google Driveと接続

In [None]:
# Mount Google Drive into the Colab runtime so files under MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report

# Helper for changing the data type of a single column
def change_data_type(data_type: str, col_name: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Cast one column of ``df`` to ``data_type`` and hand the frame back.

    The column is overwritten on the DataFrame that was passed in, so the
    caller's object is modified; the return value is that same object.

    Args:
        data_type: str, dtype passed to ``Series.astype``
        col_name: str, name of the column to convert
        df: pd.DataFrame, frame holding the column
    Returns:
        df: pd.DataFrame, the input frame with the converted column
    """
    converted = df[col_name].astype(data_type)
    df[col_name] = converted
    return df

# Helper for computing the gap statistic
def gap_statistic(data, n_refs=10, max_clusters=10, random_state=42):
    """
    Pick a k-means cluster count using a gap-statistic criterion.

    For every k in 1..max_clusters, k-means is fitted to ``data`` and to
    ``n_refs`` reference datasets. The dispersion of a fit is the mean
    Euclidean distance from each point to its nearest centroid (note: this is
    not the within-cluster sum of squares used in the original gap-statistic
    paper). The gap for k is the mean of log(reference dispersion) over the
    reference datasets minus log(dispersion of the real data).

    Reference datasets are drawn uniformly from the axis-aligned bounding box
    of ``data`` so that they share the scale of the real data; sampling from
    the unit cube would make the comparison meaningless for data that does
    not live in [0, 1), e.g. standardized features.

    Args:
        data: array-like of shape (n_samples, n_features), data to cluster
        n_refs: int, number of reference datasets per k (must be >= 1)
        max_clusters: int, largest number of clusters to try (must be >= 1)
        random_state: int or None, seed used for k-means and for drawing the
            reference datasets; the default 42 is the seed k-means used before
            this parameter existed
    Returns:
        gaps: list, gap value for each k (gaps[i] belongs to k = i + 1)
        optimal_k: int, the k with the largest gap (smallest k on ties)
    Raises:
        ValueError: if n_refs or max_clusters is smaller than 1
    """
    if max_clusters < 1:
        raise ValueError("max_clusters must be at least 1")
    if n_refs < 1:
        raise ValueError("n_refs must be at least 1")

    points = np.asarray(data, dtype=float)
    rng = np.random.default_rng(random_state)

    # Per-feature bounds of the real data define the reference sampling box.
    lower = points.min(axis=0)
    upper = points.max(axis=0)

    def _dispersion(samples, k):
        # Mean distance from every sample to its closest k-means centroid.
        fitted = KMeans(n_clusters=k, random_state=random_state).fit(samples)
        nearest = cdist(samples, fitted.cluster_centers_, 'euclidean').min(axis=1)
        return nearest.mean()

    gaps = []
    for k in range(1, max_clusters + 1):
        actual_log_disp = np.log(_dispersion(points, k))

        # Average the logs (not the log of the average) over the references.
        ref_log_disps = [
            np.log(_dispersion(rng.uniform(lower, upper, size=points.shape), k))
            for _ in range(n_refs)
        ]
        gaps.append(np.mean(ref_log_disps) - actual_log_disp)

    # argmax returns the first maximum, i.e. the smallest k among ties.
    optimal_k = int(np.argmax(gaps)) + 1
    return gaps, optimal_k

def create_categorical_column(df, prefix, start, end):
    """
    Collapse a group of numbered columns into one categorical column, in place.

    The columns ``f'{prefix}{i}'`` for ``i`` in ``start..end`` are replaced by
    a single column named ``prefix.rstrip('_')``. Each row's value is the
    number (as a string) of the column that holds the row's maximum, so for
    one-hot encoded groups it is the number of the active column.

    The categories are always the full range ``str(start)..str(end)`` in
    numeric order, even when some values do not occur in ``df``. This keeps
    the dtype identical across separately processed frames (e.g. train,
    validation and test splits).

    :param df: DataFrame to process; it is modified in place
    :param prefix: prefix shared by the source columns
    :param start: first column number
    :param end: last column number (inclusive)
    :return: the same DataFrame, with the new column added and the source
        columns dropped
    """
    columns = [f'{prefix}{i}' for i in range(start, end + 1)]
    new_column_name = prefix.rstrip('_')

    # Label of the column holding each row's maximum; ties resolve to the
    # first such column.
    winning_columns = df[columns].idxmax(axis=1)

    # Strip the known prefix rather than searching for digits, so a prefix
    # that itself contains digits (e.g. 'v2_') is handled correctly.
    labels = winning_columns.str[len(prefix):]

    categories = [str(i) for i in range(start, end + 1)]
    df[new_column_name] = pd.Categorical(labels, categories=categories)
    df.drop(columns=columns, inplace=True)
    return df

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Dataset directory on the mounted Google Drive; both file paths derive from it
# so the location only has to be changed in one place.
PATH = "/content/drive/MyDrive/2025/林形分類"
train_data = f"{PATH}/train.tsv"
test_data = f"{PATH}/test.tsv"

# Read the tab-separated training file and drop its "Unnamed: 0" column.
df = pd.read_csv(train_data, sep="\t").drop(columns="Unnamed: 0")

# Separate the target (Cover_Type) from the feature columns.
y = df["Cover_Type"]
X = df.drop(columns="Cover_Type")

In [None]:
# --- Split into training / validation / test data ---------------------------
# 67% goes to training; the held-out 33% is then halved into validation and
# test sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.33, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

# The feature-engineering steps below add and drop columns in place, so work
# on explicit copies of the splits.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()


def _scale_numeric(features, fitted_scaler, columns):
    """Standardize `columns` of `features` with an already fitted scaler, keeping column names."""
    return pd.DataFrame(fitted_scaler.transform(features[columns]), columns=columns)


def _add_engineered_features(features, cluster_labels, n_clusters):
    """Attach the k-means cluster id and collapse the one-hot groups of `features`, in place."""
    # Declare every cluster id as a category so all splits share one dtype,
    # even when a cluster does not occur in a given split.
    features["k_pred"] = pd.Categorical(cluster_labels, categories=list(range(n_clusters)))
    create_categorical_column(features, 'Soil_Type', 1, 40)
    create_categorical_column(features, 'Wilderness_Area', 1, 4)
    return features


# --- Build the training features ---------------------------------------------

# Standardization: the scaler is fitted on the training split only.
num_cols = X_train.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)

# Standardize the validation split with the training statistics.
X_val_scaled = _scale_numeric(X_val, scaler, num_cols)

# Choose the number of clusters.
_gaps, optimal_k = gap_statistic(X_scaled, n_refs=10, max_clusters=30)
print(f"最適なクラスタ数：{optimal_k}")

# Feature generation with k-means: the cluster id becomes a categorical feature.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
_add_engineered_features(X_train, kmeans.fit_predict(X_scaled), optimal_k)
_add_engineered_features(X_val, kmeans.predict(X_val_scaled), optimal_k)

# --- Modelling -----------------------------------------------------------------
params = {
    'boosting_type': 'goss',
    'max_depth': 5,
    'random_state': 0,
    'learning_rate': 0.07,
    'objective': 'multiclass',
    'num_leaves': 63,
    'feature_fraction': 0.8,
    'num_class': 7,
    'metric': 'multi_logloss'
}

# Train the model, monitoring the validation split for early stopping.
# NOTE(review): no n_estimators is set, and the recorded run reported
# "Did not meet early stopping" with best iteration 100, so the round limit was
# reached before early stopping could trigger - consider raising n_estimators.
model = lgb.LGBMClassifier(**params)
model.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
          callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True)])

# --- Inference on the held-out test split ---------------------------------------

# Same standardization and k-means feature generation as for the other splits.
X_test_scaled = _scale_numeric(X_test, scaler, num_cols)
_add_engineered_features(X_test, kmeans.predict(X_test_scaled), optimal_k)

y_proba = model.predict_proba(X_test)
# Map probability columns back to labels through classes_ instead of assuming
# that the labels are exactly 1..n.
y_pred = model.classes_[np.argmax(y_proba, axis=1)]

# --- Evaluation -------------------------------------------------------------------
for average in ('macro', 'micro', 'weighted'):
    print(f"f1_score_{average}: {f1_score(y_test, y_pred,average=average)}")

conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Precision, recall and F1 score for each class
report = classification_report(y_test, y_pred)
print(report)

最適なクラスタ数：30
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2252
[LightGBM] [Info] Number of data points in the train set: 194639, number of used features: 13
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score -1.010483
[LightGBM] [Info] Start training from score -0.716554
[LightGBM] [Info] Start training from score -2.785657
[LightGBM] [Info] Start training from score -5.313011
[LightGBM] [Info] Start training from score -4.141359
[LightGBM] [Info] Start training from score -3.512082
[LightGBM] [Info] Start training from score -3.347482
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.44494




f1_score_macro: 0.7437489193003579
f1_score_micro: 0.8089456335795051
f1_score_weighted: 0.8064466777510592
[[13676  3619     1     0     7     4   124]
 [ 3016 20023   142     3    34   135    17]
 [    0   204  2443    27     1   221     0]
 [    0     0    46   160     0     3     0]
 [   12   524    22     0   322     4     0]
 [    3   250   386    14     0   825     0]
 [  329     9     0     0     1     0  1327]]
              precision    recall  f1-score   support

           1       0.80      0.78      0.79     17431
           2       0.81      0.86      0.83     23370
           3       0.80      0.84      0.82      2896
           4       0.78      0.77      0.77       209
           5       0.88      0.36      0.52       884
           6       0.69      0.56      0.62      1478
           7       0.90      0.80      0.85      1666

    accuracy                           0.81     47934
   macro avg       0.81      0.71      0.74     47934
weighted avg       0.81      0.81 

In [None]:
# Feature importance
# Label every importance with the feature name recorded by the fitted model
# (a bare positional index cannot be interpreted) and list the most important
# features first.
# NOTE(review): the model is built without importance_type, so these should be
# LightGBM's default split counts - confirm against the installed version.
importance = model.feature_importances_
feature_importance = pd.Series(
    importance, index=model.feature_name_, name="importance"
).sort_values(ascending=False)
for feature, value in feature_importance.items():
    print(f"{feature}: {value}")


Feature 0: 3304
Feature 1: 1183
Feature 2: 728
Feature 3: 1672
Feature 4: 1577
Feature 5: 2965
Feature 6: 1158
Feature 7: 1149
Feature 8: 966
Feature 9: 2767
Feature 10: 861
Feature 11: 990
Feature 12: 357
