## __準備__

In [None]:
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report

import lightgbm as lgb

In [None]:
# Mount Google Drive at /content/drive (Colab-only; the data directory used below lives under this path)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the competition CSV files.
DIR_NAME = '/content/drive/MyDrive/DataCompetition/SMBC/data/'

# Load the train and test tables, using the first CSV column as the row index.
train, test = (
    pd.read_csv(f'{DIR_NAME}{csv_name}', index_col=0)
    for csv_name in ('train.csv', 'test.csv')
)

# The sample submission is read without a header row, so column names are given explicitly.
submit_sample = pd.read_csv(
    f'{DIR_NAME}sample_submission.csv',
    header=None,
    names=['id', 'y'],
    index_col=0,
)

## __前処理__




In [None]:
# Tree-height feature groups: lists of `spc_common` values.
# NOTE(review): the names suggest height bands (presumably mature height in feet) — confirm the unit/source.
# 'pin oak' and 'black oak' each appear twice in over50; harmless, since the lists are only used for membership tests.
over50 = ['Kentucky coffeetree', 'honeylocust', 'ginkgo', 'American beech', 'European beech', 'hardy rubber tree', 'Turkish hazelnut', 'katsura tree', 'common hackberry', 'river birch', 'Ohio buckeye', 'horse chestnut', 'red maple', 'London planetree', 'dawn redwood', 'sweetgum', 'shingle oak', 'white oak', 'swamp white oak', 'pin oak', 'bur oak', 'black oak', 'pin oak', 'scarlet oak', 'willow oak', 'English oak', "Schumard's oak", 'northern red oak', 'black oak', 'bald cypress', 'littleleaf linden', 'silver linden', 'American linden', 'American elm', 'Japanese zelkova']
over35_under50 = ['red horse chestnut', 'American hophornbeam', 'European hornbeam', 'Japanese hornbeam', 'Kentucky yellowwood', 'golden raintree', 'Amur maackia', 'Persian ironwood', 'hedge maple', ]
under35 = ['Amur maple', 'paperbark maple', 'eastern redbud', 'Oklahoma redbud', 'serviceberry', 'Chinese fringetree', 'flowering dogwood', 'kousa dogwood', 'pagoda dogwood', 'crab apple', 'cockspur hawthorn', 'hawthorn', 'Japanese tree lilac', 'Chinese tree lilac']

# Leaf-shape feature groups: lists of `spc_common` values. A species may appear in more than one
# group (e.g. 'hawthorn' is in both football and hand), so the resulting flags are not mutually exclusive.
# 'Amur maackia' appears twice in compoundline; harmless for membership tests.
teardrop = ['Japanese zelkova', 'Callery pear', 'mulberry', 'silver birch', 'pagoda dogwood', 'Cornelian cherry', 'catalpa', 'American beech', 'quaking aspen', 'bigtooth aspen', 'cucumber magnolia', 'magnolia', 'southern magnolia']
spade = ["'Schubert' chokecherry", 'eastern redbud', 'Oklahoma redbud', 'katsura tree', 'paper birch', 'empress tree', 'eastern cottonwood']
uneven = ['littleleaf linden', 'silver linden', 'American linden', 'American elm', 'Chinese elm', 'Siberian elm', 'Japanese tree lilac', 'Chinese tree lilac']
oak = ['English oak', "Schumard's oak",'pin oak', 'bur oak', 'black oak', 'scarlet oak', 'swamp white oak', 'northern red oak', 'white oak', 'shingle oak']
broom = ['pitch pine', 'red pine', 'white pine', 'pine']
feather = ['dawn redwood', 'bald cypress', 'Norway spruce', 'blue spruce', 'spruce', 'eastern hemlock']
compoundhand = ['red horse chestnut', 'horse chestnut', 'Ohio buckeye', 'paperbark maple']
compoundline = ['honeylocust', 'green ash', 'ash', 'white ash', 'Sophora', 'Kentucky coffeetree', 'golden raintree', 'Amur maackia', 'tree of heaven', 'black locust', 'Kentucky yellowwood', 'black walnut', 'Amur cork tree', 'Amur maackia', 'pignut hickory']
scales = ['eastern redcedar', 'arborvitae', 'Atlantic white cedar', 'Himalayan cedar', 'Atlas cedar', 'bald cypress', 'pond cypress', 'false cypress']
football = ['cherry', 'black cherry', 'purple-leaf plum', 'American hornbeam', 'American hophornbeam', 'European hornbeam', 'Japanese hornbeam', 'crab apple', 'serviceberry', 'flowering dogwood', 'kousa dogwood', 'cucumber magnolia', 'magnolia', 'southern magnolia', 'sawtooth oak', 'willow oak', 'shingle oak', 'hardy rubber tree', 'Japanese snowbell', 'blackgum', 'Persian ironwood', 'crepe myrtle', 'Chinese fringetree', 'European beech', 'two-winged silverbell', 'holly', 'hawthorn', 'cockspur hawthorn']
hand = ['London planetree', 'crimson king maple', 'Amur maple', 'Norway maple', 'silver maple', 'maple', 'red maple', 'tartar maple', 'sycamore maple', 'hedge maple', 'Japanese maple', 'sugar maple', 'Shantung maple','paperbark maple', 'trident maple', 'tulip-poplar', 'sweetgum', 'hawthorn', 'cockspur hawthorn', 'ginkgo', 'Turkish hazelnut', 'sassafras']

# Lookup used for feature creation: each key becomes the name of a 0/1 flag column,
# and each value is the list of `spc_common` names that set that flag.
spc_dict = {'over50':over50, 'over35_under50':over35_under50, 'under35':under35, 'teardrop':teardrop, 'spade':spade, 'uneven':uneven, 'oak':oak, 'broom':broom, 'feather':feather, 'compoundhand':compoundhand, 'compoundline':compoundline, 'scales':scales, 'football':football, 'hand':hand}


def preprocess_data(data):
    """
    Turn a raw tree table into the feature table used for modelling.

    Steps, in order:
      1. drop the columns that are not used as features,
      2. reduce ``created_at`` to its two-character month (characters 5-6 of the
         string) and derive an integer month and a season code from it,
      3. add per-species diameter statistics,
      4. add the 0/1 species-group flags,
      5. replace every missing value with the string 'other',
      6. binarise ``problems``: 0 where it was missing, 1 otherwise.

    The input frame is not modified; a new DataFrame is returned.
    """
    unused_columns = [
        'spc_latin', 'nta', 'nta_name', 'boroname',
        'borocode', 'cb_num', 'st_senate', 'zip_city',
    ]
    frame = data.drop(columns=unused_columns)

    # Month as text (kept in 'created_at'), as an integer, and as a season code.
    month_text = frame['created_at'].map(lambda stamp: stamp[5:7])
    frame['created_at'] = month_text
    frame['created_at_int'] = month_text.map(int)
    frame['season'] = month_text.map(get_season_from_month)

    # Per-species diameter statistics first, then the species-group flags.
    frame = flag_specific_species(calculate_diameter_statistics(frame))

    # Missing values become the literal 'other' in every column.
    frame = frame.fillna('other')

    # After the fill, 'other' in 'problems' means "no problem recorded".
    frame['problems'] = frame['problems'].ne('other').astype(int)

    return frame

def get_season_from_month(month):
    """
    Return the season code for a month.

    ``month`` may be anything ``int()`` accepts (e.g. 3 or '03').
    Codes: 1 = spring (3-5), 2 = summer (6-8), 3 = autumn (9-11),
    4 = winter (every other value).
    """
    season_by_month = {
        3: 1, 4: 1, 5: 1,      # spring
        6: 2, 7: 2, 8: 2,      # summer
        9: 3, 10: 3, 11: 3,    # autumn
    }
    # Anything not listed above falls through to winter.
    return season_by_month.get(int(month), 4)

def calculate_diameter_statistics(data):
    """
    Add per-species trunk-diameter statistics to ``data`` (modified in place).

    New columns:
      - ``avg_diameter_species``: mean ``tree_dbh`` of the row's ``spc_common`` group
      - ``diameter_difference``: that species mean minus the row's own ``tree_dbh``
      - ``diameter_range_species``: max minus min ``tree_dbh`` within the species

    Returns the same DataFrame object that was passed in.
    """
    species = data['spc_common']
    dbh_by_species = data.groupby('spc_common')['tree_dbh']

    # One value per species, looked up per row below.
    species_mean = dbh_by_species.mean()
    species_spread = dbh_by_species.max() - dbh_by_species.min()

    data['avg_diameter_species'] = species.map(species_mean)
    data['diameter_difference'] = data['avg_diameter_species'] - data['tree_dbh']
    data['diameter_range_species'] = species.map(species_spread)

    return data

def flag_specific_species(data, species_groups=None):
    """
    Add one 0/1 flag column per species group to ``data`` (modified in place).

    Args:
        data: DataFrame with an ``spc_common`` column.
        species_groups: Mapping of flag-column name -> list of species names.
            Defaults to the module-level ``spc_dict`` when omitted, so existing
            callers behave exactly as before.

    Returns:
        The same DataFrame object, with a column per group that is 1 where the
        row's ``spc_common`` is in that group's list and 0 otherwise
        (missing species values are flagged 0).
    """
    if species_groups is None:
        # Resolved at call time so the default follows the module-level dict.
        species_groups = spc_dict

    species = data['spc_common']
    for group_name, members in species_groups.items():
        data[f'{group_name}'] = species.isin(members).astype(int)
    return data

def prepare_datasets(train, test):
    """
    Preprocess the train and test tables and build aligned feature matrices.

    Args:
        train: Raw training DataFrame; must contain the ``health`` label column.
        test: Raw test DataFrame (no ``health`` column).

    Returns:
        Tuple ``(X_train, train_labels, X_test)``:
          - ``X_train``: one-hot encoded training features with a 0..n-1 index,
            plus a ``class_weight`` column holding the per-row sample weight.
          - ``train_labels``: the ``health`` column of the preprocessed training
            data (keeps the original training index).
          - ``X_test``: one-hot encoded test features with a 0..m-1 index and the
            same columns as ``X_train`` minus ``class_weight``.

    Raises:
        KeyError: if a label is not one of the keys of the class-weight table.
    """
    train_processed = preprocess_data(train)
    test_processed = preprocess_data(test)

    train_features = train_processed.drop(columns=['health'])
    train_labels = train_processed['health']
    n_train = len(train_features)

    # One-hot encode train and test together so both end up with identical columns.
    combined_features = pd.get_dummies(pd.concat([train_features, test_processed]))

    # Split back by position: the first n_train rows are the training rows.
    X_train = combined_features.iloc[:n_train, :].reset_index(drop=True)
    X_test = combined_features.iloc[n_train:, :].reset_index(drop=True)

    # Per-class sample weights (label -> weight).
    class_weights = {1: 1, 0: 3.901, 2: 16}

    # Assign positionally (plain list), NOT as a Series: X_train was re-indexed to
    # 0..n-1 while train_labels still carries the original training index, and a
    # Series assignment would align on index labels and silently produce NaN or
    # mismatched weights whenever that index is not already 0..n-1.
    X_train['class_weight'] = [class_weights[label] for label in train_labels]

    return X_train, train_labels, X_test

# Build the training features (including the class_weight column), training labels, and test features
X, y, X_test = prepare_datasets(train, test)

## __学習__



In [None]:
def train_model(X_train, y_train, X_valid, y_valid, params):
    """
    Fit one LightGBM booster on a train/validation split.

    Both feature frames must carry a ``class_weight`` column; it is removed from
    the features and passed to LightGBM as the per-row sample weight.

    Training runs for at most ``NUM_BOOST_ROUND`` rounds with early stopping
    after ``STOPPING_ROUNDS`` rounds (both module-level constants).

    Returns:
        ``(model, result)`` where ``result`` is the dict filled by
        ``record_evaluation`` for the 'Train' and 'Valid' sets.
    """
    weight_column = "class_weight"

    train_set = lgb.Dataset(
        X_train.drop(columns=weight_column),
        label=y_train,
        weight=X_train[weight_column],
    )
    valid_set = lgb.Dataset(
        X_valid.drop(columns=weight_column),
        label=y_valid,
        weight=X_valid[weight_column],
        reference=train_set,
    )

    eval_history = {}
    callbacks = [
        lgb.early_stopping(stopping_rounds=STOPPING_ROUNDS, verbose=False),
        lgb.callback.record_evaluation(eval_history),
    ]

    booster = lgb.train(
        params=params,
        train_set=train_set,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[train_set, valid_set],
        valid_names=['Train', 'Valid'],
        callbacks=callbacks,
    )

    return booster, eval_history

def train_and_evaluate(X, y, random_seeds):
    """
    Train one LightGBM model per (seed, fold) pair and report macro-F1.

    For every seed, a shuffled ``StratifiedKFold`` with ``FOLDS`` splits is run;
    each fold trains a model via ``train_model`` and scores it on the held-out
    part with macro-averaged F1, printing the score and a classification report.

    Args:
        X: Feature frame including the ``class_weight`` column.
        y: Label Series, positionally aligned with ``X``.
        random_seeds: Iterable of seeds; each is used for both the LightGBM
            ``seed`` parameter and the fold shuffling.

    Returns:
        ``(models, results, cv_score)``:
          - ``models``: trained boosters, ordered seed by seed, fold by fold.
          - ``results``: the matching evaluation-history dicts.
          - ``cv_score``: mean macro-F1 over every fold of every seed, or NaN
            when ``random_seeds`` is empty.
    """
    valid_scores, results, models = [], [], []

    # Bound up front so an empty `random_seeds` returns NaN instead of raising
    # UnboundLocalError at the return statement.
    cv_score = float('nan')

    # Hyper-parameters shared by every run; only the seed changes per iteration.
    base_params = {
        'objective': 'multiclass',
        'num_class': 3,
        'boosting': 'gbdt',
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'verbose': -1,
        'max_depth': 5,
        'num_leaves': 20,
    }

    for random_seed in random_seeds:
        params = {**base_params, 'seed': random_seed}

        kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=random_seed)
        for fold, (train_indices, valid_indices) in enumerate(kf.split(X, y)):
            # Copies keep the fold frames independent of X.
            X_train, X_valid = X.iloc[train_indices].copy(), X.iloc[valid_indices].copy()
            y_train, y_valid = y.iloc[train_indices], y.iloc[valid_indices]

            model, result = train_model(X_train, y_train, X_valid, y_valid, params)

            # predict() returns per-class probabilities; take the most likely class.
            y_valid_proba = model.predict(X_valid.drop("class_weight", axis=1))
            y_valid_pred = np.argmax(y_valid_proba, axis=1)
            score = f1_score(y_valid, y_valid_pred, average='macro')
            valid_scores.append(score)
            print(f'Fold: {fold + 1}  macrof1: {score}')

            report = classification_report(y_valid, y_valid_pred)
            print(report)

            results.append(result)
            models.append(model)

        # NOTE: valid_scores is never reset, so this is the running mean over all
        # folds of all seeds processed so far, not the mean of this seed alone.
        cv_score = np.mean(valid_scores)
        print(f'CV: {cv_score}')

    return models, results, cv_score

# Run configuration
SEED = 42  # NOTE(review): not referenced anywhere in the visible code — confirm before removing
FOLDS = 5  # number of StratifiedKFold splits per seed
STOPPING_ROUNDS = 10  # early-stopping patience passed to lgb.early_stopping
NUM_BOOST_ROUND = 1000  # upper bound on boosting rounds passed to lgb.train
random_seeds = [8, 42, 88, 123, 433]  # one full K-fold run is trained per seed

# Train FOLDS models for every seed and print the overall mean macro-F1
models, results, cv_score = train_and_evaluate(X, y, random_seeds)
print(cv_score)

Fold: 1  macrof1: 0.3758619074537095
              precision    recall  f1-score   support

           0       0.24      0.24      0.24       707
           1       0.80      0.80      0.80      3151
           2       0.08      0.09      0.09       139

    accuracy                           0.68      3997
   macro avg       0.37      0.38      0.38      3997
weighted avg       0.68      0.68      0.68      3997

Fold: 2  macrof1: 0.3597918655145013
              precision    recall  f1-score   support

           0       0.22      0.23      0.23       707
           1       0.80      0.79      0.80      3150
           2       0.05      0.06      0.06       140

    accuracy                           0.66      3997
   macro avg       0.36      0.36      0.36      3997
weighted avg       0.67      0.66      0.67      3997

Fold: 3  macrof1: 0.3707919268543305
              precision    recall  f1-score   support

           0       0.24      0.23      0.23       707
           1      

In [None]:
# 特徴量の重要度
importance = pd.DataFrame(np.mean([models[i].feature_importance(importance_type='gain') for i in range(FOLDS)], axis=0))
importance.index = X.drop("class_weight", axis=1).columns
importance.columns = ['importance']
importance = importance.sort_values('importance', ascending=False)
importance[0:50]

Unnamed: 0,importance
boro_ct,5825.406812
tree_dbh,4705.821038
diameter_difference,4507.833843
avg_diameter_species,4093.846151
st_assem,3878.649927
diameter_range_species,3876.479859
steward_1or2,3307.542406
cncldist,2738.612903
created_at_int,2666.036326
uneven,1192.433064


## __予測__

In [None]:
def generate_predictions(models, X_test, test_indices):
    """
    Predict with every model and combine the results by majority vote.

    Each model's ``predict(X_test)`` output is treated as a 2-D array of
    per-class scores; its row-wise argmax is that model's vote. The final
    prediction for a row is the class with the most votes, and ties go to the
    smallest class label (the same result as taking the first mode).

    Args:
    - models: Non-empty list of trained models exposing ``predict``.
    - X_test: Test features, passed unchanged to each model.
    - test_indices: Index values for the test dataset (one per test row).

    Returns:
    - DataFrame with columns ``id`` (the given indices) and ``pred``
      (the voted class label).

    Raises:
    - ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("generate_predictions requires at least one model")

    # votes[i, j] = class chosen by model j for test row i.
    votes = np.column_stack(
        [np.argmax(model.predict(X_test), axis=1) for model in models]
    )

    if votes.shape[0] == 0:
        # No test rows: nothing to vote on.
        final_prediction = np.zeros(0, dtype=int)
    else:
        # Count votes per class in one vectorized pass instead of a Python-level
        # call per row. argmax returns the first maximum, i.e. the smallest
        # class label among tied classes.
        n_classes = int(votes.max()) + 1
        vote_counts = np.stack(
            [(votes == label).sum(axis=1) for label in range(n_classes)],
            axis=1,
        )
        final_prediction = vote_counts.argmax(axis=1)

    return pd.DataFrame({'id': test_indices, 'pred': final_prediction})


# Predict on the test set with all trained models and write the submission CSV.
# header=None / index=None are falsy, so the file has no header row and no index column.
df = generate_predictions(models, X_test, test.index)
output_file = DIR_NAME + '../output/submission.csv'
df.to_csv(output_file, header=None, index=None)

In [None]:
# Show how many test rows were assigned to each predicted class
df['pred'].value_counts()

1    15593
0     3294
2      815
Name: pred, dtype: int64