2024.10.15 使用Numpy2.0会报错，降级使用Numpy1.24.0可以正常使用

# Child Mind Institute | SingleLGBM

In [1]:
import numpy as np
import polars as pl
import pandas as pd
from sklearn.base import clone
from copy import deepcopy
import optuna
from scipy.optimize import minimize
import os
import matplotlib.pyplot as plt
import seaborn as sns

import re
from colorama import Fore, Style

from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import *
from sklearn.metrics import *

# Notebook-wide constants. Neither is referenced in the cells shown here; presumably the
# random seed and the number of cross-validation folds used further down (TODO confirm).
SEED = 42
n_splits = 5

  from .autonotebook import tqdm as notebook_tqdm


# Basic Process

In [2]:
%%time

def process_file(filename, dirname):
    """Summarise a single time-series parquet file as a flat vector of statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards its ``step`` column and
    flattens the ``DataFrame.describe()`` table of the remaining columns.

    Args:
        filename: Name of the entry inside ``dirname``. It must contain an ``=``;
            the piece following the first ``=`` is returned as the identifier.
        dirname: Directory that holds the per-entry sub-directories.

    Returns:
        A tuple ``(stats, identifier)`` where ``stats`` is the 1-D array obtained
        from ``describe().values.reshape(-1)`` and ``identifier`` is
        ``filename.split('=')[1]``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    frame = pd.read_parquet(parquet_path)

    # 'step' is dropped before summarising so it does not contribute statistics.
    frame = frame.drop(columns='step')

    flat_stats = frame.describe().values.reshape(-1)
    identifier = filename.split('=')[1]
    return flat_stats, identifier

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics for every entry found in ``dirname``.

    Each name returned by ``os.listdir(dirname)`` is handed to ``process_file`` on a
    thread pool; a ``tqdm`` progress bar tracks completion.

    Args:
        dirname: Directory whose entries are passed to ``process_file``.

    Returns:
        A DataFrame with columns ``Stat_0 .. Stat_{k-1}`` (``k`` is the length of the
        first entry's statistics vector) followed by an ``id`` column holding the
        identifier returned by ``process_file``. When ``dirname`` has no entries, an
        empty DataFrame containing only the ``id`` column is returned.
    """
    ids = os.listdir(dirname)  # names of every file/directory directly inside dirname

    if not ids:
        # With no results, ``stats, indexes = zip(*results)`` has nothing to unpack and
        # raises ValueError. Return an empty frame that still carries the 'id' column.
        return pd.DataFrame({'id': pd.Series(dtype='object')})

    # executor.map applies process_file to every name concurrently and yields results in
    # the order of ``ids``; the lambda binds ``dirname`` so only the name varies.
    # tqdm wraps the result iterator, with ``total`` telling it how many items to expect.
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    # results is a list of (stats, identifier) tuples; zip(*results) transposes it into
    # one tuple of statistics vectors and one tuple of identifiers.
    stats, indexes = zip(*results)

    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes

    return df

# Tabular competition files.
train = pd.read_csv('child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('child-mind-institute-problematic-internet-use/sample_submission.csv')

# One row of flattened describe() statistics per time-series entry, keyed by 'id'.
train_ts = load_time_series("child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("child-mind-institute-problematic-internet-use/series_test.parquet")

# Names of the statistic columns, i.e. every column of train_ts except the join key.
time_series_cols = list(train_ts.columns)
time_series_cols.remove("id")

# Left-join the statistics onto the tabular rows (rows without a matching time series
# keep NaN in the Stat_* columns), then discard the join key.
train = train.merge(train_ts, how="left", on='id').drop(columns='id')
test = test.merge(test_ts, how="left", on='id').drop(columns='id')

100%|██████████| 996/996 [01:23<00:00, 11.96it/s]
100%|██████████| 2/2 [00:00<00:00,  7.19it/s]

CPU times: user 5min 2s, sys: 58.1 s, total: 6min
Wall time: 1min 23s





In [3]:
# Columns of the training table that are kept for the rest of the notebook. The final
# entry, 'sii', is the column whose missing rows are dropped right after this list
# (presumably the prediction target — confirm).
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

# Append the time-series statistic column names to the end of the selected columns.
featuresCols.extend(time_series_cols)

# Keep only the selected columns, then discard rows whose 'sii' value is missing.
train = train[featuresCols].dropna(subset='sii')

# Columns that update() fills and converts to the pandas 'category' dtype.
cat_c = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]


def update(df):
    """Fill gaps in the ``cat_c`` columns with ``'Missing'`` and make them categorical.

    The frame is modified in place; the same object is returned for convenience.

    Args:
        df: DataFrame containing every column listed in ``cat_c``.

    Returns:
        ``df`` itself, with each ``cat_c`` column converted to ``category`` dtype.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df
        
# update() fills and converts the `cat_c` columns on the frame it receives and returns
# that same frame, so these assignments rebind the names to the already-modified objects.
train = update(train)
test = update(test)

def create_mapping(column, dataset):
    """Number the distinct values of ``dataset[column]`` in order of first appearance.

    Args:
        column: Name of the column to inspect.
        dataset: DataFrame that contains ``column``.

    Returns:
        Dict mapping each distinct value to its 0-based position in
        ``dataset[column].unique()``.
    """
    labels = dataset[column].unique()
    return dict(zip(labels, range(len(labels))))

"""This Mapping Works Fine For me I also Check Each Values in Train and test Using Logic. There no Data Lekage."""

# Encode each `cat_c` column as integers using ONE label -> code mapping shared by train
# and test. create_mapping numbers labels in order of first appearance, so mappings built
# separately per frame can assign different codes to the same label in train and in test.
for col in cat_c:
    # Start from the train mapping so train keeps the codes it would have had on its own.
    mapping = create_mapping(col, train)
    for value in test[col].unique():
        # Labels that occur only in test get fresh codes after the train ones.
        mapping.setdefault(value, len(mapping))

    # Look the codes up on plain object values: an exact per-element lookup that does not
    # depend on how `replace` treats categorical data. Every value present is a key of
    # `mapping`, so no NaN is produced before the integer cast.
    train[col] = train[col].astype(object).map(mapping).astype(int)
    test[col] = test[col].astype(object).map(mapping).astype(int)

print(f'Train Shape : {train.shape} || Test Shape : {test.shape}')

Train Shape : (2736, 155) || Test Shape : (20, 154)
