In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing import image

# Load our data and do a little cleaning
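Load the PlantTraits2024 train and test CSV files.
For each target column in turn, drop the rows whose value is at or above the 98th percentile (upper-tail outliers).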

In [None]:
train = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')

# The '_sd' columns are not used for now, so drop them from the training frame.
sd_columns = [name for name in train.columns if name.endswith('_sd')]
train = train.drop(columns=sd_columns)

# Target columns.
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Trim the upper tail of every target: keep only rows strictly below the
# 98th percentile. The filters run one after another, so each percentile is
# computed on the rows that survived the previous targets' filters.
for target in mean_columns:
    cutoff = train[target].quantile(0.98)
    train = train.loc[train[target] < cutoff]

# Load EfficientNetB3 pretrained on ImageNet

In [None]:
# EfficientNetB3 backbone with ImageNet weights: no classification head
# (include_top=False) and global average pooling on the output (pooling='avg').
image_model = EfficientNetB3(include_top=False, weights='imagenet', pooling='avg')

# Side length, in pixels, that images are resized to before being fed to the backbone.
image_model_x = image_model_y = 300

# Functions to use EfficientNet to extract image features into tabular data
## Images are turned into 1536 columns of tabular data (the pooled output width of EfficientNetB3)!

In [None]:
# Build the TensorFlow input pipeline that feeds images to the backbone
def create_dataset(image_paths, batch_size=128):
    """Return a batched, prefetched ``tf.data.Dataset`` of preprocessed images.

    Args:
        image_paths: Sequence of JPEG file paths, one per image.
        batch_size: Number of images per batch.

    Returns:
        A dataset yielding batches of images that were decoded to 3 channels,
        resized to ``[image_model_x, image_model_y]`` and passed through
        ``preprocess_input``.
    """
    # Change this to whatever input resolution the chosen backbone expects.
    target_size = [image_model_x, image_model_y]

    def process_path(file_path):
        raw_bytes = tf.io.read_file(file_path)
        decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
        resized = tf.image.resize(decoded, target_size)
        return preprocess_input(resized)

    return (
        tf.data.Dataset.from_tensor_slices(image_paths)
        .map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

def extract_features_with_dataset(dataset, df):
    """Run every batch of ``dataset`` through ``image_model`` and append the outputs to ``df``.

    Args:
        dataset: Iterable of image batches accepted by ``image_model.predict``
            (e.g. the ``tf.data.Dataset`` built by ``create_dataset``).
        df: DataFrame with one row per image, in the same order as ``dataset``.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by one
        ``feature_<i>`` column per value in the model output.

    Raises:
        ValueError: If ``dataset`` yields no batches, or if the number of
            extracted feature rows differs from ``len(df)``.
    """
    batch_features = []
    for batch_imgs in dataset:
        print(".", end="")  # Print progress: one dot per batch
        batch_features.append(image_model.predict(batch_imgs, verbose=0))

    if not batch_features:
        raise ValueError("dataset yielded no batches; there are no features to extract")

    # Stack the per-batch outputs into a single (n_images, n_features) array.
    features_array = np.concatenate(batch_features, axis=0)

    # A row-count mismatch would otherwise be silently padded with NaN by pd.concat.
    if len(features_array) != len(df):
        raise ValueError(
            f"extracted {len(features_array)} feature rows but df has {len(df)} rows"
        )

    features_df = pd.DataFrame(
        features_array,
        columns=[f'feature_{i}' for i in range(features_array.shape[1])],
    )

    # Reset df's index so rows line up positionally with the feature rows.
    return pd.concat([df.reset_index(drop=True), features_df], axis=1)

# Extract image data for train

In [None]:
train_image_folder = '/kaggle/input/planttraits2024/train_images'

# One '<id>.jpeg' path per training row, in row order.
image_paths = [
    os.path.join(train_image_folder, f"{sample_id}.jpeg")
    for sample_id in train['id']
]

# Batched, preprocessed image pipeline over those paths.
image_dataset = create_dataset(image_paths)

# Append the extracted image features to the tabular training data as extra columns.
train = extract_features_with_dataset(image_dataset, train)

print(train.head())

# Cross validate / train on tabular data

In [None]:
import joblib

In [None]:

# Run cross-validation scoring before the final fit (this is relatively slow).
do_cv = True

# Directory with previously saved models, expected as '<target>_model.pkl'.
pretrained_model_dir = "/kaggle/input/x4_mean_model/tensorflow2/x4_mean_model/1"

# Features are every non-target column of `train` (this still includes 'id').
X_full = train.drop(columns=mean_columns)
# Drop repeated column names (keeps the first occurrence of each).
X_full = X_full.loc[:,~X_full.columns.duplicated()]
Y_full = train[mean_columns]

# Load a saved model for each target that has one. The path is built per
# target so that a target never receives another target's model; targets
# without a saved file fall through to a freshly created regressor below.
# NOTE(review): joblib.load unpickles arbitrary code - only load trusted files.
loaded_models = {}
for column in Y_full.columns:
    model_path = os.path.join(pretrained_model_dir, f"{column}_model.pkl")
    if os.path.exists(model_path):
        loaded_models[column] = joblib.load(model_path)

models = {}

for column in Y_full.columns:
    if column in loaded_models:
        # If the model is loaded, use the loaded model
        model = loaded_models[column]
        print(f"Continuing training for {column}...")
    else:
        # Otherwise, create a new model
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.05, max_depth=10)
        print(f"Training new model for {column}...")

    if do_cv:
        print(f"\nDoing cross-validation scoring for {column}...")
        scores = cross_val_score(model, X_full, Y_full[column],
                                 cv=KFold(n_splits=3, shuffle=True, random_state=42),
                                 scoring='r2')
        print(f"R^2 score for {column}: {np.mean(scores)}")

    # Train the model on all data.
    # NOTE(review): fit() is called without `xgb_model`, so a loaded model is
    # refit from scratch with its stored hyperparameters rather than continued.
    print(f"Training model for {column}...")
    model.fit(X_full, Y_full[column])
    models[column] = model

    # Checkpoint this target's model as soon as it is trained.
    joblib.dump(model, f"{column}_model.pkl")

# Save the complete set of models once every target has been trained.
for column, model in models.items():
    joblib.dump(model, f"{column}_all_model.pkl")

In [None]:
# #do cross-validation testing (this is relatively slow)
# do_cv = True

# X_full = train.drop(columns=mean_columns)
# X_full = X_full.loc[:,~X_full.columns.duplicated()]
# Y_full = train[mean_columns]

# models = {}

# for column in Y_full.columns:

#     model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.05, max_depth=10)

#     if do_cv: 
#         print(f"\nDoing cross-validation scoring for {column}...")
#         scores = cross_val_score(model, X_full, Y_full[column],
#                                  cv=KFold(n_splits=3, shuffle=True, random_state=42),
#                                  scoring='r2')        
#         print(f"R^2 score for {column}: {np.mean(scores)}")
    
#     #train model with all data
#     print(f"Training model for {column}...")
#     model.fit(X_full, Y_full[column])
#     models[column] = model
    
#     # save model
#     for column, model in models.items():
#         joblib.dump(model, f"{column}_model.pkl")
    
# # # save model
# # for column, model in models.items():
# #     joblib.dump(model, f"{column}_model.pkl")

In [None]:
# Display the (target column, fitted model) pairs collected during training.
models.items()

In [None]:
# Bundle the files in the working directory (e.g. the saved model pickles) into x.zip.
!zip x.zip ./*

In [None]:
# Scatter plot of predicted vs. actual values for every target column.
# A well-fitting model shows the points clustered along the diagonal.
# The predictions are made on X_full, the same data the models were fitted
# on, so these plots show in-sample fit rather than generalisation.
for column in models:
    model = models[column]
    predictions = model.predict(X_full)

    plt.figure(figsize=(8, 6))
    plt.scatter(Y_full[column], predictions)
    plt.title(f"Actual vs Predicted for {column}")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()


In [None]:
# Residual plot: for every target, plot the residuals (actual minus predicted)
# against the predicted values. This shows at a glance where the model
# predicts well and where it is inaccurate.
for column in models:
    model = models[column]
    predictions = model.predict(X_full)
    residuals = Y_full[column] - predictions
    plt.figure(figsize=(8, 6))
    plt.scatter(predictions, residuals)
    plt.axhline(y=0, color='r', linestyle='-')  # zero-residual reference line
    plt.title(f"Residual Plot for {column}")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.show()

# Fill in submit DF with mean of train values by default
### Provides near-0 R2 score values for any columns we don't predict for
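By default, the submission DataFrame is filled with the mean of the training values for each target.

Predicting the mean yields an R2 score of roughly zero, so any column we do not explicitly predict still gets a neutral baseline.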

In [None]:
# Baseline submission: every target column starts out as its training-set mean.
mean_values = Y_full.mean()
submission = pd.DataFrame({'id': test['id']})
for target in Y_full.columns:
    submission[target] = mean_values[target]

# Strip the '_mean' suffix so the column names match the submission format.
submission = submission.rename(columns=lambda name: name.replace('_mean', ''))
submission.head()

# Extract image data for test

In [None]:
test_image_folder = '/kaggle/input/planttraits2024/test_images'

# One '<id>.jpeg' path per test row, in row order.
image_paths = [
    os.path.join(test_image_folder, f"{sample_id}.jpeg")
    for sample_id in test['id']
]

# Batched, preprocessed image pipeline over those paths.
image_dataset = create_dataset(image_paths)

# Append the extracted image features to the tabular test data as extra columns.
test = extract_features_with_dataset(image_dataset, test)

test.head()

# Predictions for test
## R2 scores look good for all targets - so we predict on everything...

In [None]:
# Give the models exactly the feature columns they were trained on, in the
# same order. Repeated column names are dropped first (keeping the first
# occurrence), mirroring how X_full was built.
X_test = test.loc[:, ~test.columns.duplicated()][X_full.columns]

# Overwrite the baseline value of every target with the model prediction;
# the submission column is the target name without its '_mean' suffix.
for target_column in mean_columns:
    submission[target_column.replace('_mean', '')] = models[target_column].predict(X_test)

submission.head()

# Submit!

In [None]:
submission.to_csv('submission.csv', index=False)