In [246]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import lightgbm as lgb
import pickle

In [247]:
# Load the raw training and test tables from CSV files in the working directory
# (Train.csv is read first, then Test.csv).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./Train.csv', './Test.csv'))

In [248]:
# Remove the stand-alone Place_ID and Date columns, but only when BOTH are
# present; if either one is missing the frame is left untouched.
if {'Place_ID', 'Date'}.issubset(train_df.columns):
    train_df = train_df.drop(columns=['Place_ID', 'Date'])

In [249]:
# Break the combined 'Place_ID X Date' key into separate Place_ID and Date
# columns placed where the combined column used to be. The guard makes the
# cell a no-op once the combined column has been removed.
if 'Place_ID X Date' in train_df.columns:
    # Remember where the combined column sits so the new columns can take its place.
    idx = train_df.columns.get_loc('Place_ID X Date')
    train_df[['Place_ID', 'Date']] = train_df['Place_ID X Date'].str.split(' X ', expand=True)
    train_df = train_df.drop(columns='Place_ID X Date')
    # Insert Date first and Place_ID second at the same position, so the final
    # order is: Place_ID, Date, <remaining columns>.
    train_df.insert(idx, 'Date', train_df.pop('Date'))
    train_df.insert(idx, 'Place_ID', train_df.pop('Place_ID'))
    # Parse the date text and store it back as an MM/DD/YY string.
    train_df['Date'] = pd.to_datetime(train_df['Date']).dt.strftime('%m/%d/%y')

In [250]:
# Impute missing values in the training frame.
# Step 1: within each Place_ID group, replace NaNs in every numeric column
#         with that group's median for the column.
# Step 2: forward-fill anything still missing from the preceding row. This is
#         done on the whole frame (not per place), so a value can be carried
#         over from the previous place's last row; NaNs in the very first
#         row(s) have nothing to copy from and stay NaN.
place_groups = train_df.groupby('Place_ID')
numeric_cols = train_df.select_dtypes(include=np.number).columns
train_df[numeric_cols] = place_groups[numeric_cols].transform(lambda x: x.fillna(x.median()))
# DataFrame.ffill() replaces fillna(method='ffill'): the `method` argument of
# fillna is deprecated in recent pandas releases. The result is the same.
train_df = train_df.ffill()

In [251]:
# Regression target.
y_train = train_df['target']

# Feature matrix: everything except the identifier columns and the
# target-derived columns.
X_train = train_df.drop(
    columns=[
        'Place_ID',
        'Date',
        'target',
        'target_min',
        'target_max',
        'target_variance',
        'target_count',
    ]
)

In [252]:
# Univariate feature scoring with f_regression. With k='all' every column is
# kept, so this step only scores the features and converts the frame into the
# selector's output array.
selector = SelectKBest(score_func=f_regression, k='all')
X_train_selected = selector.fit(X_train, y_train).transform(X_train)

# Boolean mask of retained columns and the matching column names.
mask = selector.get_support()
X_train_selected_names = X_train.columns[mask]

In [253]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_trains, X_tests, y_trains, y_tests = train_test_split(
    X_train_selected,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [254]:
# Wrap the split arrays in LightGBM Dataset objects. The validation set is
# created with the training set as its reference.
lgb_trains = lgb.Dataset(X_trains, label=y_trains)
lgb_evals = lgb.Dataset(X_tests, label=y_tests, reference=lgb_trains)


In [255]:
# LightGBM training parameters (DART-boosted regression evaluated with RMSE).
#
# NOTE(review): per the LightGBM parameter docs, `num_iterations` is an alias
# of `num_boost_round`; a value set here appears to take precedence over the
# `num_boost_round` argument passed to lgb.train -- confirm which one is meant.
# NOTE(review): LightGBM reports early stopping as unavailable in dart mode,
# so training would run for the full iteration count -- confirm this is intended.
params = dict(
    # Task definition
    boosting_type='dart',
    objective='regression',
    metric='rmse',
    # Learning schedule
    learning_rate=0.01,
    num_iterations=50000,
    # Tree shape
    num_leaves=50,
    max_depth=7,
    min_data_in_leaf=5,
    # Column / row subsampling
    feature_fraction=0.7,
    bagging_fraction=0.8,
    bagging_freq=5,
    # DART dropout settings
    drop_rate=0.1,
    max_drop=50,
    skip_drop=0.5,
    # Regularisation
    lambda_l1=1,
    lambda_l2=1,
    # Logging and reproducibility
    verbose=-1,
    random_seed=42,
)

In [256]:
# params = {
#     'objective': 'regression',
#     'metric': 'rmse',
#     'boosting_type': 'dart',
#     'learning_rate': 0.1,
#     'num_iterations': 30000,
#     'num_leaves': 50,
#     'max_depth': 7,
#     'min_data_in_leaf': 5,
#     'feature_fraction': 0.7,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'lambda_l1': 1,
#     'lambda_l2': 1,
#     'verbose': -1,
#     'random_seed': 42,
#     'device': 'gpu'
# }

In [257]:
# Train the booster on the training split and evaluate on the hold-out split.
#
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4.0, while the callbacks work on both older (>= 3.3)
# and newer releases.
#
# NOTE(review): `params` also sets `num_iterations`; LightGBM appears to use
# that value instead of `num_boost_round` below -- confirm which is intended.
# NOTE(review): with boosting_type='dart' LightGBM reports that early stopping
# is not available, so the early_stopping callback may have no effect.
model = lgb.train(
    params,
    train_set=lgb_trains,
    num_boost_round=10000,
    valid_sets=[lgb_evals],
    callbacks=[
        lgb.early_stopping(stopping_rounds=500),  # stop after 500 rounds without improvement
        lgb.log_evaluation(period=500),           # print the validation metric every 500 rounds
    ],
)




[500]	valid_0's rmse: 39.9067
[1000]	valid_0's rmse: 32.0701
[1500]	valid_0's rmse: 29.9645
[2000]	valid_0's rmse: 28.5775
[2500]	valid_0's rmse: 28.0645
[3000]	valid_0's rmse: 27.4797


KeyboardInterrupt: 

In [None]:
# Impute missing values in the test frame, mirroring the training imputation.
# Step 1: within each Place_ID group, replace NaNs in every numeric column
#         with that group's median for the column.
# Step 2: forward-fill anything still missing from the preceding row. This is
#         done on the whole frame (not per place); NaNs in the very first
#         row(s) have nothing to copy from and stay NaN.
place_groups = test_df.groupby('Place_ID')
numeric_cols = test_df.select_dtypes(include=np.number).columns
test_df[numeric_cols] = place_groups[numeric_cols].transform(lambda x: x.fillna(x.median()))
# DataFrame.ffill() replaces fillna(method='ffill'): the `method` argument of
# fillna is deprecated in recent pandas releases. The result is the same.
test_df = test_df.ffill()

In [None]:
# Test feature matrix: drop the identifier columns. test_df itself keeps
# them, and 'Place_ID X Date' is still needed for the submission file.
X_test = test_df.drop(columns=['Place_ID X Date', 'Place_ID', 'Date'])

In [None]:
# Predict on the test set using exactly the columns retained by the feature
# selector, in the training column order. The model was fitted on a positional
# array built from those columns, so selecting them by name here keeps the
# test features aligned with training even if the test file orders its
# columns differently or the selector's `k` is later changed from 'all'.
test_pred = model.predict(X_test[X_train_selected_names])

In [None]:
# Assemble the submission table (row key + predicted target) and write it to
# predictions.csv without the index column.
output_df = test_df[['Place_ID X Date']].assign(target=test_pred)
output_df.to_csv('predictions.csv', index=False)