In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Seed NumPy's global random generator so repeated runs start from the same state
np.random.seed(322)

# Read the training and test splits; the 'dt' column is parsed as datetimes in both
train, test = (
    pd.read_csv(csv_path, parse_dates=['dt'])
    for csv_path in ('train.csv', 'test.csv')
)

# NOTE: the submission step further down reads a 'row_id' column from `test`
# Put the training rows in chronological order (index labels are left as they were)
train = train.sort_values('dt')

# Column groups used throughout the script.

# The two columns the models are trained to predict
targets = ['price_p05', 'price_p95']

# Weather measurements; also fed to PCA as a group
weather_features = ['precpt', 'avg_temperature', 'avg_humidity', 'avg_wind_level']

# Numeric inputs: calendar/store counts followed by the weather measurements.
# Concatenation builds a new list, so later in-place growth of num_features
# does not touch weather_features.
num_features = ['day_of_month', 'week_of_year', 'n_stores'] + weather_features

# Inputs treated as categorical
cat_features = [
    'product_id',
    'management_group_id',
    'first_category_id',
    'second_category_id',
    'third_category_id',
    'dow',
    'month',
    'holiday_flag',
    'activity_flag',
]

# Outlier removal: fit an isolation forest on the numeric features together with
# both targets, then keep only the rows it labels 1 (rows labelled otherwise are dropped).
iso = IsolationForest(random_state=322)
inlier_labels = iso.fit_predict(train[num_features + targets])
# .copy() gives an independent frame so later column assignments write to it directly
train = train.loc[inlier_labels == 1].copy()

# Reduce the weather columns to two principal components.
# The projection is fitted on the training rows only and then applied as-is to the test rows.
pca = PCA(n_components=2, random_state=322)
weather_pca = pca.fit_transform(train[weather_features])
weather_pca_test = pca.transform(test[weather_features])

# Store component i of each projection under the matching column name in both frames
for component_idx, component_name in enumerate(['pca1', 'pca2']):
    train[component_name] = weather_pca[:, component_idx]
    test[component_name] = weather_pca_test[:, component_idx]

# Register the new columns as numeric features (grows the existing list in place)
num_features.extend(['pca1', 'pca2'])

# Derive a 'cluster' feature: fit 10-cluster KMeans on the training rows' numeric
# features (which at this point include the PCA components), label the training rows
# with the fitted assignments, and assign test rows with the same fitted model.
kmeans = KMeans(n_clusters=10, random_state=322)
kmeans.fit(train[num_features])
train['cluster'] = kmeans.labels_
test['cluster'] = kmeans.predict(test[num_features])

# The cluster id is handled as a categorical input from here on
cat_features.append('cluster')

# Prepare data: feature frames for train/test plus one target series per model.
feature_columns = cat_features + num_features

# Convert categoricals to category type for LightGBM.
# astype() with a column->dtype mapping returns a brand-new frame, so the conversion
# never assigns into a selection taken from `train`/`test` (the per-column assignment
# on such a selection is what triggers pandas' SettingWithCopyWarning).
# Categories are inferred independently for the train and test frames.
category_dtypes = {col: 'category' for col in cat_features}
X = train[feature_columns].astype(category_dtypes)
X_test = test[feature_columns].astype(category_dtypes)

# Targets, taken from the same (outlier-filtered) training frame as X
y1 = train['price_p05']
y2 = train['price_p95']

# Training configuration shared by both LightGBM models
params = dict(
    objective='regression',
    metric='rmse',
    num_leaves=31,
    learning_rate=0.1,
    random_state=322,  # same seed value used elsewhere in the script
    n_jobs=-1,
    verbose=-1,
)

def _fit_booster(label):
    """Wrap the shared feature frame and `label` in a LightGBM Dataset and train on it.

    Uses the module-level `X`, `cat_features` and `params`, and runs 500 boosting rounds.
    Returns the (Dataset, trained model) pair.
    """
    dataset = lgb.Dataset(X, label, categorical_feature=cat_features)
    return dataset, lgb.train(params, dataset, num_boost_round=500)


# Model for price_p05
dtrain1, model1 = _fit_booster(y1)

# Model for price_p95
dtrain2, model2 = _fit_booster(y2)

# Score the test features with each model, in the same order they were trained
pred1, pred2 = (model.predict(X_test) for model in (model1, model2))

# Assemble the submission table: the test row ids next to the two predicted columns
submission = pd.DataFrame(
    dict(row_id=test['row_id'], price_p05=pred1, price_p95=pred2)
)

# Write it without the index column
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'