In [1]:
import pandas as pd
import numpy as np

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Print column names
print("Train Data Columns:", train_df.columns.tolist())
print("Test Data Columns:", test_df.columns.tolist())

Train Data Columns: ['ID', 'Year', 'Month', 'Day', 'kingdom', 'latitude', 'longitude', 'Avg_Temperature', 'Avg_Feels_Like_Temperature', 'Temperature_Range', 'Feels_Like_Temperature_Range', 'Radiation', 'Rain_Amount', 'Rain_Duration', 'Wind_Speed', 'Wind_Direction', 'Evapotranspiration']
Test Data Columns: ['ID', 'Year', 'Month', 'Day', 'kingdom']


In [2]:
import pandas as pd
import numpy as np

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Date preprocessing
base_year = 2015
for df in [train_df, test_df]:
    if df['Year'].max() < 100:
        df['Year'] = df['Year'] + base_year
    df['date'] = pd.to_datetime(df[['Year', 'Month', 'Day']], errors='coerce')
    df.drop(['Year', 'Month', 'Day'], axis=1, inplace=True)

# Temperature conversion (only for train_df)
def convert_temp(df, col):
    if col in df.columns:
        df[col + '_C'] = df[col].apply(lambda x: x - 273.15 if x > 100 else x)
    return df

train_df = convert_temp(train_df, 'Avg_Temperature')
train_df = convert_temp(train_df, 'Avg_Feels_Like_Temperature')
train_df.drop(['Avg_Temperature', 'Avg_Feels_Like_Temperature'], axis=1, inplace=True, errors='ignore')

# Fill missing values
train_df = train_df.groupby('kingdom').apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')).reset_index(drop=True)
test_df = test_df.groupby('kingdom').apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')).reset_index(drop=True)

# Sort by date and kingdom
train_df = train_df.sort_values(['kingdom', 'date'])
test_df = test_df.sort_values(['kingdom', 'date'])


  train_df = train_df.groupby('kingdom').apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')).reset_index(drop=True)
  test_df = test_df.groupby('kingdom').apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')).reset_index(drop=True)


In [3]:
# Add basic features
for df in [train_df, test_df]:
    df['month'] = df['date'].dt.month
    df['day_of_year'] = df['date'].dt.dayofyear

# Define targets (only present in train_df)
targets = ['Avg_Temperature_C', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']

# Lags and moving averages for train_df only
for target in targets:
    if target in train_df.columns:
        train_df[f'{target}_lag1'] = train_df.groupby('kingdom')[target].shift(1)
        train_df[f'{target}_ma7'] = train_df.groupby('kingdom')[target].rolling(7, min_periods=1).mean().reset_index(drop=True)

# Wind Direction as sine/cosine (only for train_df)
if 'Wind_Direction' in train_df.columns:
    train_df['wind_dir_sin'] = np.sin(np.radians(train_df['Wind_Direction']))
    train_df['wind_dir_cos'] = np.cos(np.radians(train_df['Wind_Direction']))

# Carry last values to test_df (for features, not targets)
last_values = train_df.groupby('kingdom').last()
for col in ['latitude', 'longitude'] + [f'{t}_lag1' for t in targets] + [f'{t}_ma7' for t in targets]:
    if col not in test_df.columns and col in train_df.columns:
        test_df[col] = test_df['kingdom'].map(last_values[col])

# Check columns again
print("Train Data Columns After Preprocessing:", train_df.columns.tolist())
print("Test Data Columns After Preprocessing:", test_df.columns.tolist())

Train Data Columns After Preprocessing: ['ID', 'kingdom', 'latitude', 'longitude', 'Temperature_Range', 'Feels_Like_Temperature_Range', 'Radiation', 'Rain_Amount', 'Rain_Duration', 'Wind_Speed', 'Wind_Direction', 'Evapotranspiration', 'date', 'Avg_Temperature_C', 'Avg_Feels_Like_Temperature_C', 'month', 'day_of_year', 'Avg_Temperature_C_lag1', 'Avg_Temperature_C_ma7', 'Radiation_lag1', 'Radiation_ma7', 'Rain_Amount_lag1', 'Rain_Amount_ma7', 'Wind_Speed_lag1', 'Wind_Speed_ma7', 'Wind_Direction_lag1', 'Wind_Direction_ma7', 'wind_dir_sin', 'wind_dir_cos']
Test Data Columns After Preprocessing: ['ID', 'kingdom', 'date', 'month', 'day_of_year', 'latitude', 'longitude', 'Avg_Temperature_C_lag1', 'Radiation_lag1', 'Rain_Amount_lag1', 'Wind_Speed_lag1', 'Wind_Direction_lag1', 'Avg_Temperature_C_ma7', 'Radiation_ma7', 'Rain_Amount_ma7', 'Wind_Speed_ma7', 'Wind_Direction_ma7']


In [4]:
from prophet import Prophet

submission = test_df[['ID']].copy()
for target in targets:
    preds = []
    for kingdom in train_df['kingdom'].unique():
        train_series = train_df[train_df['kingdom'] == kingdom][['date', target]].rename(columns={'date': 'ds', target: 'y'})
        test_dates = test_df[test_df['kingdom'] == kingdom][['date']].rename(columns={'date': 'ds'})
        model = Prophet(yearly_seasonality=True, changepoint_prior_scale=0.05)  # Tuned parameter
        model.fit(train_series)
        pred = model.predict(test_dates)['yhat']
        preds.extend(pred)
    submission[target.replace('_C', '')] = preds

submission.to_csv('submission_prophet.csv', index=False)

17:45:50 - cmdstanpy - INFO - Chain [1] start processing
17:45:51 - cmdstanpy - INFO - Chain [1] done processing
17:45:51 - cmdstanpy - INFO - Chain [1] start processing
17:45:51 - cmdstanpy - INFO - Chain [1] done processing
17:45:51 - cmdstanpy - INFO - Chain [1] start processing
17:45:52 - cmdstanpy - INFO - Chain [1] done processing
17:45:52 - cmdstanpy - INFO - Chain [1] start processing
17:45:52 - cmdstanpy - INFO - Chain [1] done processing
17:45:52 - cmdstanpy - INFO - Chain [1] start processing
17:45:53 - cmdstanpy - INFO - Chain [1] done processing
17:45:53 - cmdstanpy - INFO - Chain [1] start processing
17:45:53 - cmdstanpy - INFO - Chain [1] done processing
17:45:53 - cmdstanpy - INFO - Chain [1] start processing
17:45:54 - cmdstanpy - INFO - Chain [1] done processing
17:45:54 - cmdstanpy - INFO - Chain [1] start processing
17:45:54 - cmdstanpy - INFO - Chain [1] done processing
17:45:54 - cmdstanpy - INFO - Chain [1] start processing
17:45:55 - cmdstanpy - INFO - Chain [1]