In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the data
data = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv')

# Convert 'date' column to datetime
data['date'] = pd.to_datetime(data['date'])

# Ensure 'city_id' is treated as a category
data['city_id'] = data['city_id'].astype('category')

# Handle missing values
data = data.dropna(subset=['avg_temp_c', 'min_temp_c', 'max_temp_c'])

# IQR for avg_temp_c
Q1 = data['avg_temp_c'].quantile(0.25)
Q3 = data['avg_temp_c'].quantile(0.75)
IQR = Q3 - Q1
l_bound = Q1 - 1.5 * IQR
u_bound = Q3 + 1.5 * IQR
data = data[data['avg_temp_c'].between(l_bound, u_bound)]

# IQR for max_temp_c
Q1 = data['max_temp_c'].quantile(0.25)
Q3 = data['max_temp_c'].quantile(0.75)
IQR = Q3 - Q1
l_bound = Q1 - 1.5 * IQR
u_bound = Q3 + 1.5 * IQR
data = data[data['max_temp_c'].between(l_bound, u_bound)]

# IQR for min_temp_c
Q1 = data['min_temp_c'].quantile(0.25)
Q3 = data['min_temp_c'].quantile(0.75)
IQR = Q3 - Q1
l_bound = Q1 - 1.5 * IQR
u_bound = Q3 + 1.5 * IQR
data = data[data['min_temp_c'].between(l_bound, u_bound)]

# Log transformation
data['log_transformed_at'] = np.log1p(data['avg_temp_c'])

data = data[np.isfinite(data['log_transformed_at'])]
data = data.dropna(subset=['log_transformed_at'])

# Feature Engineering
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['day_of_week'] = data['date'].dt.dayofweek

# Create lag features for avg_temp_c
for lag in range(1, 8):
    data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
    
data = data.dropna(subset=[f'avg_temp_lag_{i}' for i in range(1, 8)])

# Extract unique city IDs before one-hot encoding
unique_city_ids = data['city_id'].cat.categories[:100]  

# One-hot encode 'city_id'
data = pd.get_dummies(data, columns=['city_id'], drop_first=False)

# Train-test split
train_data = data[data['date'] < '2018-12-25']

# Initialize and train the model
X_train = train_data.drop(['date', 'avg_temp_c', 'precipitation_mm', 'log_transformed_at', 'snow_depth_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh'], axis=1)
y_train = train_data['log_transformed_at']

model = LinearRegression()
model.fit(X_train, y_train)

# Predict avg_temp_c for the first week of 2019 for the first 8 cities
first_week_2019_dates = pd.date_range(start='2019-01-01', end='2019-01-07')

# Create a DataFrame to store predictions
all_predictions = []

# Define the columns for one-hot encoded city IDs
city_columns = [col for col in data.columns if col.startswith('city_id_')]

for city_id in unique_city_ids:
    # Filter last available week for the given city_id
    city_data = data[data[f'city_id_{city_id}'] == 1].iloc[-7:]
    
    # If there's not enough data, skip this city
    if city_data.shape[0] < 7:
        continue

    # Create the new dataset for prediction
    new_data = pd.DataFrame({
        'date': first_week_2019_dates,
        'year': first_week_2019_dates.year,
        'month': first_week_2019_dates.month,
        'day': first_week_2019_dates.day,
        'day_of_week': first_week_2019_dates.dayofweek,
    })

    # Add lag features (using the last available data for the given city_id)
    for lag in range(1, 8):
        new_data[f'avg_temp_lag_{lag}'] = city_data['log_transformed_at'].values[-7 + lag - 1]

    # Add one-hot encoded city_id columns
    city_dummy = {f'city_id_{city_id}': 1}
    for col in city_columns:
        if col not in city_dummy:
            city_dummy[col] = 0
    for col in city_dummy:
        new_data[col] = city_dummy[col]

    # Reindex to ensure all columns are present
    new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

    # Make predictions
    new_predictions = model.predict(new_data)

    # Collect predictions
    for date, pred in zip(first_week_2019_dates, new_predictions):
        all_predictions.append({'city_id': city_id, 'date': date, 'predicted_log_transformed_at': pred})

# Create a DataFrame with all predictions
predicted_data = pd.DataFrame(all_predictions)

# Reverse the log transformation to get real predicted avg_temp_c values
predicted_data['predicted_avg_temp_c'] = np.expm1(predicted_data['predicted_log_transformed_at'])

# Load the sample submission file
test = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/sample_submission.csv')

# Match the sample submission with the predictions
predictions = predicted_data
submission = test[["submission_ID"]]

# Assign the real predicted temperatures to the 'avg_temp_c' column of submission_sample
submission["avg_temp_c"] = predictions['predicted_avg_temp_c'].values

# Save to CSV
submission.to_csv("submission.csv", index=None)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  data[f'avg_temp_lag_{lag}'] = data.groupby('city_id')['log_transformed_at'].shift(lag)
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]
  new_data[col] = city_dummy[col]