In [114]:
import pandas as pd
# Load the dataset
data = pd.read_csv('historical_weather.csv/historical_weather.csv')

# Display the first few rows of the dataset
data.head(20)

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,2014-01-01,6.6,-1.4,11.6,,,168.0,6.2
1,C001,2014-01-02,9.3,6.3,13.3,,,155.0,10.0
2,C001,2014-01-03,7.6,1.9,14.0,,,,5.8
3,C001,2014-01-04,7.6,3.9,13.3,,,291.0,11.3
4,C001,2014-01-05,8.6,0.5,16.9,,,,5.0
5,C001,2014-01-06,9.1,5.7,14.7,,,156.0,5.7
6,C001,2014-01-07,8.8,2.0,18.8,,,158.0,6.2
7,C001,2014-01-08,8.8,3.0,16.3,,,,
8,C001,2014-01-09,9.9,3.9,17.1,,,,4.8
9,C001,2014-01-10,10.8,7.0,17.2,,,149.0,5.8


In [115]:
# Initialize an empty dictionary to store missing row counts per column
missing_rows_per_column = {}

# Iterate over each column
for column in data.columns:
    # Count the number of missing rows in the current column
    missing_rows_per_column[column] = data[column].isnull().sum()

print("Number of missing rows in each column:")
for column, count in missing_rows_per_column.items():
    print(f"{column}: {count}")

Number of missing rows in each column:
city_id: 0
date: 0
avg_temp_c: 1224
min_temp_c: 5886
max_temp_c: 7493
precipitation_mm: 69744
snow_depth_mm: 170100
avg_wind_dir_deg: 35394
avg_wind_speed_kmh: 22472


In [116]:
# Total number of rows in the DataFrame
total_rows = data.shape[0]

print(f"Total number of rows in the DataFrame: {total_rows}")

Total number of rows in the DataFrame: 182338


In [117]:
# Drop columns with high missing values
columns_to_drop = ['precipitation_mm', 'snow_depth_mm']
data.drop(columns=columns_to_drop, inplace=True)

# Verify the remaining columns
print(data.columns)

Index(['city_id', 'date', 'avg_temp_c', 'min_temp_c', 'max_temp_c',
       'avg_wind_dir_deg', 'avg_wind_speed_kmh'],
      dtype='object')


In [118]:
data.head()


Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,2014-01-01,6.6,-1.4,11.6,168.0,6.2
1,C001,2014-01-02,9.3,6.3,13.3,155.0,10.0
2,C001,2014-01-03,7.6,1.9,14.0,,5.8
3,C001,2014-01-04,7.6,3.9,13.3,291.0,11.3
4,C001,2014-01-05,8.6,0.5,16.9,,5.0


In [119]:
from sklearn.impute import SimpleImputer

# Specify columns to impute
columns_to_impute = ['avg_temp_c', 'min_temp_c', 'max_temp_c', 'avg_wind_dir_deg', 'avg_wind_speed_kmh']

# Initialize SimpleImputer with mean strategy
imputer = SimpleImputer(strategy='mean')

# Fit and transform the selected columns
data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

# Verify if there are any remaining missing values
print("Number of missing values after imputation:")
print(data[columns_to_impute].isnull().sum())

Number of missing values after imputation:
avg_temp_c            0
min_temp_c            0
max_temp_c            0
avg_wind_dir_deg      0
avg_wind_speed_kmh    0
dtype: int64


In [120]:
data.head(20)

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,2014-01-01,6.6,-1.4,11.6,168.0,6.2
1,C001,2014-01-02,9.3,6.3,13.3,155.0,10.0
2,C001,2014-01-03,7.6,1.9,14.0,175.650289,5.8
3,C001,2014-01-04,7.6,3.9,13.3,291.0,11.3
4,C001,2014-01-05,8.6,0.5,16.9,175.650289,5.0
5,C001,2014-01-06,9.1,5.7,14.7,156.0,5.7
6,C001,2014-01-07,8.8,2.0,18.8,158.0,6.2
7,C001,2014-01-08,8.8,3.0,16.3,175.650289,11.281875
8,C001,2014-01-09,9.9,3.9,17.1,175.650289,4.8
9,C001,2014-01-10,10.8,7.0,17.2,149.0,5.8


In [121]:
# Split the dataset into separate dataframes based on city_id
city_data = {}
for city_id in range(1, 113):  # Assuming city_id ranges from C001 to C100
    city_id_str = f'C{city_id:03}'  # Format city_id to match C001, C002, ..., C100
    city_data[city_id_str] = data[data['city_id'] == city_id_str]

# Now city_data contains separate DataFrames for each city_id

In [122]:
city_data['C006'].head(20)

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,avg_wind_dir_deg,avg_wind_speed_kmh


Model

In [123]:
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
warnings.filterwarnings('ignore')

In [124]:
# Define a function for SARIMA forecasting
def sarima_forecast(train, order, seasonal_order, steps):
    model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
    model_fit = model.fit(disp=False)
    forecast = model_fit.get_forecast(steps=steps)
    forecast_mean = forecast.predicted_mean
    return forecast_mean


In [125]:
# Generate a list of strings from 'C001' to 'C100'
ordered_list = [f'C{i:03d}' for i in range(1, 113)]

# # Print the list
# for city in ordered_list:
#     print(city, end=', ')
#     print(city_data[city])

print(city_data['C112'])

       city_id        date  avg_temp_c  min_temp_c  max_temp_c  \
180538    C112  2014-01-23        26.3   14.627775   23.887261   
180539    C112  2014-01-24        24.8   15.900000   30.900000   
180540    C112  2014-01-25        22.6   16.400000   27.600000   
180541    C112  2014-01-26        22.9   14.627775   23.887261   
180542    C112  2014-01-27        20.1   11.100000   29.400000   
...        ...         ...         ...         ...         ...   
182333    C112  2018-12-27        22.0   15.900000   27.900000   
182334    C112  2018-12-28        21.9   14.900000   28.200000   
182335    C112  2018-12-29        22.4   16.300000   28.200000   
182336    C112  2018-12-30        21.6   18.500000   26.600000   
182337    C112  2018-12-31        20.2   17.100000   25.000000   

        avg_wind_dir_deg  avg_wind_speed_kmh  
180538        175.650289           11.281875  
180539        175.650289           11.281875  
180540        175.650289           11.281875  
180541        175.6

In [126]:
submission_data = []


for city in ordered_list:
    print(city, end=', ')
    data_original=city_data[city]
    data=city_data[city]

    # Skip null or empty DataFrames
    if data.empty:
        continue

    print(data.head())

    # Ensure the date column is the index and sorted
    data['date'] = pd.to_datetime(data['date'])
    data.set_index('date', inplace=True)
    data.sort_index(inplace=True)

    # Set parameters for SARIMA
    order = (1, 1, 1)
    seasonal_order = (1, 1, 1, 12)

    # Train data up to the end of 2018
    train_data = data['avg_temp_c'][:'2018-12-31']

    # Forecasting the first week of 2019 (7 days)
    forecast_horizon = 7
    forecast_dates = pd.date_range(start='2019-01-01', periods=forecast_horizon)

    # Forecast using SARIMA
    forecast_mean = sarima_forecast(train_data, order, seasonal_order, steps=forecast_horizon)

    # Create a DataFrame for the forecast
    forecast_df = pd.DataFrame({'date': forecast_dates, 'avg_temp_c': forecast_mean})
    forecast_df.set_index('date', inplace=True)

    # Print the forecasted temperatures
    print("Forecasted temperatures for the first week of 2019:")
    print(forecast_df)

    # Append the forecasted temperatures to submission_data
    submission_data.extend(forecast_mean.values)

# Generate submission DataFrame
submission_id = list(range(1, len(submission_data) + 1))
submission_df = pd.DataFrame({'submission_ID': submission_id, 'avg_temp_c': submission_data})



    

    



C001,   city_id        date  avg_temp_c  min_temp_c  max_temp_c  avg_wind_dir_deg  \
0    C001  2014-01-01         6.6        -1.4        11.6        168.000000   
1    C001  2014-01-02         9.3         6.3        13.3        155.000000   
2    C001  2014-01-03         7.6         1.9        14.0        175.650289   
3    C001  2014-01-04         7.6         3.9        13.3        291.000000   
4    C001  2014-01-05         8.6         0.5        16.9        175.650289   

   avg_wind_speed_kmh  
0                 6.2  
1                10.0  
2                 5.8  
3                11.3  
4                 5.0  
Forecasted temperatures for the first week of 2019:
            avg_temp_c
date                  
2019-01-01   11.729624
2019-01-02   11.546911
2019-01-03   11.254620
2019-01-04   10.918898
2019-01-05   10.719733
2019-01-06   10.676807
2019-01-07   10.783037
C002,      city_id        date  avg_temp_c  min_temp_c  max_temp_c  \
1826    C002  2014-01-01        16.6   14.6277

PermissionError: [Errno 13] Permission denied: 'submission.csv'

In [127]:
# Save to CSV
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been created.")


Submission file 'submission.csv' has been created.


In [None]:
data_original=city_data['C001']
data=city_data['C001']
print(data.head())

           city_id  avg_temp_c  min_temp_c  max_temp_c  avg_wind_dir_deg  \
date                                                                       
2014-01-01    C001         6.6        -1.4        11.6        168.000000   
2014-01-02    C001         9.3         6.3        13.3        155.000000   
2014-01-03    C001         7.6         1.9        14.0        175.650289   
2014-01-04    C001         7.6         3.9        13.3        291.000000   
2014-01-05    C001         8.6         0.5        16.9        175.650289   

            avg_wind_speed_kmh  
date                            
2014-01-01                 6.2  
2014-01-02                10.0  
2014-01-03                 5.8  
2014-01-04                11.3  
2014-01-05                 5.0  


In [None]:
# Ensure the date column is the index and sorted
data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)
data.sort_index(inplace=True)

In [None]:
data.head()

Unnamed: 0_level_0,city_id,avg_temp_c,min_temp_c,max_temp_c,avg_wind_dir_deg,avg_wind_speed_kmh
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01,C001,6.6,-1.4,11.6,168.0,6.2
2014-01-02,C001,9.3,6.3,13.3,155.0,10.0
2014-01-03,C001,7.6,1.9,14.0,175.650289,5.8
2014-01-04,C001,7.6,3.9,13.3,291.0,11.3
2014-01-05,C001,8.6,0.5,16.9,175.650289,5.0


In [None]:
# Set parameters for SARIMA
order = (1, 1, 1)
seasonal_order = (1, 1, 1, 12)

# Train data up to the end of 2018
train_data = data['avg_temp_c'][:'2018-12-31']

In [None]:
# Forecasting the first week of 2019 (7 days)
forecast_horizon = 7
forecast_dates = pd.date_range(start='2019-01-01', periods=forecast_horizon)

# Forecast using SARIMA
forecast_mean = sarima_forecast(train_data, order, seasonal_order, steps=forecast_horizon)

# Create a DataFrame for the forecast
forecast_df = pd.DataFrame({'date': forecast_dates, 'avg_temp_c': forecast_mean})
forecast_df.set_index('date', inplace=True)


In [None]:
# Print the forecasted temperatures
print("Forecasted temperatures for the first week of 2019:")
print(forecast_df)



Forecasted temperatures for the first week of 2019:
            avg_temp_c
date                  
2019-01-01   11.729624
2019-01-02   11.546911
2019-01-03   11.254620
2019-01-04   10.918898
2019-01-05   10.719733
2019-01-06   10.676807
2019-01-07   10.783037


In [None]:

import numpy as np

# Create submission DataFrame with submission_id starting from 1
submission = forecast_df.reset_index()
submission['submission_id'] = np.arange(1, len(submission) + 1)
submission = submission[['submission_id', 'date', 'avg_temp_c']]

# Write the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

# Print the submission DataFrame to verify
print("Submission DataFrame:")
print(submission)

Submission DataFrame:
   submission_id       date  avg_temp_c
0              1 2019-01-01   11.729624
1              2 2019-01-02   11.546911
2              3 2019-01-03   11.254620
3              4 2019-01-04   10.918898
4              5 2019-01-05   10.719733
5              6 2019-01-06   10.676807
6              7 2019-01-07   10.783037


In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime

In [None]:


count=1
for city_df in city_data:
    city_df['date'] = pd.to_datetime(city_df['date'])
    city_df.set_index('date', inplace=True)
    
    # Assuming 'avg_temp_c' is the target variable
    y = city_df['avg_temp_c']
    
    # ARIMA model example (you may need to tune parameters)
    model = ARIMA(y, order=(1, 1, 1))  # Example ARIMA(1,1,1) model
    model_fit = model.fit()
    
    # Forecasting for the first week of 2019
    start_date = datetime(2019, 1, 1)
    end_date = datetime(2019, 1, 7)
    forecast = model_fit.predict(start=start_date, end=end_date, typ='levels')
    
    # Create submission DataFrame for this city
    submission_df = pd.DataFrame({
        'submission_ID': count,
        'avg_temp_c': forecast.values })
    


TypeError: string indices must be integers

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime

# Assuming city_data is a list of DataFrames, each containing historical weather data for a city
# Assume submission_key.csv maps submission_ID to city_id and date for the first week of 2019

# Function to perform time series forecasting for a city
def forecast_for_city(city_id):
    city_id_str = f'C{city_id:03}'  # Format city_id to match C001, C002, ..., C100
    
    # Filter city data and submission key for the current city_id
    city_df = next(df for df in city_data if df['city_id'].iloc[0] == city_id_str)
    city_submission_key = submission_key[submission_key['city_id'] == city_id_str]
    
    # Assuming 'date' is a column in city_df representing dates
    # Convert 'date' column to datetime if necessary
    city_df['date'] = pd.to_datetime(city_df['date'])
    city_df.set_index('date', inplace=True)
    
    # Assuming 'avg_temp_c' is the target variable
    y = city_df['avg_temp_c']
    
    # ARIMA model example (you may need to tune parameters)
    model = ARIMA(y, order=(1, 1, 1))  # Example ARIMA(1,1,1) model
    model_fit = model.fit()
    
    # Forecasting for the first week of 2019
    start_date = datetime(2019, 1, 1)
    end_date = datetime(2019, 1, 7)
    forecast = model_fit.predict(start=start_date, end=end_date, typ='levels')
    
    # Create submission DataFrame for this city
    submission_df = pd.DataFrame({
        'submission_ID': city_submission_key['submission_ID'],
        'avg_temp_c': forecast.values
    })
    
    return submission_df

# Assuming city_data is a list of DataFrames containing historical weather data for each city
# and submission_key is a DataFrame mapping submission_ID to city_id and date
submission_dfs = []

# Iterate through each city in city_data
for city_id in range(1, 101):  # Adjust range based on your city_id format
    submission_dfs.append(forecast_for_city(city_data, city_id, submission_key))

# Combine all submission DataFrames into one DataFrame
submission_final = pd.concat(submission_dfs, ignore_index=True)

# Write submission to CSV file
submission_final.to_csv('submission.csv', index=False)


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Assuming you have loaded historical weather data into a DataFrame 'weather_data'
# and have a DataFrame 'submission_key' mapping submission IDs to city IDs and dates

# Iterate over each city_id (e.g., C001 to C100)
for city_id in range(1, 101):  # Adjust range based on your city_id format
    city_id_str = f'C{city_id:03}'  # Format city_id to match C001, C002, ..., C100
    
    # Filter weather_data and submission_key for the current city_id
    city_weather_data = data[data['city_id'] == city_id_str]
    city_submission_key = submission_df[submission_df['city_id'] == city_id_str]
    
    # Feature engineering (example: using average historical temperature)
    X = city_weather_data[['date_feature1', 'date_feature2', ...]]  # Select relevant features
    y = city_weather_data['avg_temp']  # Target variable (average temperature)
    
    # Train model (example: RandomForestRegressor)
    model = RandomForestRegressor()
    model.fit(X, y)
    
    # Predict for the first week of 2019 (example: assuming prediction_dates are known)
    prediction_dates = pd.date_range(start='2019-01-01', end='2019-01-07')
    predictions = model.predict(prediction_dates)
    
    # Create submission DataFrame for this city
    submission_df = pd.DataFrame({
        'submission_ID': city_submission_key['submission_ID'],
        'avg_temp_c': predictions
    })
    
    # Append to overall submission file or write separately for each city
    if city_id == 1:
        submission_df.to_csv('submission.csv', index=False)
    else:
        submission_df.to_csv('submission.csv', mode='a', header=False, index=False)


NameError: name 'submission_df' is not defined