In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import random
from random import randint
from sklearn.preprocessing import StandardScaler

# Seed Python's built-in `random` module (this call does not touch NumPy's generator).
random.seed(77)

import warnings
# Silence every warning for the rest of the session; note this also hides deprecation notices.
warnings.filterwarnings('ignore')

# 1. Clustering

Clustering might help identify locations that have similar emission values, which could in turn help improve the model's RMSE. I am going to use KMeans and DBSCAN to do the clustering.

In [None]:
from sklearn.cluster import DBSCAN, KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

In [None]:
def plot_clusters(df):
    """Draw one figure per cluster showing every member series of that cluster.

    args
    df: dataframe with one row per location, one column per time step and a
        'cluster' column holding the cluster label of each row.
    return
    no return value; for each cluster (in ascending label order) prints the
    label and the number of member rows, then shows the plot. Label -1 is drawn
    in gray, every other label in black.
    """
    for cluster_no in np.sort(df['cluster'].unique()):
        # Transpose so every member row becomes one line in the plot.
        members = df[df['cluster'] == cluster_no].T.drop('cluster')
        print('Cluster #: ' + str(cluster_no))
        print('# of Locations: ' + str(members.shape[1]))
        line_color = 'gray' if cluster_no == -1 else 'k'
        members.plot(figsize=(10,3), color=line_color, legend=False)
        plt.show()
        print('-----------------------------------------------------------------------------------')

In [None]:
def rgba_to_hex(color):
    """Return color as #rrggbb for the given color values.

    args
    color: sequence of 3 (RGB) or 4 (RGBA) numbers in the 0 - 1 range. The
           alpha component, when present, is ignored.
    return
    str of the form '#rrggbb'; each channel is multiplied by 255 and truncated
    to an integer before being written as two hex digits.
    raises
    ValueError when `color` does not have exactly 3 or 4 components.
    """
    if len(color) not in (3, 4):
        raise ValueError(f"expected 3 (RGB) or 4 (RGBA) components, got {len(color)}")
    red, green, blue = color[:3]
    return f"#{int(red*255):02x}{int(green*255):02x}{int(blue*255):02x}"

In [None]:
def plot_map(df, n_clusters):
    """Build a folium map with one circle marker per row of `df`, coloured by cluster.

    args
    df: dataframe with a 'cluster' column; resetting its index must yield the
        marker latitude and longitude as the two leading columns.
    n_clusters: number of colours to take from the 'Dark2' colormap.
    return
    folium.Map fitted to the bounds of the markers. Rows labelled -1 are drawn
    in black.
    """
    cluster_map = folium.Map(prefer_canvas=True)
    palette = matplotlib.colormaps.get_cmap('Dark2')(range(n_clusters))

    # reset_index() moves the index levels in front of 'cluster',
    # so every row unpacks to (lat, lon, cluster).
    for lat, lon, cluster in df['cluster'].reset_index().values:
        marker_color = '#000000' if cluster == -1 else rgba_to_hex(palette[int(cluster)])
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=1.5,
            weight=6,
            color=marker_color,
            tooltip=int(cluster)
        )
        marker.add_to(cluster_map)

    cluster_map.fit_bounds(cluster_map.get_bounds())
    return cluster_map

In [None]:
def plot_tsne(df, n_components=2):
    """Scatter-plot a t-SNE embedding of `df`, coloured by its 'cluster' column.

    args
    df: dataframe whose 'cluster' column holds the labels; all remaining
        columns are fed to t-SNE.
    n_components: dimensionality of the embedding. Only the first two
                  components are plotted.
    return
    no return value, shows the scatter plot.
    """
    embedding = TSNE(n_components=n_components, random_state=77).fit_transform(
        df.drop(columns='cluster')
    )
    first_dim, second_dim = embedding[:, 0], embedding[:, 1]

    plt.scatter(first_dim, second_dim, c=df['cluster'], cmap='Dark2', s=20, alpha=0.6)
    plt.colorbar(label='Cluster #')
    plt.show()

In [None]:
# Raw competition data, indexed by the row identifier column.
raw_id_col = 'ID_LAT_LON_YEAR_WEEK'
train = pd.read_csv('/kaggle/input/playground-series-s3e20/train.csv', index_col=raw_id_col)
test = pd.read_csv('/kaggle/input/playground-series-s3e20/test.csv', index_col=raw_id_col)

print('# of rows and columns in train:', train.shape)
print('# of rows and columns in test:', test.shape)

In [None]:
# Pre-processed train/test frames, indexed by the row identifier column.
proc_id_col = 'ID_LAT_LON_YEAR_WEEK'
train_enc = pd.read_csv('/kaggle/input/train-test-processed/train_proc.csv', index_col=proc_id_col)
test_enc = pd.read_csv('/kaggle/input/train-test-processed/test_proc.csv', index_col=proc_id_col)

print('# of rows and columns in train-encoded:', train_enc.shape)
print('# of rows and columns in test-encoded:', test_enc.shape)

In [None]:
# Fit the label encoder on the training countries only, then apply the same
# mapping to both frames.
enc = LabelEncoder()
enc.fit(train_enc['country'])
for frame in (train_enc, test_enc):
    frame['country'] = enc.transform(frame['country'])
train_enc['country']

In [None]:
# One column per (latitude, longitude) location, one row per (year, week_no).
emission_ts = train_enc.pivot_table(
    values='emission',
    index=['year', 'week_no'],
    columns=['latitude', 'longitude'],
)

# Min-max scale every location's series to [0, 1]. A location whose emission
# never changes has a zero range, so its column becomes NaN and is dropped.
location_min = emission_ts.min()
location_range = emission_ts.max() - location_min
emission_ts = ((emission_ts - location_min) / location_range).dropna(axis=1)
emission_ts

In [None]:
emission_ts.plot(figsize=(12,5),legend=False, color='gray', alpha=0.5)
# plt.ylim(0, 0.2)

## a. DBSCAN (Density-Based)

In [None]:
# Cluster the locations (rows after transposing) on their scaled emission series.
eps = 0.5
cluster_df = emission_ts.T.copy()
dbscan = DBSCAN(eps=eps)
labels = dbscan.fit_predict(cluster_df)
cluster_df['cluster'] = labels

# Label -1 is not counted as a cluster.
n_clusters_dbs = len(set(labels) - {-1})
n_clusters_dbs

I am using an epsilon of `0.5` to cluster the per-location time series. This resulted in 8 clusters plus 1 outlier cluster (label `-1`). The parameter could be tuned, guided by the model's performance, to cluster the locations better.

In [None]:
plot_clusters(cluster_df)

In [None]:
plot_map(cluster_df, n_clusters_dbs)

In [None]:
plot_tsne(cluster_df)

## b. KMeans

In [None]:
# KMeans on the same location-by-time matrix, with a fixed number of clusters.
n_clusters_km = 8
max_iter = 2000

cluster_df1 = emission_ts.T.copy()
kmeans = KMeans(n_clusters=n_clusters_km, max_iter=max_iter, random_state=77)
kmeans.fit(cluster_df1)
labels = kmeans.labels_
cluster_df1['cluster'] = labels

In [None]:
plot_clusters(cluster_df1)

In [None]:
plot_map(cluster_df1, n_clusters_km)

In [None]:
plot_tsne(cluster_df1)

In [None]:
# Attach each location's cluster label to the row-level training data.
# Locations that were dropped from emission_ts (and therefore never clustered)
# come out of the left join with a missing label; they get an extra label equal
# to the number of clusters. Only the 'cluster' column is filled, so missing
# values in any other column are left untouched.
train_dbs = train_enc.merge(cluster_df['cluster'].reset_index(), on=['latitude', 'longitude'], how='left')
train_dbs['cluster'] = train_dbs['cluster'].fillna(n_clusters_dbs)

train_km = train_enc.merge(cluster_df1['cluster'].reset_index(), on=['latitude', 'longitude'], how='left')
train_km['cluster'] = train_km['cluster'].fillna(n_clusters_km)

train_basedbs = train_dbs.copy(deep=True)

## c. Clusters' emission by year

In [None]:
# Mean emission per (year, week_no) for every DBSCAN cluster, drawn as one
# continuous series per panel.
fig, ax = plt.subplots(5, 2, figsize=(20, 10))
ax = ax.ravel()
for cluster_id in range(-1, n_clusters_dbs):
    # Index -1 selects the last panel, so the outlier cluster ends up at the end of the grid.
    panel = ax[cluster_id]
    weekly_mean = (
        train_dbs.loc[train_dbs.cluster == cluster_id]
        .groupby(['year', 'week_no'])
        .mean()
        .reset_index(drop=True)  # consecutive integer x positions
    )
    panel.plot(weekly_mean.index, weekly_mean.emission.values)
    panel.set_title('Outliers Cluster' if cluster_id == -1 else f'Cluster {cluster_id}')
    # Dashed vertical guides every 53 positions.
    for boundary in range(0, 160, 53):
        panel.axvline(boundary, linestyle='--', color='black')
plt.suptitle('Plot using DBScan Clustering', fontsize=18)
fig.tight_layout()
plt.show()

In [None]:
# Mean emission per (year, week_no) for every KMeans cluster, drawn as one
# continuous series per panel.
fig, ax = plt.subplots(4, 2, figsize=(20, 10))
ax = ax.ravel()
for cluster_id in range(n_clusters_km):
    panel = ax[cluster_id]
    weekly_mean = (
        train_km.loc[train_km.cluster == cluster_id]
        .groupby(['year', 'week_no'])
        .mean()
        .reset_index(drop=True)  # consecutive integer x positions
    )
    panel.plot(weekly_mean.index, weekly_mean.emission.values)
    panel.set_title(f'Cluster {cluster_id}')
    # Dashed vertical guides every 53 positions.
    for boundary in range(0, 160, 53):
        panel.axvline(boundary, linestyle='--', color='black')
plt.suptitle('Plot using KMeans Clustering', fontsize=18)
fig.tight_layout()
plt.show()

There is not much difference between the two algorithms. DBScan has an outlier cluster while KMeans does not. As a result, one or more KMeans clusters may include these outliers, whereas DBScan keeps them in a separate outlier cluster. In addition, we can see that in most clusters COVID heavily impacted the seasonality. This will affect the model performance quite a lot. Let's look at the bug discussed, where the week numbers are not aligned across years.

In [None]:
# Overlay the years on a shared week_no axis, one panel per DBSCAN cluster.
fig, ax = plt.subplots(5, 2, figsize=(20, 15))
ax = ax.ravel()
for cluster_id in range(-1, n_clusters_dbs):
    # Index -1 selects the last panel, so the outlier cluster ends up at the end of the grid.
    panel = ax[cluster_id]
    sns.lineplot(
        data=train_dbs.loc[train_dbs.cluster == cluster_id],
        x='week_no',
        y='emission',
        hue='year',
        ax=panel,
        errorbar=None,
        palette=['r', 'g', 'b'],
    )
    panel.set_title('Outliers Cluster' if cluster_id == -1 else f'Cluster {cluster_id}')
plt.suptitle('Yearly plot using DBScan Clustering', fontsize=18)
fig.tight_layout()
plt.show()

In [None]:
# Overlay the years on a shared week_no axis, one panel per KMeans cluster.
fig, ax = plt.subplots(4, 2, figsize=(20, 15))
ax = ax.ravel()
for cluster_id in range(n_clusters_km):
    panel = ax[cluster_id]
    sns.lineplot(
        data=train_km.loc[train_km.cluster == cluster_id],
        x='week_no',
        y='emission',
        hue='year',
        ax=panel,
        errorbar=None,
        palette=['r', 'g', 'b'],
    )
    panel.set_title(f'Cluster {cluster_id}')
plt.suptitle('Yearly plot using KMeans Clustering', fontsize=18)
fig.tight_layout()
plt.show()

DBScan Insights:
- Cluster 5 is probably the easiest cluster to predict due to the seasonality occurring in May and October. We can adjust the shifting peaks to fit the pattern.
- Cluster 6 is similar to cluster 5. We have to make a lot of adjustments in 2020-2021 to fit the pattern.
- Clusters 2 and 7 change significantly between the years. The COVID event significantly influenced this. We can treat this as an outlier and forward fill to fit the pattern.
- Clusters 3 and 4 have an almost identical pattern for all years; we just need to fix the shifting peaks.
- Cluster 0 has a different pattern that was shifted backward. This might be due to COVID.
- The outliers cluster has the same pattern; we just need to fix the COVID influence.

After clustering the locations, we can see that COVID has changed the seasonality. This could be due to the recovery phase after COVID, when buildings that produce high emissions may have started much earlier than the usual pattern. We could adjust the pattern to fit 2019, because this is the closest year in which the pattern is still intact. Meanwhile, we also need to take into account the average emission values during 2021 to translate them into 2022.

Overall, we need to adjust the 2021 seasonality to fit the 2019 pattern. This is because COVID in 2020 impacted the seasonality heavily; we consider this an outlier. The three years of data tell us that 2019 has the regular pattern, 2020 is when a big event occurred that shifted the seasons heavily, and 2021 is when the recovery phase started.

# 2. Adjusting peaks and COVID
I am going to use the DBScan technique for the model.  
**Update**: KMeans clustering works better

In [None]:
def plot_line(df ,cluster):
    """Plot emission against week_no for one cluster, with one line per year.

    args
    df: dataframe with 'cluster', 'week_no', 'emission' and 'year' columns.
    cluster: cluster label whose rows are plotted.
    return
    no return value, shows the plot with a dashed vertical guide every 4 weeks.
    """
    guide_weeks = range(0, 53, 4)

    plt.figure(figsize=(15,4))
    sns.lineplot(
        data=df.loc[df.cluster == cluster],
        x='week_no',
        y='emission',
        hue='year',
        errorbar=None,
        palette=['r', 'g', 'b'],
        marker='o',
    )
    plt.title(f'Cluster {cluster}')
    for week in guide_weeks:
        plt.axvline(week, linestyle='--', color='black', alpha=0.3)
    plt.xticks(guide_weeks)
    plt.show()

## a. Cluster 0

In [None]:
plot_line(train_dbs, 0)

In [None]:
# for i in [0, 9, 17, 19, 29]:
#     train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==i)&(train_dbs.cluster==0), 'emission'] = np.nan
# train_dbs.loc[(train_dbs.year==2020)&(train_dbs.week_no==30)&(train_dbs.cluster==0), 'emission'] = np.nan
# train_dbs.fillna(method='bfill',inplace=True)
# plot_line(train_dbs, 0)

The COVID ranges from week 10 - week 23.

## b. Cluster 1

In [None]:
plot_line(train_dbs, 1)

In [None]:
# for i in [9, 17, 36, 37]:
#     train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==i)&(train_dbs.cluster==1), 'emission'] = np.nan
# for i in [22, 26]:
#     train_dbs.loc[(train_dbs.year==2020)&(train_dbs.week_no==i)&(train_dbs.cluster==1), 'emission'] = np.nan
# train_dbs.fillna(method='bfill',inplace=True)
# plot_line(train_dbs, 1)

The COVID ranges from week 13 - week 21.

## c. Cluster 2 and 7

In [None]:
plot_line(train_dbs, 2)

In [None]:
plot_line(train_dbs, 7)

In [None]:
# train_dbs.loc[(train_dbs.year==2020)&(train_dbs.week_no>=0)&(train_dbs.week_no<13)&(train_dbs.cluster.isin([2, 7])), 'emission'] = np.nan
# train_dbs.fillna(method='bfill',inplace=True)
# plot_line(train_dbs, 2)  

In [None]:
# plot_line(train_dbs, 7)

## d. Cluster 3

In [None]:
plot_line(train_dbs, 3)

In [None]:
# for i in [9, 15, 17, 20, 21]:
#     train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==i)&(train_dbs.cluster.isin([3,4])), 'emission'] = np.nan
# train_dbs.loc[(train_dbs.year==2020)&(train_dbs.week_no==48)|(train_dbs.week_no==39)&(train_dbs.cluster.isin([3,4])), 'emission'] = np.nan
# train_dbs.fillna(method='bfill',inplace=True)
# plot_line(train_dbs, 3)  

The COVID ranges from week 14 - week 23.

## e. Cluster 4

In [None]:
plot_line(train_dbs, 4)

In [None]:
# for i in [19,35]:
#     train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==i)&(train_dbs.cluster==4), 'emission'] = np.nan
# train_dbs.fillna(method='bfill',inplace=True)
# plot_line(train_dbs, 4)  

## f. Cluster 5

In [None]:
plot_line(train_dbs, 5)

In [None]:
# for i in [9,10,11,12, 14, 22, 30, 35]:
#     train_dbs.loc[(train_dbs.year==2020)&(train_dbs.week_no==i)&(train_dbs.cluster==5), 'emission'] = np.nan
# train_dbs.fillna(method='ffill',inplace=True)
# train_dbs.loc[(train_dbs.year==2020)&(train_dbs.week_no==13)&(train_dbs.cluster==5), 'emission'] = np.nan
# train_dbs.fillna(method='ffill',inplace=True)
# for i in [9,39]:
#     train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==i)&(train_dbs.cluster==5), 'emission'] = np.nan
# train_dbs.fillna(method='ffill',inplace=True)
# plot_line(train_dbs, 5)  

The COVID range is week 17 - week 22. This is the shortest range of all clusters because we smoothed out the noise, and as can be seen the pattern is reflected more clearly.

## g. Cluster 6

In [None]:
plot_line(train_dbs, 6)

In [None]:
# for i in [9,21,35]:
#     train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==i)&(train_dbs.cluster==6), 'emission'] = np.nan
# train_dbs.fillna(method='bfill',inplace=True)
# train_dbs.loc[(train_dbs.year==2021)&(train_dbs.week_no==39)&(train_dbs.cluster==6), 'emission'] = np.nan
# train_dbs.fillna(method='ffill',inplace=True)
# plot_line(train_dbs, 6)  

The COVID ranges from week 13 - week 23.

## h. Outliers Cluster

In [None]:
plot_line(train_dbs, -1)

We can smooth the noise but I won't do it here just to preserve the real values. The COVID ranges from week 9 - week 23.

So, we've adjusted the shifting peaks for each cluster in addition to smoothing the noise. I am going to exclude the COVID range by taking the earliest starting week among the COVID ranges I listed up to the latest ending week. The result is week 9 - week 23.

In [None]:
# fig, ax = plt.subplots(5,2, figsize=(20,15))
# ax = ax.ravel()
# for i in range(-1,n_clusters_dbs):
#     df_plot = train_dbs.loc[train_dbs.cluster==i]
#     sns.lineplot(data=df_plot, x='week_no', y='emission', hue='year', ax=ax[i], errorbar=None, palette=['r', 'g', 'b'])
#     if i==-1:
#         ax[i].set_title(f'Outliers Cluster')
#     else:
#         ax[i].set_title(f'Cluster {i}')
# plt.suptitle('Yearly plot using DBScan Clustering after adjustment', fontsize=18)
# fig.tight_layout()
# plt.show()

# 3. Modelling

I am going to build the model for performance comparison on following:

|   |Adjustment|Exclusion|
|---|----------|---------|
|a  |No        |No       |
|b  |**Yes**   |No       |
|c  |No        |**Yes**  |
|d  |**Yes**   |**Yes**  |

For the validation set, I plan to mirror the leaderboard split: validate on the first 20 weeks (the Public LB part of 2022) and on the rest of the data (Private LB).

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
def exclude_covid(df, covid_year=2020, start_week=9, end_week=23):
    """
    args
    df: pandas dataframe with `year` and `week_no` columns
    covid_year: year that contains the weeks to drop (default 2020)
    start_week: first week_no to drop, inclusive (default 9)
    end_week: last week_no to drop, inclusive (default 23)
    return
    dataframe without the week 9 - 23 of 2020 (COVID); every other row is kept,
    whatever its year (rows outside 2019 - 2021 used to be dropped as well)
    """
    in_covid_range = (df.year == covid_year) & df.week_no.between(start_week, end_week)
    return df.loc[~in_covid_range]

def encode_outliers(df):
    """
    args
    df: pandas dataframe with a `cluster` column
    return
    a copy of the dataframe with cluster -1 changed to the last cluster encoding + 1
    (this set -1 as the new last cluster) and `cluster` cast to int.
    The dataframe passed in is left unchanged.
    """
    encoded = df.copy()
    clusters = encoded['cluster']
    # Keep every non-outlier label; send -1 to (largest label + 1).
    encoded['cluster'] = clusters.where(clusters != -1, clusters.max() + 1).astype(int)
    return encoded

def split(df, method='year', feature_cols=None, target_col=None):
    """
    args
    df: pandas dataframe to be splitted; both the training and the validation
        rows are taken from this dataframe
    method: Accepts year by default which is splitting on 2019 - 2020 and 2021.
            Otherwise specify either 'q1' or 'q2' to keep only the 2021 rows with
            week_no < 21 or week_no >= 21 respectively in the validation set.
            Any other value behaves like 'year'.
    feature_cols: list of feature column names; defaults to the notebook-level `features`
    target_col: name of the target column; defaults to the notebook-level `target`
    return
    X_train, y_train, X_val, y_val
    """
    if feature_cols is None:
        feature_cols = features
    if target_col is None:
        target_col = target

    training_set = df[df.year<2021]
    # Take the validation rows from `df` itself so that their cluster labels and
    # any adjustment applied to `df` are consistent with the training rows.
    val_set = df[df.year==2021]

    if method=='q1':
        val_set = val_set.loc[val_set.week_no<21]
    elif method=='q2':
        val_set = val_set.loc[val_set.week_no>=21]

    X_train = training_set[feature_cols]
    y_train = training_set[target_col]

    X_val = val_set[feature_cols]
    y_val = val_set[target_col]

    return X_train, y_train, X_val, y_val

def compare_plot(X_val, y_val, y_hat):
    """
    args
    X_val: dataframe containing the validation features; must include a `week_no` column.
    y_val: series containing the true y values, aligned with X_val. It is read back
           as the `emission` column, so it is expected to be named 'emission'.
    y_hat: predicted y values, one per row of X_val.
    return
    no return value, shows plot of predicted vs true values mean aggregated by week_no.
    """
    plt.figure(figsize=(10,5))
    # Work on a copy so the caller's feature frame does not gain an 'emission' column.
    pred = X_val.copy()
    pred['emission'] = y_hat
    true = pd.concat([X_val, y_val], axis=1)
    pred = pred.groupby('week_no').mean()
    true = true.groupby('week_no').mean()
    plt.plot(true.index, true.emission, label='true')
    plt.plot(pred.index, pred.emission, label='pred')
    plt.legend()
    plt.title('True vs. Prediction values')
    plt.show()
# Frame used for modelling: the KMeans-labelled training data.
# To train with the DBSCAN labels instead, copy train_dbs here and
# uncomment the encode_outliers() call below.
base_train = train_km.copy()
# base_train = encode_outliers(base_train)

target = 'emission'
features = ['week_no', 'location', 'cluster', 'country']

## a. With COVID

In [None]:
X_train, y_train, X_val, y_val = split(base_train)

In [None]:
model = RandomForestRegressor(n_estimators=200, random_state=77)
model.fit(X_train, y_train)
y_hat = model.predict(X_val)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_val, y_hat))
print('RMSE =', rmse)

In [None]:
compare_plot(X_val, y_val, y_hat)

## b. Adjustment

In [None]:
# train_adj = train_dbs.copy(deep=True)
# train_adj = encode_outliers(train_adj)

In [None]:
# X_train, y_train, X_val, y_val = split(train_adj)

In [None]:
# model.fit(X_train, y_train)
# y_hat = model.predict(X_val)
# rmse = mean_squared_error(y_val, y_hat, squared=False)
# print('RMSE =', rmse)

In [None]:
# compare_plot(X_val, y_val, y_hat)

Adjustment seems to be helpful in fitting the shifting peaks. We can see that the high error values are located between week 0 - week 9.

## c. COVID Exclusion

In [None]:
# Drop the COVID weeks from the modelling frame, then validate on the 'q1' part of 2021 only.
# NOTE(review): section a. uses the default 'year' split (all of 2021) while this cell uses
# 'q1' (week_no < 21), so the two RMSE values are not measured on the same rows — confirm
# this is intended before comparing them.
ex_basedf = exclude_covid(base_train)
X_train, y_train, X_val, y_val = split(ex_basedf, 'q1')

In [None]:
# Refit the same estimator on the COVID-excluded training rows.
model.fit(X_train, y_train)
y_hat = model.predict(X_val)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_val, y_hat))
print('RMSE =', rmse)

In [None]:
compare_plot(X_val, y_val, y_hat)

We can see that by excluding COVID, the noise is reduced and the predictions fit the true values better. For example, the week 22 prediction without exclusion is very low compared to the one with exclusion.

## d. Adjustment and COVID Exclusion

In [None]:
# ex_adjdf = exclude_covid(train_adj)
# X_train, y_train, X_val, y_val = split(ex_adjdf)

In [None]:
# model.fit(X_train, y_train)
# y_hat = model.predict(X_val)
# rmse = mean_squared_error(y_val, y_hat, squared=False)
# print('RMSE =', rmse)

In [None]:
# compare_plot(X_val, y_val, y_hat)

RMSE hits ~15 when adjusted and excluded. However, we can still see higher values in the early weeks. I suspect that this is due to not capturing some lower/higher trends in the clusters. We can see from the previous plots by cluster that 2021 is the recovery phase. The early weeks of 2021 are highly sensitive to this.

## e. Training on 2019 - 2021

In [None]:
# Train on every row of the COVID-excluded frame (2019 - 2021).
# Concatenating X_train and X_val from the preceding 'q1' split would leave out
# the 2021 rows with week_no >= 21, so select from ex_basedf directly.
X_train_a = ex_basedf[features]
y_train_a = ex_basedf[target]

In [None]:
# Final fit. The validation rows are part of the training data here, so the
# RMSE below is an in-sample score, not an estimate of generalisation error.
model.fit(X_train_a, y_train_a)
y_hat = model.predict(X_val)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_val, y_hat))
print('RMSE =', rmse)

In [None]:
compare_plot(X_val, y_val, y_hat)

Even when trained on the whole training set, the early-week predictions are still higher than the true values.

# 4. Submission

In [None]:
# Attach the KMeans label of each location to the test rows. Locations without
# a label after the left join get the extra label n_clusters_km; only the
# 'cluster' column is filled so other columns keep their missing values.
test_km = test_enc.merge(cluster_df1['cluster'].reset_index(), on=['latitude', 'longitude'], how='left')
test_km['cluster'] = test_km['cluster'].fillna(n_clusters_km)
# test_dbs = encode_outliers(test_dbs)
test_km.head()

In [None]:
y_pred_test = model.predict(test_km[features])
# Keep the default integer index for now: the peak-adjustment masks built from
# test_km1 are aligned on it. The ID index from test_enc is attached afterwards.
sub_df = pd.DataFrame({'emission': y_pred_test})
sub_df

In [None]:
# visualising the predictions on 2022 data
test_km1 = test_km.copy()
test_km1['emission'] = y_pred_test
# Both frames carry their own 0..n-1 index; renumber the rows so the combined
# frame has unique index labels.
comb = pd.concat([train_km, test_km1], ignore_index=True)
plt.figure(figsize=(10,5))
sns.lineplot(data=comb, x='week_no', y='emission', hue='year', errorbar=None, palette='Dark2')

In [None]:
# visualising predictions on first 20 weeks of 2022 data (Public LB)
fig, ax = plt.subplots(4, 2, figsize=(17, 12))
ax = ax.ravel()
for cluster_id in range(n_clusters_km):
    early_weeks = comb.loc[(comb.cluster == cluster_id) & (comb.week_no < 20)]
    # week_no is passed as strings so seaborn treats the weeks as categories.
    sns.lineplot(
        data=early_weeks,
        x=early_weeks['week_no'].astype(str),
        y='emission',
        hue='year',
        errorbar=None,
        palette='Dark2',
        ax=ax[cluster_id],
    )
    ax[cluster_id].set_title(f'Cluster {cluster_id}')
fig.tight_layout()
plt.show()

In [None]:
# adjusting peak in week 13 and 9 on clusters
# The selected weeks are blanked out and then refilled from the neighbouring row:
# backward fill takes the value of the next row, forward fill the previous one.
# DataFrame.bfill()/ffill() replace the deprecated fillna(method=...) form.
comb2 = comb.copy()
comb2.loc[(comb['cluster'].isin([1,4]))&(comb['week_no']==13), 'emission'] = np.nan
comb2.loc[(comb['cluster'].isin([3,7]))&(comb['week_no']==9), 'emission'] = np.nan
comb2 = comb2.bfill()

comb2.loc[(comb['cluster'].isin([6]))&(comb['week_no']==24), 'emission'] = np.nan
comb2 = comb2.ffill()

fig, ax = plt.subplots(4,2, figsize=(17,12))
ax = ax.ravel()
for i in range(n_clusters_km):
    df_plot = comb2.loc[(comb2.cluster==i)&(comb2.week_no<21)]
    sns.lineplot(data=df_plot, x=df_plot['week_no'].astype(str), y='emission', 
                 hue='year', errorbar=None, palette='Dark2', ax=ax[i])
    ax[i].set_title(f'Cluster {i}')
fig.tight_layout()
plt.show()

In [None]:
# applying to our submission df
# The masks come from test_km1, so sub_df must still have its default integer
# index here (run this before the ID index is attached).
# DataFrame.bfill()/ffill() replace the deprecated fillna(method=...) form.
sub_df.loc[(test_km1['cluster'].isin([1,4]))&(test_km1['week_no']==13), 'emission'] = np.nan
sub_df.loc[(test_km1['cluster'].isin([3,7]))&(test_km1['week_no']==9), 'emission'] = np.nan
sub_df = sub_df.bfill()
sub_df.loc[(test_km1['cluster'].isin([6]))&(test_km1['week_no']==24), 'emission'] = np.nan
sub_df = sub_df.ffill()

In [None]:
# Attach the row identifiers from test_enc as the submission index.
sub_df = sub_df.set_index(test_enc.index)
sub_df

In [None]:
sub_df.to_csv('submission.csv')