# Sales Forecasting challenge for Novartis Datathon
- The dataset includes Jan 2012 – Dec 2017 anonymized data on sales, as well as Jan 2012 – Dec 2018 anonymized data on investments.
- The challenge is to forecast 2018 sales per country per brand and per month.

##### How close can you get to the real 2018 sales?

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [4]:
df=pd.read_excel("datathon.xlsx",skiprows=3)
df.head(2)

Unnamed: 0,Cluster,Brand Group,Country,Function,Jan 2012,Feb 2012,Mar 2012,Unnamed: 7,Apr 2012,May 2012,...,Unnamed: 113,Jul 2018,Aug 2018,Sep 2018,Unnamed: 117,Oct 2018,Nov 2018,Dec 2018,Unnamed: 121,Unnamed: 122
0,Cluster 3,Brand Group 12,Country 19,Sales 1,1861.328751,1995.945017,1998.533792,5855.80756,2019.243986,2425.681558,...,,,,,,,,,,
1,Cluster 3,Brand Group 12,Country 19,Sales 2,1316.247027,1450.223283,1440.837462,4207.307772,1564.066421,2014.132904,...,,,,,,,,,,


##  Processing Sales Dataset
- Delete "Unnamed columns" (Total per month)
- Filter by "Sales 2" values.
- Drop values from 2018 (nulls).
- Group by (Cluster, Brand Group) and sum de values.
- Make pivot table.

In [7]:
df = df[df.columns.drop(list(df.filter(regex='Unnamed')))]
dfsales2=df.loc[df['Function'] == "Sales 2"]
dfsales2 = dfsales2[dfsales2.columns.drop(list(dfsales2.filter(regex='2018')))]
dfsales2=dfsales2.groupby(['Cluster','Brand Group']).agg('sum')
dfsales2.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Jan 2012,Feb 2012,Mar 2012,Apr 2012,May 2012,Jun 2012,Jul 2012,Aug 2012,Sep 2012,Oct 2012,...,Mar 2017,Apr 2017,May 2017,Jun 2017,Jul 2017,Aug 2017,Sep 2017,Oct 2017,Nov 2017,Dec 2017
Cluster,Brand Group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Cluster 1,Brand Group 12,616.408961,685.081035,968.93887,723.701936,656.607506,319.401731,953.191022,658.304347,837.083837,943.346418,...,1494.614561,1583.525099,1396.117515,1471.551885,1632.173591,1627.574308,1608.715411,1538.061964,1574.399619,1381.174249
Cluster 1,Brand Group 13,62.38147,69.445132,42.08108,55.139918,65.67736,36.194606,46.670999,62.483972,49.465439,58.313315,...,5.496543,14.480741,9.765432,11.74642,7.310123,10.965185,5.803457,5.608148,8.537778,6.361481


In [8]:
dfmelt=pd.melt(dfsales2.reset_index(), id_vars=["Cluster", "Brand Group"], 
                  var_name="Date", value_name="Value_Y")
dfmelt.head(5)

Unnamed: 0,Cluster,Brand Group,Date,Value_Y
0,Cluster 1,Brand Group 12,Jan 2012,616.408961
1,Cluster 1,Brand Group 13,Jan 2012,62.38147
2,Cluster 1,Brand Group 15,Jan 2012,0.0
3,Cluster 1,Brand Group 16,Jan 2012,2457.695992
4,Cluster 1,Brand Group 17,Jan 2012,0.0


## Encode months
- Encode month based on dictionary
- Add month number (0-11)

In [9]:
dictmonths=dict(zip(dfmelt["Date"].unique(),range(len(dfmelt["Date"].unique()))))

In [10]:
dfmelt["Date_coded"]=dfmelt["Date"].apply(lambda x: dictmonths[x])

In [11]:
dfmelt["Month"]=dfmelt["Date_coded"].apply(lambda x: x%12)

In [12]:
dfmelt.head(2)

Unnamed: 0,Cluster,Brand Group,Date,Value_Y,Date_coded,Month
0,Cluster 1,Brand Group 12,Jan 2012,616.408961,0,0
1,Cluster 1,Brand Group 13,Jan 2012,62.38147,0,0


## Add lag values
- Value_Y is the value to predict
- Previous month is the "actual value"
- Lag values for the last six months (lag-1 --> lag-6)

In [13]:
dfmelt["Actual"] = np.nan
dfmelt["Lag-1"] = np.nan
dfmelt["Lag-2"] = np.nan
dfmelt["Lag-3"] = np.nan
dfmelt["Lag-4"] = np.nan
dfmelt["Lag-5"] = np.nan
dfmelt["Lag-6"] = np.nan

In [15]:
## Function to add a specific lag value.
def add_lag_values(dfpre,lag,lagname):
    for i in range (len (dfpre)):
        if dfpre.iloc[i].Date_coded>12:
            dfpre.iloc[i,dfpre.columns.get_loc(lagname)]=dfpre[(dfpre["Cluster"]==dfpre.iloc[i].Cluster) 
                   & (dfpre["Brand Group"]==dfpre.iloc[i,dfpre.columns.get_loc("Brand Group")])
                                                                &(dfpre["Date_coded"]==dfpre.iloc[i].Date_coded-lag)]["Value_Y"].values[0]
    return dfpre

In [16]:
dfpre=add_lag_values(dfmelt,1,'Actual')
dfpre=add_lag_values(dfpre,2,'Lag-1')
dfpre=add_lag_values(dfpre,3,'Lag-2')
dfpre=add_lag_values(dfpre,4,'Lag-3')
dfpre=add_lag_values(dfpre,5,'Lag-4')
dfpre=add_lag_values(dfpre,6,'Lag-5')
dfpre=add_lag_values(dfpre,7,'Lag-6')

In [1570]:
#dfpre.to_pickle("lags_pre_12")

In [437]:
#dfpre=pd.read_pickle("lags_pre_12")

## Drop nan values

In [18]:
dfpre=dfpre.dropna()

## Adjust month feature
- Month - 1 (The month is from the "Actual" value no the value we want to predict (Value_Y)

In [20]:
dfpre['Month']=dfpre['Month'].apply(lambda x: 11 if x == 0 else x-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [21]:
dfpre.head(2)

Unnamed: 0,Cluster,Brand Group,Date,Value_Y,Date_coded,Month,Actual,Lag-1,Lag-2,Lag-3,Lag-4,Lag-5,Lag-6
3939,Cluster 1,Brand Group 12,Feb 2013,886.812584,13,0,745.704077,1084.703889,946.821916,943.346418,837.083837,658.304347,953.191022
3940,Cluster 1,Brand Group 13,Feb 2013,32.448225,13,0,47.174759,46.998244,80.600039,58.313315,49.465439,62.483972,46.670999


## Feature engineering: Add sales trends
-  Add short-trend sales: Trend using last 3 months
-  Add mid-trend sales: Trend using last 6 months

In [22]:
def add_trend(df, name,list_lags):
    df[name] = df['Actual']

    lag_list=[1,2,3]

    for lag in [1,2,3]:
        ft_name = ('Lag-%s' % lag)
        df[name] -= df[ft_name]

    df[name] /= len(lag_list) + 1
    return df

In [23]:
dfpre=add_trend(dfpre,"short_sales_trend",[1,2,3])
dfpre=add_trend(dfpre,"mid_sales_trend",[1,2,3,4,5,6])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [446]:
#dfpre['Date_coded']=dfpre['Date_coded'].apply(lambda x: x-1)

## Separate Features and labels

In [24]:
yvalues =dfpre["Value_Y"]
xvalues = dfpre.drop(labels=['Value_Y'],axis=1)

## Divide in train and test

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(xvalues, yvalues, test_size=0.15, shuffle=False)

## Get only feature columns

In [26]:
X_train=X_train[['Month', 'Actual',
       'Lag-1', 'Lag-2', 'Lag-3', 'Lag-4', 'Lag-5', 'Lag-6', 'short_sales_trend','mid_sales_trend']]
X_valid=X_valid[['Month', 'Actual',
       'Lag-1', 'Lag-2', 'Lag-3', 'Lag-4', 'Lag-5', 'Lag-6', 'short_sales_trend','mid_sales_trend']]

## List of models for ensembling

In [27]:
models=[]

# Train Support Vector Machine for Regression (SVR)

In [28]:
from xgboost import XGBRegressor
from sklearn.svm import SVR

In [31]:
regres=SVR(kernel = 'rbf',C=10000,gamma=.000000009)
regres.fit(X_train,y_train)

SVR(C=10000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=9e-09,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [32]:
predTest=regres.predict(X_valid)

In [33]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
print("MAE:",mean_absolute_error(y_valid, predTest))
print("MSE:",mean_squared_error(y_valid, predTest))

MAE: 258.27938694401496
MSE: 556718.8424905108


In [34]:
models.append(regres)

## Train XGBoost Regressor 

#### Hyperparameter Tunning

In [1375]:
xfrom sklearn.model_selection import GridSearchCV
model = XGBRegressor(eval_metric = 'rmse', early_stopping_rounds = 20)
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [6, 7],
              'min_child_weight': [15,20],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'n_estimators': [1000]}

xgb_grid = GridSearchCV(model,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

xgb_grid.fit(X_train,y_train,  eval_metric="rmse")

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  1.2min
[Parallel(n_jobs=5)]: Done  54 out of  54 | elapsed:  1.6min finished


0.9657134384098217
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 15, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}


#### Normal training with best parameters

In [35]:
model = XGBRegressor(
    max_depth=6,
    learning_rate=0.03,
    n_estimators=500,
    min_child_weight=17, 
    colsample_bytree=0.7, 
    subsample=0.7, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
  
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=True, 
    early_stopping_rounds = 30)

[0]	validation_0-rmse:3971.89	validation_1-rmse:4556.83
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 30 rounds.
[1]	validation_0-rmse:3858.17	validation_1-rmse:4427.14
[2]	validation_0-rmse:3747.46	validation_1-rmse:4299.14
[3]	validation_0-rmse:3640.48	validation_1-rmse:4176.71
[4]	validation_0-rmse:3536.89	validation_1-rmse:4057.63
[5]	validation_0-rmse:3436.67	validation_1-rmse:3943.06
[6]	validation_0-rmse:3339.51	validation_1-rmse:3831.15
[7]	validation_0-rmse:3244.77	validation_1-rmse:3721.38
[8]	validation_0-rmse:3152.84	validation_1-rmse:3617.35
[9]	validation_0-rmse:3064.39	validation_1-rmse:3516.63
[10]	validation_0-rmse:2978.5	validation_1-rmse:3416.49
[11]	validation_0-rmse:2895.37	validation_1-rmse:3322.77
[12]	validation_0-rmse:2814.92	validation_1-rmse:3231.7
[13]	validation_0-rmse:2736.24	validation_1-rmse:3142.71
[14]	validation_0-rmse:2660.06	validation_1-rmse:3055.5

[141]	validation_0-rmse:533.694	validation_1-rmse:771.437
[142]	validation_0-rmse:533.069	validation_1-rmse:771.445
[143]	validation_0-rmse:532.514	validation_1-rmse:771.197
[144]	validation_0-rmse:532.047	validation_1-rmse:770.975
[145]	validation_0-rmse:531.588	validation_1-rmse:770.696
[146]	validation_0-rmse:531.19	validation_1-rmse:770.788
[147]	validation_0-rmse:530.546	validation_1-rmse:770.701
[148]	validation_0-rmse:530.066	validation_1-rmse:771.174
[149]	validation_0-rmse:529.664	validation_1-rmse:771.433
[150]	validation_0-rmse:529.059	validation_1-rmse:771.245
[151]	validation_0-rmse:528.64	validation_1-rmse:771.109
[152]	validation_0-rmse:528.371	validation_1-rmse:771.129
[153]	validation_0-rmse:528.01	validation_1-rmse:770.826
[154]	validation_0-rmse:527.216	validation_1-rmse:770.72
[155]	validation_0-rmse:526.752	validation_1-rmse:770.682
[156]	validation_0-rmse:526.084	validation_1-rmse:770.572
[157]	validation_0-rmse:525.724	validation_1-rmse:770.599
[158]	validation_0

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eta=0.3, gamma=0, learning_rate=0.03,
       max_delta_step=0, max_depth=6, min_child_weight=17, missing=None,
       n_estimators=500, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=42, silent=True, subsample=0.7)

In [36]:
models.append(model)

## Train RandomForest Regressor

In [1416]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(max_depth=14, random_state=0, n_estimators=75)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [1417]:
p = regr.predict(X_valid)
print("MAE:",mean_absolute_error(y_valid, p))
print("MSE:",mean_squared_error(y_valid, p))

MAE: 270.43729730640075
MSE: 471365.5167566193


## Train LGBMRegressor

In [1429]:
from lightgbm import LGBMRegressor
mdl = LGBMRegressor( task= 'train',boosting_type= 'gbdt',objective= 'regression',num_leaves= 300, learning_rate= 0.32,feature_fraction= 0.9,
                    bagging_fraction= .9,bagging_freq= 70,verbose= 100)
mdl.fit(X_train, np.log1p(y_train))



LGBMRegressor(bagging_fraction=0.9, bagging_freq=70, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
       importance_type='split', learning_rate=0.32, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=300, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
       task='train', verbose=100)

In [1434]:
p = mdl.predict(X_valid)
print("MAE:",mean_absolute_error(y_valid, p))
print("MSE:",mean_squared_error(y_valid, p))

MAE: 1933.2745963739865
MSE: 19682032.82794994


________________________________

# Prediction dataset

In [37]:
dfsales2=dfsales2.groupby(['Cluster','Brand Group']).agg('sum')

In [38]:
dfpred=pd.melt(dfsales2.reset_index(), id_vars=["Cluster", "Brand Group"], 
                  var_name="Date", value_name="Value")

In [39]:
dfindexes=dfsales2.reset_index()[["Cluster","Brand Group"]]

## Build features

- Get last 7 months for forecasting January 2018: December + 6 lag values

In [43]:
#      'Jan 2017', 'Feb 2017', 'Mar 2017', 'Apr 2017', 'May 2017', 
dfpred=dfsales2.reset_index()[[
 'Jun 2017', 'Jul 2017', 'Aug 2017', 'Sep 2017', 'Oct 2017', 'Nov 2017', 'Dec 2017']]

- Set month code to 11 (December)

In [44]:
dfpred["Month"]=11

- Set feature names

In [45]:
#       'Lag-11', 'Lag-10', 'Lag-9', 'Lag-8', 'Lag-7',
dfpred.columns=[
 'Lag-6', 'Lag-5', 'Lag-4',
       'Lag-3', 'Lag-2', 'Lag-1','Actual','Month']

- Build trend features

In [46]:
dfpred=add_trend(dfpred,"short_sales_trend",[1,2,3])
dfpred=add_trend(dfpred,"mid_sales_trend",[1,2,3,4,5,6])

In [47]:
dfpred.head(3)

Unnamed: 0,Lag-6,Lag-5,Lag-4,Lag-3,Lag-2,Lag-1,Actual,Month,short_sales_trend,mid_sales_trend
0,1471.551885,1632.173591,1627.574308,1608.715411,1538.061964,1574.399619,1381.174249,11,-835.000686,-835.000686
1,11.74642,7.310123,10.965185,5.803457,5.608148,8.537778,6.361481,11,-3.396975,-3.396975
2,-193.611296,180.088396,-126.788616,35.634313,177.38297,48.785139,-265.362597,11,-131.791255,-131.791255


### Function to add new prediction to dataframe and continue iterating

In [48]:
def add_values(pred_x, new_pred,month):
    pred_x['Month']=month
    
    pred_x["Lag-6"]=pred_x["Lag-5"]
    pred_x["Lag-5"]=pred_x["Lag-4"]
    pred_x["Lag-4"]=pred_x["Lag-3"]
    pred_x["Lag-3"]=pred_x["Lag-2"]
    pred_x["Lag-2"]=pred_x["Lag-1"]
    pred_x["Lag-1"]=pred_x["Actual"]
    pred_x["Actual"]=new_pred

    pred_x=add_trend(pred_x,"short_sales_trend",[1,2,3])
    pred_x=add_trend(pred_x,"mid_sales_trend",[1,2,3,4,5,6])

    return pred_x

In [54]:
list_predictions=[]

In [51]:
# 'Lag-7', 'Lag-8','Lag-9', 'Lag-10', 'Lag-11',
xpred=dfpred[['Month', 'Actual',
       'Lag-1', 'Lag-2', 'Lag-3', 'Lag-4', 'Lag-5', 'Lag-6', 'short_sales_trend','mid_sales_trend']]

## Make iterated prediction
- Forecast next month and use the prediction to make the forecasting of the following month
- Make average from predictions of model list

In [59]:
for i in range(12):
    preds = [ m.predict(xpred) for m in models ]
    preds = np.mean(preds,axis=0)
    #print(preds)
    #preds=regres.predict(xpred)
    list_predictions.append(preds)
    xpred=add_values(xpred, preds,i)
print(str(len(list_predictions))+ " months forecasted")

12 months forecasted


## Add predictions to dataframe

In [65]:
for d in range(len(list_predictions)):
    dfindexes['Month-%s' % d]=list_predictions[d]
dfindexes.head(2)

Unnamed: 0,Cluster,Brand Group,Month-0,Month-1,Month-2,Month-3,Month-4,Month-5,Month-6,Month-7,Month-8,Month-9,Month-10,Month-11
0,Cluster 1,Brand Group 12,1427.080889,1416.320656,1417.975999,1413.123843,1407.94019,1402.379285,1394.692753,1381.074859,1382.389444,1379.024398,1376.138039,1374.418002
1,Cluster 1,Brand Group 13,76.634345,82.043334,84.551089,86.758763,88.414902,89.524572,92.370598,90.145255,93.818287,94.456372,95.039358,95.698797


## Prepare submission

In [487]:
import numpy as np
dir_submission_template = 'Data_Novartis_Datathon-Results_Challenge1_Template.csv'

submission_template = pd.read_csv(dir_submission_template)

bg_set = set()

#a = pd.DataFrame(final_res, columns=['Cluster', 'Brand Group', 'res'])

for i, r in submission_template.iterrows():
    bg = r['Brand Group'][12:].replace(',', '').split(' ')
    bg_set.update(bg)
    bg = [ 'Brand Group ' + str(i) for i in bg ]
    print(r['Cluster'], r['Brand Group'], bg)
    
#    print(type(r['res']))
    
    res = dfindexes[dfindexes['Cluster'] == r['Cluster']]
    res = [ rs[2:] for j, rs in res.iterrows() if rs['Brand Group'] in bg ]
#    res = res['res']
    res = np.sum(res, axis=0)
    submission_template.iloc[i, 2:] = res
    
    
for i, r in submission_template[submission_template['Brand Group'] == 'others'].iterrows():
    print(r.Cluster)
    
    res = dfindexes[dfindexes['Cluster'] == r['Cluster']]
    res1 = [ rs[2:] for j, rs in res.iterrows() if rs['Brand Group'][12:] not in bg_set ]

    res2 = np.sum(res1, axis=0)
    submission_template.iloc[i, 2:] = res2

Cluster 1 Brand Group 17 ['Brand Group 17']
Cluster 2 Brand Group 17 ['Brand Group 17']
Cluster 3 Brand Group 17 ['Brand Group 17']
Cluster 4 Brand Group 17 ['Brand Group 17']
Cluster 5 Brand Group 17 ['Brand Group 17']
Cluster 8 Brand Group 17 ['Brand Group 17']
Cluster 9 Brand Group 17 ['Brand Group 17']
Cluster 10 Brand Group 17 ['Brand Group 17']
Cluster 1 Brand Group 24 ['Brand Group 24']
Cluster 3 Brand Group 24 ['Brand Group 24']
Cluster 4 Brand Group 24 ['Brand Group 24']
Cluster 5 Brand Group 24 ['Brand Group 24']
Cluster 8 Brand Group 24 ['Brand Group 24']
Cluster 9 Brand Group 24 ['Brand Group 24']
Cluster 10 Brand Group 24 ['Brand Group 24']
Cluster 1 Brand Group 30 ['Brand Group 30']
Cluster 2 Brand Group 30 ['Brand Group 30']
Cluster 4 Brand Group 30 ['Brand Group 30']
Cluster 5 Brand Group 30 ['Brand Group 30']
Cluster 7 Brand Group 30 ['Brand Group 30']
Cluster 8 Brand Group 30 ['Brand Group 30']
Cluster 9 Brand Group 30 ['Brand Group 30']
Cluster 10 Brand Group 30 ['Br

In [488]:
submission_template.to_csv("submission_merged.csv",index=False)