## Imports

In [160]:
pip install catboost



In [None]:
pip install pyupset



In [None]:
!pip install upsetplot

In [None]:
import gc
import os
import itertools
import pickle
import re
import time

import warnings
warnings.filterwarnings('ignore')

from random import choice, choices
from functools import reduce
from tqdm import tqdm
from itertools import cycle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from upsetplot import UpSet
%matplotlib inline

from functools import reduce
from tqdm import tqdm
from itertools import cycle
from scipy import stats
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import decomposition
from sklearn import tree


from catboost import CatBoostRegressor, Pool

pd.set_option("display.max_columns", None)

plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(plt.rcParams["axes.prop_cycle"].by_key()["color"])

## Data Loading

In [None]:
# Read the five raw CSV exports from the Colab working directory.
# NOTE(review): the file names are opaque upload ids; judging by later usage
# `ec_df` holds the energy target, `cl_df`/`bs_df` the per-cell data, `pcp_df`
# the rows to predict and `ss_df` the submission template — confirm.
pcp_df, cl_df, bs_df, ec_df, ss_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/imgs_202307101549519358.csv",
        "/content/imgs_2023071012130978799.csv",
        "/content/imgs_2023071012123392536.csv",
        "/content/imgs_2023071012133740345.csv",
        "/content/SampleSubmission (2).csv",
    )
)

# Cell output: (rows, columns) of every frame, in loading order.
tuple(frame.shape for frame in (pcp_df, cl_df, bs_df, ec_df, ss_df))

In [None]:
ec_df.head()

In [None]:
cl_df.head()

In [None]:
bs_df.head()

In [None]:
pcp_df.head()

In [None]:
pcp_df['w'].value_counts()

## Data Pre-Processing

In [None]:
def rename_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Every label is lower-cased and its spaces are replaced by underscores.
    The frame passed in is the frame returned (no copy is made).
    """
    def _normalise(label):
        return label.lower().replace(' ', '_')

    df.rename(columns=_normalise, inplace=True)
    return df

In [None]:
# Give all four raw tables the same lower-case, underscore-separated labels.
pcp_df, cl_df, bs_df, ec_df = map(rename_columns, (pcp_df, cl_df, bs_df, ec_df))

In [None]:
cl_df.head()

In [None]:
bs_df.head()

In [None]:
ec_df.head()

In [None]:
pcp_df.head()

In [None]:
# Distinct `bs` ids present in the energy table and in the `pcp_df` table.
train_bs_set = set(ec_df['bs'].tolist())
test_bs_set = set(pcp_df['bs'].tolist())

# Two-circle Venn diagram showing how much the two id sets overlap.
venn2(subsets=[train_bs_set, test_bs_set], set_labels=('Train BS', 'Test BS'))
plt.show()

In [None]:
# Convert the `time` column of every time-stamped table to datetime values.
for frame in (cl_df, ec_df, pcp_df):
    frame['time'] = pd.to_datetime(frame['time'])

In [None]:
# The two `bs` id sets built for the Venn diagram, keyed by display name.
# NOTE(review): `data` is not referenced again in the visible notebook —
# presumably intended as input for an UpSet plot; confirm or remove.
data = {
    'Train BS': train_bs_set,
    'Test BS': test_bs_set
}


In [None]:
# Add the `bs_df` columns to every `cl_df` row, matching on (bs, cellname).
# The left join keeps every row of `cl_df`, matched or not.
cl_bs_df = pd.merge(cl_df, bs_df, how='left', on=['bs', 'cellname'])
cl_bs_df.head()

In [None]:
cl_bs_df[cl_bs_df['bs'] == 'B_0']

In [None]:
cl_bs_df.groupby("bs")['cellname'].nunique()

In [None]:
cl_bs_df.groupby("bs")['cellname'].nunique().value_counts()

In [None]:
ec_df.head()

In [None]:
cl_bs_df.head()

In [None]:
# Long -> wide: one row per (time, bs), one column per (measurement, cellname).
per_cell_values = [
    'load', 'esmode1', 'esmode2', 'esmode3',
    'esmode4', 'esmode5', 'esmode6', 'frequency',
    'bandwidth', 'antennas', 'txpower',
]
cl_bs_df = cl_bs_df.pivot(
    index=['time', 'bs'],
    columns=['cellname'],
    values=per_cell_values,
)
cl_bs_df = cl_bs_df.reset_index()

# Flatten the two-level labels into "<measurement>_<cellname>".  The former
# index columns have an empty second level, which leaves a trailing "_" that
# is stripped again.
flat_labels = ['_'.join(str(level) for level in label) for label in cl_bs_df.columns]
cl_bs_df.columns = pd.Index(flat_labels).str.strip('_')
cl_bs_df = rename_columns(cl_bs_df)

# `rutype` and `mode` are attached once per `bs`, using the value returned by
# GroupBy.first() for each base station.
bs_level_info = bs_df.groupby('bs')[['rutype', 'mode']].first().reset_index()
cl_bs_df = cl_bs_df.merge(bs_level_info, on='bs', how='left')

In [None]:
cl_bs_df.head()

In [None]:
cl_bs_df.describe()

In [None]:
ec_df.head()

In [None]:
# Left join onto the wide feature table: every (time, bs) feature row is kept,
# and columns coming from `ec_df` are NaN where it has no matching row.
df = cl_bs_df.merge(ec_df, on=['time', 'bs'], how='left')

In [None]:
df.head()

In [None]:
df['energy'].isna().value_counts()

In [None]:
# Rows without an energy value are labelled 'test', all other rows 'train'.
df['split'] = df['energy'].isna().map({True: 'test', False: 'train'})

In [None]:
ss_df.shape

In [None]:
ec_df.shape

## EDA

In [None]:
df['energy'].describe()

In [None]:

# Energy distribution: 30-bin histogram with a KDE curve on top.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(df['energy'], bins=30, kde=True, ax=ax)
plt.show()

In [None]:
# Same column drawn with pandas' own histogram helper and its default bins.
fig, ax = plt.subplots(figsize=(15, 5))
df['energy'].hist(ax=ax)
plt.show()

In [None]:
# Plain matplotlib version of the energy histogram, with axis labels.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(df['energy'], bins=30, edgecolor='k', alpha=0.7)
ax.set_xlabel('Energy')
ax.set_ylabel('Frequency')
ax.set_title('Energy Histogram')
ax.grid(axis='y')
plt.show()

In [None]:
# Seaborn histogram of energy with automatically chosen bins.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=df, x='energy', ax=ax)
plt.show()

In [None]:
# Raw energy readings against time, drawn as one connected blue line.
fig, ax = plt.subplots(figsize=(20, 5))
line_style = dict(color='b', marker='o', linestyle='-', linewidth=2, markersize=5)
ax.plot(df['time'], df['energy'], **line_style)
ax.set_xlabel('Time')
ax.set_ylabel('Energy')
ax.set_title('Energy over Time')
ax.grid(True)
plt.show()

In [None]:
# Seaborn line plot of energy over time for the whole frame.
fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df, x='time', y='energy', ax=ax)
plt.show()

In [None]:
# Energy over time, one coloured line per `rutype` value.
fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df, x='time', y='energy', hue='rutype', ax=ax)
plt.show()

In [None]:
unique_rutypes = df['rutype'].unique()

# One matplotlib line per distinct `rutype`, all on the same axes.
fig, ax = plt.subplots(figsize=(20, 5))
line_style = dict(marker='o', linestyle='-', linewidth=2, markersize=5)

for rutype in unique_rutypes:
    rutype_rows = df.loc[df['rutype'] == rutype]
    ax.plot(rutype_rows['time'], rutype_rows['energy'], label=rutype, **line_style)

ax.set_xlabel('Time')
ax.set_ylabel('Energy')
ax.set_title('Energy over Time')

# The legend maps each line back to its `rutype`.
ax.legend(title='RuType', title_fontsize='13', loc='upper left')
ax.grid(True)

plt.show()

In [None]:
# Energy over time, one coloured line per `mode` value.
fig, ax = plt.subplots(figsize=(20, 5))
sns.lineplot(data=df, x='time', y='energy', hue='mode', ax=ax)
plt.show()

In [None]:
unique_modes = df['mode'].unique()

# One matplotlib line per distinct `mode`, all on the same axes.
fig, ax = plt.subplots(figsize=(20, 5))
line_style = dict(marker='o', linestyle='-', linewidth=2, markersize=5)

for mode in unique_modes:
    mode_rows = df.loc[df['mode'] == mode]
    ax.plot(mode_rows['time'], mode_rows['energy'], label=mode, **line_style)

ax.set_xlabel('Time')
ax.set_ylabel('Energy')
ax.set_title('Energy over Time based on Modes')

# The legend maps each line back to its `mode`.
ax.legend(title='Mode', title_fontsize='13', loc='upper left')
ax.grid(True)

plt.show()

In [None]:
# One figure per cell: energy over time, coloured by that cell's antenna count.
for antennas_col in ('antennas_cell0', 'antennas_cell1', 'antennas_cell2', 'antennas_cell3'):
    plt.figure(figsize=(20, 3))
    sns.lineplot(data=df, x='time', y='energy', hue=antennas_col)
    plt.show()

In [None]:
antenna_cols = ['antennas_cell0', 'antennas_cell1', 'antennas_cell2', 'antennas_cell3']

# For every antenna column: one figure with one line per distinct value.
for antenna_col in antenna_cols:
    fig, ax = plt.subplots(figsize=(20, 3))

    for antenna in df[antenna_col].unique():
        # Rows whose current antenna column equals this value.
        antenna_rows = df.loc[df[antenna_col] == antenna]
        ax.plot(
            antenna_rows['time'],
            antenna_rows['energy'],
            marker='',
            linestyle='-',
            linewidth=2,
            label=antenna,
        )

    ax.set_xlabel('Time')
    ax.set_ylabel('Energy')
    ax.set_title(f'Energy over Time based on {antenna_col}')

    # The legend maps each line back to its antenna value.
    ax.legend(title=antenna_col, title_fontsize='13', loc='upper left')
    ax.grid(True)

    plt.show()

In [None]:
# One figure per cell: energy over time, coloured by that cell's bandwidth.
for bandwidth_name in ('bandwidth_cell0', 'bandwidth_cell1', 'bandwidth_cell2', 'bandwidth_cell3'):
    plt.figure(figsize=(20, 3))
    sns.lineplot(data=df, x='time', y='energy', hue=bandwidth_name)
    plt.show()

In [None]:


# Bandwidth column of each of the four cells.
bandwidth_cols = ['bandwidth_cell0', 'bandwidth_cell1', 'bandwidth_cell2', 'bandwidth_cell3']

# Energy over time in a separate figure per bandwidth column.
for bandwidth_col in bandwidth_cols:
    fig, ax = plt.subplots(figsize=(20, 3))
    sns.lineplot(data=df, x='time', y='energy', hue=bandwidth_col, ax=ax)
    plt.show()

In [None]:
# Energy distribution per category, one violin plot per categorical feature.
violin_features = ['rutype', 'mode', 'bandwidth_cell0', 'bandwidth_cell1', 'antennas_cell0', 'antennas_cell1']
for fe in violin_features:
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.violinplot(data=df, x=fe, y='energy', ax=ax)
    plt.show()

In [None]:
def plot_cat_train_test_dist(df, fe_name, hue='split'):
    """Show count histograms of one feature for train, test and both together.

    Three panels are drawn side by side: rows with ``split == 'train'``, rows
    with ``split == 'test'``, and the whole frame coloured by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``fe_name``, a ``split`` column and the ``hue`` column.
    fe_name : str
        Column plotted on the x axis (drawn with ``discrete=True``).
    hue : str, default 'split'
        Column used to colour the combined (third) panel only.
    """
    fig, (train_ax, test_ax, combined_ax) = plt.subplots(1, 3, figsize=(20, 5))

    # The first two panels differ only in the split they show and their title.
    single_split_panels = (
        (train_ax, 'train', 'Train Histogram Plot'),
        (test_ax, 'test', 'Test Histogram Plot'),
    )
    for panel_ax, split_name, panel_title in single_split_panels:
        split_rows = df[df['split'] == split_name]
        sns.histplot(data=split_rows, ax=panel_ax, x=fe_name, discrete=True)
        panel_ax.set_title(panel_title)

    # Third panel: all rows, coloured by `hue`.
    sns.histplot(data=df, ax=combined_ax, x=fe_name, hue=hue, discrete=True)
    combined_ax.set_title('Train & Test Histogram Plot')

    plt.show()

In [None]:
# Compare the train and test distribution of every categorical feature.
categorical_features = (
    'rutype',
    'mode',
    'bandwidth_cell0',
    'bandwidth_cell1',
    'antennas_cell0',
    'antennas_cell1',
)

for fe_name in categorical_features:
    plot_cat_train_test_dist(df=df, fe_name=fe_name, hue='split')

In [None]:
# Row counts per (split, rutype) combination; combinations that never occur
# are shown as 0 instead of NaN.
pivot_table_result = df.pivot_table(index='split', columns='rutype', aggfunc='size', fill_value=0)
print(pivot_table_result)


In [None]:
# Energy against the load of each cell, using labelled (train) rows only.
fig, ax = plt.subplots(figsize=(20, 5))

cell_loads = ['load_cell0', 'load_cell1', 'load_cell2', 'load_cell3']
train_rows = df[df['split'] == 'train']
for load in cell_loads:
    sns.scatterplot(data=train_rows, x=load, y='energy', label=load, ax=ax)

ax.legend()
plt.show()


In [None]:
# Energy against the TX power of each cell, using labelled (train) rows only.
fig, ax = plt.subplots(figsize=(20, 5))

txpower_cells = ['txpower_cell0', 'txpower_cell1', 'txpower_cell2', 'txpower_cell3']
train_rows = df[df['split'] == 'train']
for txpower in txpower_cells:
    sns.scatterplot(data=train_rows, x=txpower, y='energy', label=txpower, ax=ax)

ax.legend()
plt.show()


## Modeling

In [None]:
# Separate the labelled rows from the rows to predict.  Explicit copies are
# taken because `test_data` gets new columns assigned further down; writing
# into a filtered slice of `df` would trigger pandas' SettingWithCopy
# behaviour (hidden here only because all warnings are silenced).
train_data = df[df['split'] == 'train'].copy()
test_data = df[df['split'] == 'test'].copy()

train_data.shape, test_data.shape

In [None]:
def feature_enginning(train_df, valid_df, test_df):
    """Build the model-ready feature frames for one train/valid/test triple.

    The three inputs are tagged, stacked and transformed together so that the
    one-hot encoded columns are identical in all returned frames.

    Parameters
    ----------
    train_df, valid_df, test_df : pandas.DataFrame
        Frames with a datetime ``time`` column and ``bs``, ``rutype`` and
        ``mode`` columns.  ``bs`` values are expected to look like ``"B_<int>"``.
        The inputs are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, valid_df, test_df)`` carrying a ``split`` tag, the integer
        ``bs_en`` id and one-hot columns for ``rutype``, ``mode`` and ``hour``.
        Rows keep their original index labels but are ordered by
        ``(time, bs)``, which is not necessarily the order they were passed
        in — callers that align by position must account for that.

    Notes
    -----
    Earlier experiments (target encoding of rutype/mode/hour/bs, and per-``bs``
    rolling mean/std/skew/max over windows 3, 5, 8, 10 and 15 for the antenna,
    load, txpower and shifted energy columns) are currently disabled.
    """
    # `assign` returns new frames, so the callers' objects (typically slices
    # of a larger frame) are never written to.
    tagged_parts = [
        part.assign(hour=part['time'].dt.hour, split=split_name)
        for split_name, part in (
            ('train', train_df),
            ('valid', valid_df),
            ('test', test_df),
        )
    ]

    df = pd.concat(tagged_parts)

    # Integer base-station id: remove the literal "B_" prefix.  The previous
    # `str.strip('B_')` treated "B_" as a set of characters to trim from both
    # ends rather than as a prefix.
    df['bs_en'] = df['bs'].str.replace(r'^B_', '', regex=True).astype(int)

    # Encoding on the stacked frame gives every split the same dummy columns.
    df = pd.get_dummies(df, columns=['rutype', 'mode', 'hour'])

    # Chronological order within the stacked frame (this is the ordering the
    # disabled per-`bs` rolling features were computed on).
    df.sort_values(['time', 'bs'], inplace=True)

    train_df = df[df['split'] == 'train']
    valid_df = df[df['split'] == 'valid']
    test_df = df[df['split'] == 'test']

    return train_df, valid_df, test_df

In [None]:
# Column the model is trained to predict.
target_col = 'energy'
# Columns that are never fed to the model as features: the row identifiers,
# the split tag and the target itself.
drop_cols = ['time', 'bs', 'split', target_col]

In [None]:
# CatBoost settings are identical for every fold, so they are built once.
params = {
    'loss_function': 'MAE',
    'iterations': 10000,
    'depth': 6,
    'learning_rate': 0.1,
    'thread_count': -1,
    'verbose': 100,
    'task_type': 'CPU'
}

# Plain 5-fold split (KFold's default: no shuffling) over `train_data` rows.
kf = model_selection.KFold(n_splits=5)

# Out-of-fold predictions (one slot per `train_data` row, in its row order)
# and one array of test predictions per fold (in `test_data` row order).
oof_valid_preds = np.zeros(train_data.shape[0], )
test_preds_list = []

for i, (train_idx, valid_idx) in enumerate(kf.split(X=train_data)):

    train_df = train_data.iloc[train_idx]
    valid_df = train_data.iloc[valid_idx]
    test_df = test_data.copy()

    train_df, valid_df, test_df = feature_enginning(train_df=train_df, valid_df=valid_df, test_df=test_df)
    train_cols = [col for col in train_df.columns if col not in drop_cols]

    X_train, y_train = train_df[train_cols], train_df[target_col]
    X_valid, y_valid = valid_df[train_cols], valid_df[target_col]
    X_test = test_df[train_cols]

    model = CatBoostRegressor(**params)

    train_pool = Pool(data=X_train, label=y_train)
    val_pool = Pool(data=X_valid, label=y_valid)

    # The validation fold doubles as the early-stopping set.
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100)

    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)

    val_score = metrics.mean_absolute_error(y_valid, valid_preds)

    # feature_enginning re-sorts its rows, so predictions are written back via
    # the row labels instead of assuming the input order survived.  This needs
    # unique index labels; get_indexer raises if they are not.
    oof_valid_preds[train_data.index.get_indexer(X_valid.index)] = valid_preds
    aligned_test_preds = np.empty(test_data.shape[0])
    aligned_test_preds[test_data.index.get_indexer(X_test.index)] = test_preds
    test_preds_list.append(aligned_test_preds)

    print("=*"*50)
    print(f"Fold : {i}")
    print("Valid score : ", val_score)

# Overall MAE of the out-of-fold predictions across all training rows.
oof_score = metrics.mean_absolute_error(train_data[target_col], oof_valid_preds)
print("_-*"*50)
print("OOF score : ", oof_score)

In [None]:
test_preds_list

In [None]:
test_preds_mean = np.mean(test_preds_list, axis=0)

In [None]:
test_preds_mean

In [None]:
test_preds_mean.shape

In [None]:
test_data.shape

In [None]:
test_data['Energy'] = test_preds_mean

In [None]:
test_data['Energy'].hist()

In [None]:
ss_df.head()

In [None]:
test_data['ID'] = test_data['time'].astype('str') + '_' + test_data['bs']

In [None]:
test_data[['ID', 'Energy']]

In [None]:
# Keep the template's ID column (and its row order) and pull in the matching
# predictions.  With a left join, an ID that has no counterpart in `test_data`
# ends up with a NaN `Energy`.
submission_ids = ss_df[['ID']]
ss_df = submission_ids.merge(test_data[['ID', 'Energy']], on='ID', how='left')

In [None]:
ss_df.head()

In [None]:
ss_df.to_csv("base_model_submission.csv", index=False)