In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

### Load datasets

In [None]:
# Load the raw training data (semicolon-separated file) and drop exact duplicate rows.
df = pd.read_csv("datasets/train-data.csv", sep=";")
df.drop_duplicates(inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df[...]` selection: the chained in-place form acts on an intermediate object
# and is not guaranteed to update `df` (it is a no-op under pandas Copy-on-Write).
df['Product Life cycel status'] = df['Product Life cycel status'].fillna('ACT')
# Sorting by product makes each product's rows contiguous; ignore_index=True
# renumbers the rows 0..n-1, so index labels and positions coincide afterwards.
df.sort_values(by="id_product", inplace=True, ignore_index=True)
df.drop(columns=['index'], inplace=True)
df.head(3)

In [None]:
def to_int(x):
    """Convert a sales figure to ``int``, passing missing values through.

    Parameters
    ----------
    x : str, number or missing value
        A string such as ``"1 234"`` whose digit groups may be separated by
        whitespace, a value that is already numeric, or a missing value.

    Returns
    -------
    int
        The parsed integer, or ``x`` itself when ``pd.isna(x)`` is true.

    Raises
    ------
    ValueError
        If ``x`` is a string that is not an integer literal once all
        whitespace has been removed.
    """
    if pd.isna(x):
        return x
    if not isinstance(x, str):
        # Already numeric (e.g. the column was parsed as numbers when read):
        # there is nothing to strip, and calling .replace on it would fail.
        return int(x)
    # str.split() without arguments splits on every Unicode whitespace
    # character, so non-breaking / narrow spaces used as thousands separators
    # are removed as well as plain spaces.
    return int("".join(x.split()))

In [None]:
# Turn the three monthly sales columns into integers, element by element,
# using to_int.
for column in [f'Month {number}' for number in range(1, 4)]:
    df[column] = df[column].apply(to_int)

In [None]:
# Distinct period labels present in the data.
dates = list(df.Date.unique())

# Period labels listed chronologically; the position of a label in this list
# is the rank it is given in df_dates.
periods_in_order = [
    'sep-dec 2020',
    'jan-apr 2021',
    'may-aug 2021',
    'sep-dec 2021',
    'jan-apr 2022',
    'may-aug 2022',
    'sep-dec 2022',
    'jan-apr 2023',
    'may-jul 2023',
]
df_dates = {label: rank for rank, label in enumerate(periods_in_order)}

In [None]:
# Attach each row's chronological rank; a label absent from df_dates raises KeyError.
df['order'] = df['Date'].apply(df_dates.__getitem__)

In [None]:
# Rows whose 'Month 1' value is missing: record, in matching order, the
# product each one belongs to and its row label in df.
rows_missing_month_1 = df.loc[df['Month 1'].isna()]
id_product_missing_values = list(rows_missing_month_1.id_product)
indices_missing_values = list(rows_missing_month_1.index)

In [None]:
def get_df_product(id_product, index):
    """Return the contiguous run of rows around ``index`` that carry ``id_product``.

    Operates on the module-level ``df`` by position. The row at ``index`` is
    always included (its own id is not checked); the run is then extended
    backwards and forwards for as long as the neighbouring rows have
    ``id_product`` in their ``id_product`` column.

    Parameters
    ----------
    id_product
        Value compared against ``df.id_product``.
    index : int
        Positional row number in ``df`` from which the run is grown.

    Returns
    -------
    pandas.DataFrame
        ``df.iloc[start:stop]`` for the run that was found.
    """
    product_ids = df.id_product
    n_rows = len(df)

    # Walk backwards to the first row of the run.
    start = index
    while start - 1 >= 0 and product_ids.iloc[start - 1] == id_product:
        start -= 1

    # Walk forwards to one past the last row of the run.
    stop = index + 1
    while stop < n_rows and product_ids.iloc[stop] == id_product:
        stop += 1

    return df.iloc[start:stop]

In [None]:
def interpolate(l):
    """Fill missing entries of a sequence by averaging their direct neighbours.

    Every entry for which ``pd.isna`` is true is replaced by the mean of the
    entries immediately before and after it in the input (linear
    interpolation across a single gap). A missing entry at either end has
    only one neighbour and takes that neighbour's value. Entries that are
    present are returned unchanged.

    Parameters
    ----------
    l : sequence of numbers
        Indexable by integer position ``0 .. len(l) - 1``.

    Returns
    -------
    list
        A new list of the same length as ``l``. Neighbours are read from the
        input, so a gap next to another gap is not bridged and stays missing.
    """
    size = len(l)

    def _fill(i):
        # Neighbours that exist; the ends of the sequence have only one, so
        # neither wrap-around (l[-1]) nor an out-of-range read can happen.
        neighbours = [l[k] for k in (i - 1, i + 1) if 0 <= k < size]
        if not neighbours:
            return l[i]
        # Mean of the neighbours (the midpoint), not their sum.
        return sum(neighbours) / len(neighbours)

    return [l[i] if not pd.isna(l[i]) else _fill(i) for i in range(size)]

def interpolate_missing_sales(id_product, index):
    """Impute the missing ``'Month 1'`` value of row ``index`` in the module-level ``df``.

    The product's rows (from ``get_df_product``) are ordered by the ``'order'``
    column and their ``'Month 1'``, ``'Month 2'``, ``'Month 3'`` values are
    flattened row by row into a single series. The gap is filled with a
    second-order polynomial interpolation of that series. When interpolation
    cannot produce a value (the product has a single row, or the gap is the
    first entry of the series and therefore has nothing before it), the value
    falls back to ``(2 * Month 2 + Month 3) / 3`` of the same row.

    The result is truncated with ``int`` and written to ``df.loc[index, 'Month 1']``.

    Parameters
    ----------
    id_product
        Product identifier of the row to impute.
    index : int
        Label of the row to impute. ``get_df_product`` uses the same number as
        a position, so this assumes ``df`` has a 0..n-1 index.

    Raises
    ------
    ValueError
        If neither the interpolation nor the fallback yields a number.
    """
    month_columns = ['Month 1', 'Month 2', 'Month 3']

    # sort_values returns a new frame, so the slice handed back by
    # get_df_product is never modified in place.
    df_product = get_df_product(id_product, index).sort_values(by='order')

    # Row-major flattening: Month 1..3 of the earliest period first, then the
    # next period, and so on.
    series = pd.Series(df_product[month_columns].to_numpy(dtype=float).ravel())

    # Each row contributes len(month_columns) entries and 'Month 1' comes
    # first, so this is where the value of row `index` sits in the series.
    position = len(month_columns) * df_product.index.get_loc(index)

    interpolated_val = float('nan')
    if len(series) > len(month_columns):
        interpolated_val = series.interpolate('polynomial', order=2).iloc[position]
    if pd.isna(interpolated_val):
        # Nothing precedes the gap (or there is only one row): weight the
        # nearer month twice as much as the farther one.
        interpolated_val = (2 * series.iloc[position + 1] + series.iloc[position + 2]) / 3
    if pd.isna(interpolated_val):
        raise ValueError(
            f"cannot impute 'Month 1' for product {id_product!r} at row {index!r}"
        )

    # Write to the row that was asked for, not to whichever row is last.
    df.loc[index, 'Month 1'] = int(interpolated_val)

In [None]:
# Impute the missing 'Month 1' values one (product, row) pair at a time,
# showing a progress bar over the list of products.
for i, id_product in enumerate(tqdm(id_product_missing_values)):
    index_missing = indices_missing_values[i]
    interpolate_missing_sales(id_product, index_missing)

In [None]:
# The 'order' helper column is not part of the saved output; remove it.
df = df.drop(columns=['order'])

In [None]:
# Show the frame as a visual check before it is written to disk.
df

In [None]:
# Write the frame to datasets/train.csv; index=False leaves the row index out of the file.
df.to_csv("datasets/train.csv", index=False)