In [59]:
import pandas as pd


## Data exploration

In [76]:
# Read the semicolon-separated file, then convert the Date column to datetimes
# in the same chain so the frame is never observable with unparsed dates.
dataset = (
    pd.read_csv('dataset.gz', sep=';')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
dataset.head()

Unnamed: 0,Date,Fourni,Ventes,id
0,2020-01-02,10.0,5.0,-478139654568867546
1,2020-01-02,8.0,1.0,-8929187383922749181
2,2020-01-02,14.0,10.0,5557283775796994165
3,2020-01-02,2.0,0.0,5169110700785508591
4,2020-01-02,1.0,1.0,3502243892823802503


## Train/test split

In [83]:
# Chronological hold-out: rows dated strictly before 2023-01-01 go to training,
# rows dated on or after it go to the test set.
is_train_row = dataset['Date'] < '2023-01-01'
is_test_row = dataset['Date'] >= '2023-01-01'

train_dataset = dataset.loc[is_train_row]
test_dataset = dataset.loc[is_test_row]
print(train_dataset.shape, test_dataset.shape)

(76534, 4) (6817, 4)


## Evaluation

In [77]:
def unsold_rate(delivered: pd.Series, sales: pd.Series) -> float:
    """Return the share of delivered publications that were not sold.

    The rate is computed on the totals of both series:
    ``(sum(delivered) - sum(sales)) / sum(delivered)``.

    Args:
        delivered: Delivered quantities.
        sales: Sold quantities.

    Returns:
        float: The unsold rate, as a fraction of the delivered total.
        No guard is applied when the delivered total is zero; the
        division is performed as-is.
    """
    total_delivered = delivered.sum()
    total_sold = sales.sum()
    return (total_delivered - total_sold) / total_delivered


def sold_out_rate(delivered: pd.Series, sales: pd.Series) -> float:
    """Return how often the delivered quantity was sold out.

    An entry counts as sold out when its delivered value equals its sales
    value; the count of such entries is divided by the number of entries
    in ``delivered``.

    Args:
        delivered: Delivered quantities.
        sales: Sold quantities.

    Returns:
        float: The sold-out rate, as a fraction of all entries.
    """
    is_sold_out = delivered == sales
    sold_out_count = is_sold_out.sum()
    return sold_out_count / len(delivered)



In [86]:
 # Baseline
# Reference row computed from the delivered and sold quantities recorded in
# the test set.
results = {}

delivered_col = test_dataset["Fourni"]
sold_col = test_dataset["Ventes"]

fourni = delivered_col.sum()
ventes = sold_col.sum()
n_dates = test_dataset["Date"].nunique()

baseline_row = {
    # Totals divided by the number of distinct dates, rounded to an integer.
    "Fourni": round(fourni / n_dates),
    "Ventes": round(ventes / n_dates),
    # Rates converted to percentages with two decimals.
    "Invendus": round(unsold_rate(delivered_col, sold_col) * 100, 2),
    "Nbr Rupt": round(sold_out_rate(delivered_col, sold_col) * 100, 2),
}
results["Réel"] = baseline_row

pd.DataFrame(results)

Unnamed: 0,Réel
Fourni,911.0
Invendus,32.61
Nbr Rupt,18.81
Ventes,614.0
