# Bayes Theorem

https://www.khanacademy.org/math/statistics-probability/bayesian-inference/bayes-theorem/a/bayes-theorem

In [1]:
import pandas as pd
import numpy as np

In [29]:
# The data
dico = {
    'party_id': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4],
    'ma_date': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
                '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
                '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
                '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01'],
    'annee': [2020] * 24,
    'mois': [1, 2, 3, 4, 5, 6] * 4,
    'tran_amt_avec_rate_value': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
}

df = pd.DataFrame(dico)

df['ma_date'] = pd.to_datetime(df['ma_date'], format='%Y-%M-%d')
df.head()

Unnamed: 0,party_id,ma_date,annee,mois,tran_amt_avec_rate_value
0,1,2020-01-01 00:01:00,2020,1,10
1,1,2020-01-01 00:02:00,2020,2,11
2,1,2020-01-01 00:03:00,2020,3,12
3,1,2020-01-01 00:04:00,2020,4,13
4,1,2020-01-01 00:05:00,2020,5,14


In [9]:
def mobile_average(df, window, label_nbr='_nbr', label_sum='_sum', label_avg='_avg'):
    df2 = df.copy()
    df_subset = df2[['party_id', 'ma_date', 'annee', 'mois', 'tran_amt_avec_rate_value']]
    df_subset = df_subset.groupby(['party_id','ma_date','annee','mois'], as_index= False).agg(_nbr = ('party_id', 'count'),
                                                                                              _sum = ('tran_amt_avec_rate_value', 'sum'))
    df2 = df_subset.sort_values(by=['party_id','ma_date'])
    for i in [3]:
        sum_over_the_window = []
        nbr_over_the_window = []

        for identifiant, df in df2.groupby(['party_id']):
            for i, row in df.iterrows():
                date_ech = row['ma_date']
                prev_df = df[ (df['ma_date'] >= (date_ech - pd.DateOffset(months=3))) & (df['ma_date'] <= date_ech) ]
                nbr_over_the_window.append(prev_df['_nbr'].sum())
                sum_over_the_window.append(prev_df['_sum'].sum())
        df2[label_sum] = sum_over_the_window
        df2[label_nbr] = nbr_over_the_window
        df2[label_avg] = df2[label_sum] / df2[label_nbr]
        df2[label_avg] = df2[label_avg].round(1)
    return df2

    

In [30]:
%%time
def mobile_average(df, windows):
    df2 = df.copy()

    # Groupement par 'party_id', 'ma_date', 'annee', 'mois' et agrégation
    df_subset = df2.groupby(['party_id', 'ma_date', 'annee', 'mois'], as_index=False).agg(
        _nbr=('party_id', 'count'),
        _sum=('tran_amt_avec_rate_value', 'sum')
    )

    # Tri des données par 'party_id' et 'ma_date'
    df_subset = df_subset.sort_values(by=['party_id', 'ma_date'])

    # Calcul des moyennes mobiles pour chaque fenêtre
    for window in windows:
        sum_over_the_window = []
        nbr_over_the_window = []

        for identifiant, group in df_subset.groupby(['party_id']):
            for i, row in group.iterrows():
                date_ech = row['ma_date']
                prev_df = group[(group['ma_date'] >= (date_ech - pd.DateOffset(months=window-1))) &
                               (group['ma_date'] <= date_ech)]
                nbr_over_the_window.append(prev_df['_nbr'].sum())
                sum_over_the_window.append(prev_df['_sum'].sum())

        df_subset[f'_sum_{window}m'] = sum_over_the_window
        df_subset[f'_nbr_{window}m'] = nbr_over_the_window
        df_subset[f'_avg_{window}m'] = df_subset[f'_sum_{window}m'] / df_subset[f'_nbr_{window}m']
        df_subset[f'_avg_{window}m'] = df_subset[f'_avg_{window}m'].round(1)

    return df_subset

# Appel de la fonction avec des fenêtres de 2, 3 et 6 mois
result = mobile_average(df, windows=[2, 3, 6])
print(result.to_string())


    party_id             ma_date  annee  mois  _nbr  _sum  _sum_2m  _nbr_2m  _avg_2m  _sum_3m  _nbr_3m  _avg_3m  _sum_6m  _nbr_6m  _avg_6m
0          1 2020-01-01 00:01:00   2020     1     1    10       10        1     10.0       10        1     10.0       10        1     10.0
1          1 2020-01-01 00:02:00   2020     2     1    11       21        2     10.5       21        2     10.5       21        2     10.5
2          1 2020-01-01 00:03:00   2020     3     1    12       33        3     11.0       33        3     11.0       33        3     11.0
3          1 2020-01-01 00:04:00   2020     4     1    13       46        4     11.5       46        4     11.5       46        4     11.5
4          1 2020-01-01 00:05:00   2020     5     1    14       60        5     12.0       60        5     12.0       60        5     12.0
5          1 2020-01-01 00:06:00   2020     6     1    15       75        6     12.5       75        6     12.5       75        6     12.5
6          2 2020-01-01 00:

In [31]:
%%time
def mobile_average(df, windows):
    df2 = df.copy()

    # Groupement par 'party_id', 'ma_date', 'annee', 'mois' et agrégation
    df_subset = df2.groupby(['party_id', 'ma_date', 'annee', 'mois'], as_index=False).agg(
        _nbr=('party_id', 'count'),
        _sum=('tran_amt_avec_rate_value', 'sum')
    )

    # Tri des données par 'party_id' et 'ma_date'
    df_subset = df_subset.sort_values(by=['party_id', 'ma_date'])

    # Calcul des moyennes mobiles pour chaque fenêtre
    for window in windows:
        df_subset[f'_sum_{window}m'] = df_subset.groupby('party_id')['_sum'].rolling(window, min_periods=1).sum().reset_index(0, drop=True)
        df_subset[f'_nbr_{window}m'] = df_subset.groupby('party_id')['_nbr'].rolling(window, min_periods=1).sum().reset_index(0, drop=True)
        df_subset[f'_avg_{window}m'] = df_subset[f'_sum_{window}m'] / df_subset[f'_nbr_{window}m']
        df_subset[f'_avg_{window}m'] = df_subset[f'_avg_{window}m'].round(1)

    return df_subset

# Appel de la fonction avec des fenêtres de 2, 3 et 6 mois
result = mobile_average(df, windows=[2, 3, 6])
print(result.to_string())

    party_id             ma_date  annee  mois  _nbr  _sum  _sum_2m  _nbr_2m  _avg_2m  _sum_3m  _nbr_3m  _avg_3m  _sum_6m  _nbr_6m  _avg_6m
0          1 2020-01-01 00:01:00   2020     1     1    10     10.0      1.0     10.0     10.0      1.0     10.0     10.0      1.0     10.0
1          1 2020-01-01 00:02:00   2020     2     1    11     21.0      2.0     10.5     21.0      2.0     10.5     21.0      2.0     10.5
2          1 2020-01-01 00:03:00   2020     3     1    12     23.0      2.0     11.5     33.0      3.0     11.0     33.0      3.0     11.0
3          1 2020-01-01 00:04:00   2020     4     1    13     25.0      2.0     12.5     36.0      3.0     12.0     46.0      4.0     11.5
4          1 2020-01-01 00:05:00   2020     5     1    14     27.0      2.0     13.5     39.0      3.0     13.0     60.0      5.0     12.0
5          1 2020-01-01 00:06:00   2020     6     1    15     29.0      2.0     14.5     42.0      3.0     14.0     75.0      6.0     12.5
6          2 2020-01-01 00: