<!--
 Copyright 2025 beedi.goua_square-ma
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     https://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->



In [1]:
# 1. Imports et configuration
from pathlib import Path
import sys
base_dir = Path().resolve().parent
sys.path.append(str(base_dir))
base_dir


WindowsPath('C:/Users/beedi.goua_square-ma/Desktop/Gheb/projet perso/budget-ai-dashboard')

In [2]:
# 2. Affichage du répertoire de base
print(f"Répertoire de base : {base_dir}")
# 3. Vérification de l'existence du répertoire de base
print(base_dir.exists())

Répertoire de base : C:\Users\beedi.goua_square-ma\Desktop\Gheb\projet perso\budget-ai-dashboard
True


In [3]:
# 4. Ajout des chemins nécessaires au sys.path

sys.path.append(str(base_dir / "src"))
sys.path.append(str(base_dir / "data"))
sys.path.append(str(base_dir / "notebooks"))

In [4]:
# 5. Import des modules nécessaires
import pandas as pd
from src.features_engineering import (
    extract_date_features, encode_day_of_week_cyclic,
    encode_hour_cyclic, create_transaction_flags,
    calculate_saving_rate, clean_description,
    categorize_amount_level, tag_fixed_expenses,
    standardize_columns,
    encode_transaction_type,
    add_zscore_flags,
    create_prompt_field
)

    




In [5]:
# 6. Chargement des données
data_dir = base_dir / "data"
df_path = data_dir / "transactions.csv"
df = pd.read_csv(df_path)


In [6]:
# analyse des données
print(f"Nombre de lignes dans le DataFrame : {len(df)}")
print(f"Colonnes du DataFrame : {df.columns.tolist()}")
# 7. Affichage des premières lignes du DataFrame
print("Premières lignes du DataFrame :")
print(df.head())


Nombre de lignes dans le DataFrame : 472
Colonnes du DataFrame : ['Transaction Date', 'Post Date', 'Description', 'Category', 'Type', 'Amount', 'Memo']
Premières lignes du DataFrame :
  Transaction Date   Post Date            Description      Category  Type  \
0       12/24/2023  12/24/2023        CHIPOTLE ONLINE  Food & Drink  Sale   
1       12/22/2023  12/24/2023        BRAVO-DEERFIELD  Food & Drink  Sale   
2       12/22/2023  12/24/2023        MARSHALLS #1172      Shopping  Sale   
3       12/22/2023  12/24/2023  DD DOORDASH WINGSANDR  Food & Drink  Sale   
4       12/21/2023  12/24/2023                 KROGER     Groceries  Sale   

   Amount  Memo  
0  -33.17   NaN  
1  -50.17   NaN  
2  -49.06   NaN  
3  -62.31   NaN  
4 -131.72   NaN  


In [7]:
df

Unnamed: 0,Transaction Date,Post Date,Description,Category,Type,Amount,Memo
0,12/24/2023,12/24/2023,CHIPOTLE ONLINE,Food & Drink,Sale,-33.17,
1,12/22/2023,12/24/2023,BRAVO-DEERFIELD,Food & Drink,Sale,-50.17,
2,12/22/2023,12/24/2023,MARSHALLS #1172,Shopping,Sale,-49.06,
3,12/22/2023,12/24/2023,DD DOORDASH WINGSANDR,Food & Drink,Sale,-62.31,
4,12/21/2023,12/24/2023,KROGER,Groceries,Sale,-131.72,
...,...,...,...,...,...,...,...
467,9/26/2023,9/26/2023,Disney Plus,Bills & Utilities,Sale,-5.76,
468,9/25/2023,9/26/2023,THE HONEST COMPANY INC.,Shopping,Sale,-11.87,
469,9/23/2023,9/25/2023,McDonalds 39386,Food & Drink,Sale,-1.38,
470,9/24/2023,9/24/2023,CHIPOTLE ONLINE,Food & Drink,Sale,-65.73,


## Resampling
Notre but est d **augmenter artificiellement ton nombre d'observations** (passer d’environ 470 à 2000 lignes) tout en **respectant les structures importantes** de ton jeu de données, notamment :

* la répartition des **catégories** de dépenses,
* le **type** d’opération (achat, crédit, etc.),
* la **répartition temporelle** (mois, dates),
* la **variété dans les lignes** (éviter de dupliquer naïvement les mêmes lignes plusieurs fois).

### Méthode utilisée

1. **Stratification** :
   on a créé une clé de regroupement (`StratKey`) combinant les colonnes `Category`, `Type` et `Month`. Cette clé te permet de **préserver la distribution des types de transactions dans le temps**. L’échantillonnage se fait ensuite dans chaque groupe, à une fréquence proportionnelle à sa taille d’origine. Cela garantit une représentation équilibrée de chaque type de transaction.

2. **Resampling intelligent** :
   Pour chaque groupe, on effectue un échantillonnage :

   * **avec remise** si le groupe est trop petit,
   * **sans remise** sinon.
     Cela assure que **chaque catégorie existe en proportion réaliste** dans le nouveau DataFrame.

3. **Mélange et réattribution de dates** :
   Après avoir concaténé les échantillons, on  mélange les lignes pour éviter les répétitions visibles, puis on **réattribues des dates réalistes**. Cette réattribution respecte la fréquence originale des dates : par exemple, si beaucoup de transactions ont eu lieu en décembre, le nouveau dataset reflétera cette tendance. on évite ainsi les biais temporels artificiels.

---

### Avantages de ta méthode

* **Préservation des proportions réelles** entre catégories et types de transaction.
* **Répartition temporelle cohérente**, sans créer de "pics" artificiels.
* **Variété et réalisme** des données augmentées, propices à l’apprentissage machine ou à la génération de dashboards fiables.



In [8]:
df = df.copy()
df['Date'] = pd.to_datetime(df['Transaction Date'])

# Ajout de colonnes temporelles
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

# On crée une clé de stratification
df['StratKey'] = (
    df['Category'].astype(str) + "_" +
    df['Type'].astype(str) + "_" +
    df['Month'].astype(str)
)


In [9]:

# Vérifions les valeurs manquantes dans la colonne 'Category'
missing_categories = df['Category'].isna().sum()

# Supprimons les lignes sans catégorie pour pouvoir faire l'échantillonnage stratifié
df = df.dropna(subset=['Category'])


In [10]:

from sklearn.utils import resample
import numpy as np
# 8. Échantillonnage stratifié
# Échantillonnage stratifié pour obtenir un DataFrame de taille désirée
desired_size = 2000
multiplier = int(np.ceil(desired_size / len(df)))




In [11]:

resampled_df = pd.DataFrame()

# Répéter chaque groupe selon sa fréquence relative
for key, group in df.groupby('StratKey'):
    proportion = len(group) / len(df)
    target_rows = int(desired_size * proportion)

    # Si le groupe est petit, on fait un tirage avec remise
    sampled = group.sample(n=target_rows, replace=(len(group) < target_rows), random_state=42)
    resampled_df = pd.concat([resampled_df, sampled], ignore_index=True)


In [12]:

# Réinitialisation de l'index
resampled_df = resampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Réattribution des dates
dates_freq = df['Transaction Date'].value_counts(normalize=True)
new_dates = np.random.choice(dates_freq.index, size=len(resampled_df), p=dates_freq.values)

# Conversion des nouvelles dates en datetime et réorganisation du DataFrame
resampled_df['Transaction Date'] = pd.to_datetime(new_dates)
resampled_df = resampled_df.sort_values("Transaction Date").reset_index(drop=True)


In [13]:
print("Ancienne répartition :")
print(df['StratKey'].value_counts(normalize=True).head())

print("Nouvelle répartition :")
print(resampled_df['StratKey'].value_counts(normalize=True).head())

print("Taille finale :", len(resampled_df))


Ancienne répartition :
StratKey
Food & Drink_Sale_10    0.135484
Food & Drink_Sale_11    0.113978
Shopping_Sale_11        0.113978
Food & Drink_Sale_12    0.088172
Shopping_Sale_12        0.086022
Name: proportion, dtype: float64
Nouvelle répartition :
StratKey
Food & Drink_Sale_10    0.136364
Food & Drink_Sale_11    0.114646
Shopping_Sale_11        0.114646
Food & Drink_Sale_12    0.088889
Shopping_Sale_12        0.086869
Name: proportion, dtype: float64
Taille finale : 1980


In [14]:
resampled_df.drop(columns=['StratKey', 'Month', 'Year'], inplace=True)


In [15]:
df

Unnamed: 0,Transaction Date,Post Date,Description,Category,Type,Amount,Memo,Date,Month,Year,StratKey
0,12/24/2023,12/24/2023,CHIPOTLE ONLINE,Food & Drink,Sale,-33.17,,2023-12-24,12,2023,Food & Drink_Sale_12
1,12/22/2023,12/24/2023,BRAVO-DEERFIELD,Food & Drink,Sale,-50.17,,2023-12-22,12,2023,Food & Drink_Sale_12
2,12/22/2023,12/24/2023,MARSHALLS #1172,Shopping,Sale,-49.06,,2023-12-22,12,2023,Shopping_Sale_12
3,12/22/2023,12/24/2023,DD DOORDASH WINGSANDR,Food & Drink,Sale,-62.31,,2023-12-22,12,2023,Food & Drink_Sale_12
4,12/21/2023,12/24/2023,KROGER,Groceries,Sale,-131.72,,2023-12-21,12,2023,Groceries_Sale_12
...,...,...,...,...,...,...,...,...,...,...,...
467,9/26/2023,9/26/2023,Disney Plus,Bills & Utilities,Sale,-5.76,,2023-09-26,9,2023,Bills & Utilities_Sale_9
468,9/25/2023,9/26/2023,THE HONEST COMPANY INC.,Shopping,Sale,-11.87,,2023-09-25,9,2023,Shopping_Sale_9
469,9/23/2023,9/25/2023,McDonalds 39386,Food & Drink,Sale,-1.38,,2023-09-23,9,2023,Food & Drink_Sale_9
470,9/24/2023,9/24/2023,CHIPOTLE ONLINE,Food & Drink,Sale,-65.73,,2023-09-24,9,2023,Food & Drink_Sale_9


In [16]:
resampled_df

Unnamed: 0,Transaction Date,Post Date,Description,Category,Type,Amount,Memo,Date
0,2023-09-23,11/19/2023,CHIPOTLE ONLINE,Food & Drink,Sale,-58.52,,2023-11-19
1,2023-09-23,10/29/2023,CIRCLE K # 04893,Gas,Sale,-7.62,,2023-10-28
2,2023-09-23,10/8/2023,DD DOORDASH MODPIZZA,Food & Drink,Sale,-26.40,,2023-10-07
3,2023-09-23,10/22/2023,WENDYS 644,Food & Drink,Sale,-25.76,,2023-10-20
4,2023-09-24,12/5/2023,KROGER,Groceries,Sale,-43.97,,2023-12-04
...,...,...,...,...,...,...,...,...
1975,2023-12-23,10/3/2023,PANERA BREAD #204922 O,Food & Drink,Sale,-10.41,,2023-10-03
1976,2023-12-23,12/13/2023,LOVELAND KIDS CHILDCARE,Personal,Sale,-305.00,,2023-12-11
1977,2023-12-24,12/24/2023,HOBBY-LOBBY #0219,Shopping,Sale,-48.65,,2023-12-22
1978,2023-12-24,12/19/2023,TARGET 00024869,Shopping,Sale,-2.67,,2023-12-18


In [17]:
# Ancienne
print(df['Category'].value_counts(normalize=True).sort_index())

# Nouvelle
print(resampled_df['Category'].value_counts(normalize=True).sort_index())


Category
Automotive            0.006452
Bills & Utilities     0.015054
Education             0.002151
Entertainment         0.017204
Fees & Adjustments    0.019355
Food & Drink          0.363441
Gas                   0.118280
Groceries             0.079570
Health & Wellness     0.015054
Personal              0.038710
Shopping              0.303226
Travel                0.021505
Name: proportion, dtype: float64
Category
Automotive            0.006061
Bills & Utilities     0.014141
Education             0.002020
Entertainment         0.017172
Fees & Adjustments    0.018687
Food & Drink          0.365657
Gas                   0.118687
Groceries             0.078788
Health & Wellness     0.014646
Personal              0.038384
Shopping              0.305051
Travel                0.020707
Name: proportion, dtype: float64


<!--
 Copyright 2025 beedi.goua_square-ma
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     https://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->



In [18]:
"""for col in df.columns:
    if col in resampled_df.columns:
        if set(df[col].dropna().unique()) != set(resampled_df[col].dropna().unique()):
            print(f"Changement détecté dans la colonne '{col}'")
            

"""

'for col in df.columns:\n    if col in resampled_df.columns:\n        if set(df[col].dropna().unique()) != set(resampled_df[col].dropna().unique()):\n            print(f"Changement détecté dans la colonne \'{col}\'")\n\n\n'

In [19]:

"""for col in df.columns:
    if col in resampled_df.columns and df[col].dtype == 'object':
        orig_dist = df[col].value_counts(normalize=True)
        new_dist = resampled_df[col].value_counts(normalize=True)
        combined_index = orig_dist.index.union(new_dist.index)
        diff = (orig_dist.reindex(combined_index, fill_value=0) - new_dist.reindex(combined_index, fill_value=0)).abs()
        if diff.max() > 0.05:  # seuil de tolérance
            print(f"Distribution changée pour la colonne '{col}' (diff max : {diff.max():.2f})")
"""

'for col in df.columns:\n    if col in resampled_df.columns and df[col].dtype == \'object\':\n        orig_dist = df[col].value_counts(normalize=True)\n        new_dist = resampled_df[col].value_counts(normalize=True)\n        combined_index = orig_dist.index.union(new_dist.index)\n        diff = (orig_dist.reindex(combined_index, fill_value=0) - new_dist.reindex(combined_index, fill_value=0)).abs()\n        if diff.max() > 0.05:  # seuil de tolérance\n            print(f"Distribution changée pour la colonne \'{col}\' (diff max : {diff.max():.2f})")\n'

In [20]:
df["Category"].unique()

array(['Food & Drink', 'Shopping', 'Groceries', 'Gas', 'Automotive',
       'Health & Wellness', 'Personal', 'Travel', 'Bills & Utilities',
       'Fees & Adjustments', 'Entertainment', 'Education'], dtype=object)

In [21]:


data = resampled_df.copy()
#7. Traitement des données
data = extract_date_features(data)
data = encode_day_of_week_cyclic(data)
data = encode_hour_cyclic(data)
data = clean_description(data)
data = create_transaction_flags(data)
data = categorize_amount_level(data)
data = tag_fixed_expenses(data)

# 8. Calcul du taux d'épargne
data = standardize_columns(data)
data = encode_transaction_type(data)
data = add_zscore_flags(data)
data = create_prompt_field(data)


In [22]:
df.columns

Index(['Transaction Date', 'Post Date', 'Description', 'Category', 'Type',
       'Amount', 'Memo', 'Date', 'Month', 'Year', 'StratKey'],
      dtype='object')

In [23]:
data.columns

Index(['Transaction Date', 'Post Date', 'Description', 'Category', 'Type',
       'Amount', 'Memo', 'Date', 'Year', 'Month', 'Week', 'Day', 'Hour',
       'DayOfWeek', 'IsWeekend', 'IsMonthEnd', 'DayOfWeek_sin',
       'DayOfWeek_cos', 'Hour_sin', 'Hour_cos', 'Cleaned_Description',
       'IsLargeAmount', 'IsVeryLargeAmount', 'AmountLevel', 'IsRecurring',
       'Type_Adjustment', 'Type_Fee', 'Type_Return', 'Type_Sale',
       'ZScore_Amount', 'IsOutlier_Z', 'PromptText'],
      dtype='object')

In [24]:

# 4. Calcul taux d’épargne
monthly_saving = calculate_saving_rate(data)



In [25]:

# 5. Vérification
display(data.head())
display(monthly_saving)



Unnamed: 0,Transaction Date,Post Date,Description,Category,Type,Amount,Memo,Date,Year,Month,...,IsVeryLargeAmount,AmountLevel,IsRecurring,Type_Adjustment,Type_Fee,Type_Return,Type_Sale,ZScore_Amount,IsOutlier_Z,PromptText
0,2023-09-23,11/19/2023,CHIPOTLE ONLINE,Food & Drink,sale,-58.52,,2023-11-19,2023,11,...,0,small,1,False,False,False,True,-0.243603,0,"Transaction de -58.52€ chez chipotle online, d..."
1,2023-09-23,10/29/2023,CIRCLE K # 04893,Gas,sale,-7.62,,2023-10-28,2023,10,...,0,very_large,1,False,False,False,True,0.513507,0,"Transaction de -7.62€ chez circle k 04893, de..."
2,2023-09-23,10/8/2023,DD DOORDASH MODPIZZA,Food & Drink,sale,-26.4,,2023-10-07,2023,10,...,0,medium,0,False,False,False,True,0.234165,0,Transaction de -26.4€ chez dd doordash modpizz...
3,2023-09-23,10/22/2023,WENDYS 644,Food & Drink,sale,-25.76,,2023-10-20,2023,10,...,0,medium,0,False,False,False,True,0.243684,0,"Transaction de -25.76€ chez wendys 644, de typ..."
4,2023-09-24,12/5/2023,KROGER,Groceries,sale,-43.97,,2023-12-04,2023,12,...,0,medium,1,False,False,False,True,-0.02718,0,"Transaction de -43.97€ chez kroger, de type Sa..."


Type,Year,Month,adjustment,fee,return,sale,SavingRate
0,2023,9,0.0,0.0,0.0,-6477.01,0.0
1,2023,10,1200.0,-196.0,25.52,-30945.6,0.0
2,2023,11,104.45,0.0,811.61,-30662.98,0.0
3,2023,12,91.54,0.0,0.0,-17394.15,0.0


In [26]:
print(data_dir.exists())


True


In [27]:
# 6. Sauvegarde
output_dir = base_dir / "outputs"
output_dir.mkdir(parents=True, exist_ok=True)
sys.path.append(str(output_dir))
print(output_dir.exists())


True


In [28]:
data.to_csv(output_dir / "transactions_enriched.csv", index=False)
monthly_saving.to_csv(output_dir / "monthly_saving.csv", index=False)