# Projet:  
- Réalisation d'une application Dashboard avec Streamlit  
à partir d'une base de données open source sur les transactions 
de cartes de crédit.

- Lien web vers la source des données:  
[credits_cards_transactions](https://www.kaggle.com/datasets/priyamchoksi/credit-card-transactions-dataset)

## Importation des librairies

- Installation des librairies:  
Exécuter dans la cellule:  %pip install -r requirements.txt  
ou  
Exécuter dans le terminal (prompt ou powershell):  pip install -r requirements.txt

- Importation des librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import random
import warnings 
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",None)
random.seed(123)

## Chargement des données

In [2]:
# Fonction pour charger les données

def load(file:str, size:int = 10**5, sep:str = ','):
    """Load a CSV file into a DataFrame, with a chunked fallback.

    The file is first read in a single pass. If that raises ``MemoryError``
    it is read again in partitions of ``size`` rows, which are then
    concatenated into one DataFrame.

    Args:
        file: CSV file name or path.
        size: number of rows per partition used by the chunked fallback
              (default 100000).
        sep: field separator (default ",").

    Returns:
        A DataFrame indexed by the first column of the file, with the
        index named "index".
    """
    try:
        data = pd.read_csv(file, sep=sep, index_col=0)
    except MemoryError:
        # With ``chunksize`` read_csv returns an iterator of DataFrames, not
        # a DataFrame: it has no ``.index``. Concatenating the partitions
        # keeps the return type identical on both paths.
        data = pd.concat(pd.read_csv(file, sep=sep, index_col=0,
                                     chunksize=size))

    data.index.name = 'index'

    return data


In [3]:
# Load the dataset from the CSV file
path = "credit_card_transactions.csv"
financial_data = load(path) 

## Exploration des données

### affichage des données

In [4]:
# Preview the first five rows
financial_data.head(5)

Unnamed: 0_level_0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0


### Description des types de variables

In [5]:
# List the column names
financial_data.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')

In [6]:
# Fonction pour décrire le dataframe

def info(data):
    """Return a one-row-per-column summary of *data*.

    The summary holds the column name, its dtype, the number of distinct
    values, the number of missing values and the share of missing values
    expressed as a percentage.
    """
    # Compute the missing-value mask once and reuse it for both NA columns
    missing = data.isna()

    summary = {
        'Variables': data.columns,
        'Type': data.dtypes,
        'Unique_values': data.nunique(),
        'NA_counts': missing.sum(),
        'NA_percent%': missing.mean().round(4)*100,
    }

    # Drop the column-name index so rows are simply numbered 0..n-1
    return pd.DataFrame(summary).reset_index(drop=True)

# Summarise the columns of financial_data
info(financial_data)

Unnamed: 0,Variables,Type,Unique_values,NA_counts,NA_percent%
0,trans_date_trans_time,object,1274791,0,0.0
1,cc_num,int64,983,0,0.0
2,merchant,object,693,0,0.0
3,category,object,14,0,0.0
4,amt,float64,52928,0,0.0
5,first,object,352,0,0.0
6,last,object,481,0,0.0
7,gender,object,2,0,0.0
8,street,object,983,0,0.0
9,city,object,894,0,0.0


### mise en forme du type des variables

In [7]:
# fonction pour mettre en forme

def conversion(var):
    """Convert one column to a more specific dtype.

    Meant to be used with ``DataFrame.apply(conversion, axis=0)``.

    * Columns named 'trans_date_trans_time' or 'dob' are parsed as datetimes.
    * Other object columns become 'string' when they hold more than 20
      distinct values, 'category' otherwise.
    * Every other column is returned unchanged.

    Args:
        var: the column to convert (a named pandas Series).

    Returns:
        The converted Series, or ``var`` itself when no rule applies.
    """
    # Columns that hold dates
    date_vars = ('trans_date_trans_time', 'dob')

    if var.name in date_vars:
        # ``infer_datetime_format`` is deprecated since pandas 2.0 (format
        # inference is the default behaviour), so it is no longer passed.
        return pd.to_datetime(var)

    # Only object columns are re-typed; date columns never reach this point
    if var.dtype != 'object':
        return var

    # High-cardinality text stays free text, low-cardinality becomes categorical
    target_dtype = 'string' if var.nunique() > 20 else 'category'
    return var.astype(target_dtype)


In [8]:
# Apply the conversion function to every column (axis=0 passes each column in turn)
financial_data = financial_data.apply(conversion, axis = 0)
# Summarise the columns again after the conversion
info(financial_data)

Unnamed: 0,Variables,Type,Unique_values,NA_counts,NA_percent%
0,trans_date_trans_time,datetime64[ns],1274791,0,0.0
1,cc_num,int64,983,0,0.0
2,merchant,string[python],693,0,0.0
3,category,category,14,0,0.0
4,amt,float64,52928,0,0.0
5,first,string[python],352,0,0.0
6,last,string[python],481,0,0.0
7,gender,category,2,0,0.0
8,street,string[python],983,0,0.0
9,city,string[python],894,0,0.0


### Création de nouvelles variables

- Variables dates

In [9]:
# Calendar date of each transaction, kept as a datetime with the time of day
# set to midnight. dt.normalize() yields the same values as formatting with
# strftime('%d-%b-%Y') and parsing the text back, without re-parsing every
# row and without depending on the locale's month abbreviations.
transaction_time = financial_data['trans_date_trans_time']

financial_data['date_transaction'] = transaction_time.dt.normalize()

# Extract transaction details: year, day of month, month name, time of day

financial_data['year_transaction'] = transaction_time.dt.year

financial_data['day_transaction'] = transaction_time.dt.day.astype(int)

financial_data['month_transaction'] = (transaction_time
                                       .dt.month_name().astype('category'))

# Time of day encoded as the float HH.MM (e.g. 17:14 -> 17.14)
financial_data['hour_transaction'] = (transaction_time
                                      .dt.strftime('%H.%M')).astype(float)

# Derive the moment of the day from hour_transaction.
# Upper bounds end in .59 because the decimals are minutes, not hundredths.

condlist = [financial_data['hour_transaction'].between(6,12.59),
            financial_data['hour_transaction'].between(13,18.59),
            financial_data['hour_transaction'].between(19,23.59),
            financial_data['hour_transaction'].between(0,5.59)
            ]
choicelist = ['Morning: 6h-12h59',
              'Afternoon: 13h-18h59',
              'Evening: 19h-23h59',
              'Night: 00h-5h59'
              ]

financial_data['moment_transaction'] = np.select(condlist, choicelist, default=None)
financial_data['moment_transaction'] = financial_data['moment_transaction'].astype('category')

- Variables Âge, Nom

In [10]:
# Reference date used to compute ages
reference_date = pd.to_datetime('2020-12-31')

# Age in completed years at the reference date: difference between calendar
# years, minus one when the birthday has not yet occurred in the reference
# year. Dividing a day count by 365 ignores leap days and overstates the age
# by one year for birthdays falling shortly after the reference date.
birth_date = financial_data['dob']
birthday_pending = (
    (birth_date.dt.month > reference_date.month)
    | ((birth_date.dt.month == reference_date.month)
       & (birth_date.dt.day > reference_date.day))
)
financial_data['age'] = (reference_date.year
                         - birth_date.dt.year
                         - birthday_pending.astype(int)).astype('int')

# Age groups. pd.cut intervals are right-inclusive, so the edges must be the
# last age of each group: (14, 30], (30, 45], (45, 60], (60, inf).
# With edges 31 and 46, ages 31 and 46 were filed under the younger label.
# The open upper bound keeps ages above 100 in '+60 ans' instead of NaN.
financial_data['group_age']=pd.cut(financial_data['age'],
                                   bins=[14,30,45,60,np.inf],
                                   labels=['15-30 ans', '31-45 ans','46-60 ans' ,'+60 ans']
                                  )

# Concatenate first (given name) and last (family name) into the full name
financial_data['fullname'] = (financial_data['first'] +' '+ financial_data['last']).astype('string')

In [11]:
# Summarise the columns again, now including the newly created variables
info(financial_data)

Unnamed: 0,Variables,Type,Unique_values,NA_counts,NA_percent%
0,trans_date_trans_time,datetime64[ns],1274791,0,0.0
1,cc_num,int64,983,0,0.0
2,merchant,string[python],693,0,0.0
3,category,category,14,0,0.0
4,amt,float64,52928,0,0.0
5,first,string[python],352,0,0.0
6,last,string[python],481,0,0.0
7,gender,category,2,0,0.0
8,street,string[python],983,0,0.0
9,city,string[python],894,0,0.0


### Réorganisation des colonnes et exportation des données finales

In [12]:
# Give the key columns more explicit names (renamed in place)
column_aliases = {
    'category': 'business_sector',
    'cc_num': 'card_number',
    'amt': 'amount_USD',
}
financial_data.rename(columns=column_aliases, inplace = True)

# Columns to keep, in display order
list_vars = [
    'card_number', 'date_transaction', 'year_transaction', 'month_transaction',
    'day_transaction', 'hour_transaction', 'moment_transaction', 'amount_USD',
    'fullname', 'gender', 'age', 'group_age', 'job', 'business_sector',
    'city', 'city_pop', 'state', 'lat', 'long', 'is_fraud', 'trans_num',
]

# Show the 25 earliest rows of the selection, ordered by transaction date
selection_by_date = financial_data[list_vars].sort_values(by = 'date_transaction',
                                                          ascending = True,
                                                          ignore_index=True)
selection_by_date.head(25)

Unnamed: 0,card_number,date_transaction,year_transaction,month_transaction,day_transaction,hour_transaction,moment_transaction,amount_USD,fullname,gender,age,group_age,job,business_sector,city,city_pop,state,lat,long,is_fraud,trans_num
0,2703186189652095,2019-01-01,2019,January,1,0.0,Night: 00h-5h59,4.97,Jennifer Banks,F,32,31-45 ans,"Psychologist, counselling",misc_net,Moravian Falls,3495,NC,36.0788,-81.1781,0,0b242abb623afc578575680df30655b9
1,342952484382519,2019-01-01,2019,January,1,17.14,Afternoon: 13h-18h59,8.84,Kayla Jones,F,33,31-45 ans,Comptroller,personal_care,East Canaan,647,CT,42.0158,-73.2913,0,6f51efaef1d8e05c3d2d961de13686c0
2,3585740823295298,2019-01-01,2019,January,1,17.14,Afternoon: 13h-18h59,36.07,James Greene,M,22,15-30 ans,"Librarian, public",home,Quanah,3202,TX,34.2956,-99.7494,0,4a36e33771c09dbf34a851c8c455ae4f
3,2356276337669917,2019-01-01,2019,January,1,17.15,Afternoon: 13h-18h59,9.83,Benjamin Harris,M,39,31-45 ans,Paediatric nurse,shopping_pos,Chester,5848,MD,38.9583,-76.2842,0,c2c6ba3b07a2cd8ef0dc27e444be6b9e
4,4740713119940984,2019-01-01,2019,January,1,17.16,Afternoon: 13h-18h59,17.69,Heather Hines,F,58,46-60 ans,Pensions consultant,personal_care,Pomona,9993,NY,41.1901,-74.0436,0,f913f178b848eff06765fa20a694c296
5,3502377050801561,2019-01-01,2019,January,1,17.16,Afternoon: 13h-18h59,120.8,Kathleen Martin,F,72,+60 ans,"Scientist, biomedical",kids_pets,New Waverly,4993,TX,30.5354,-95.4532,0,cc4052b5536ad301eb62adfb67b9b81e
6,4342532437704183,2019-01-01,2019,January,1,17.16,Afternoon: 13h-18h59,8.66,Kevin Elliott,M,26,15-30 ans,Further education lecturer,shopping_pos,Newark Valley,4354,NY,42.2281,-76.1625,0,cb8cd75b0d6c3bf7dc49c52e7a7b0f86
7,4239436242473,2019-01-01,2019,January,1,17.14,Afternoon: 13h-18h59,8.7,James Leon,M,29,15-30 ans,"Scientist, biomedical",shopping_pos,Alexandria,321490,VA,38.832,-77.12,0,b62b3d7920575944cc5c8182118692ce
8,370612217861404,2019-01-01,2019,January,1,17.17,Afternoon: 13h-18h59,238.29,Amanda Fitzgerald,F,35,31-45 ans,"Administrator, charities/voluntary organisations",home,Bristol,21125,PA,40.1159,-74.8536,0,dc2ebc70888063d67b849ef4bb0e518c
9,3533012926413100,2019-01-01,2019,January,1,17.19,Afternoon: 13h-18h59,4.08,Megan Bernard,F,26,15-30 ans,IT trainer,misc_pos,Barneveld,1684,NY,43.2237,-75.1612,0,848597a840aec719c061f937a5ca5240


## Analyse Statistique des données

- Fonction pour un tableau croisé de statistiques descriptives

In [None]:
def cross_stat(dataframe:pd.DataFrame, catvar_list: list,
               numvar_list:list):
    """Build a cross table of descriptive statistics by category.

    Every numeric variable is summarised (min, max, mean, std, count and the
    25%, 50% and 75% quantiles) within each combination of the categorical
    variables.

    Args:
        dataframe: the source DataFrame.
        catvar_list: names of the grouping (categorical) columns.
        numvar_list: names of the numeric columns to summarise
                     (a single variable is passed as ``[var]``).

    Returns:
        A DataFrame with one row per (variable, statistic) pair and one
        column per group, i.e. the summary transposed so that the
        variables are in rows.

    Raises:
        ValueError: if a list is empty or a listed column is missing.
        TypeError: if a grouping column is neither category/object typed nor
            limited to 10 distinct values, or if a summarised column is not
            numeric.
    """
    # Statistics computed for each group
    stats = ['min', 'max', 'mean', 'std', 'count']
    quartile = [0.25, 0.50, 0.75]

    if not catvar_list or not numvar_list:
        raise ValueError('catvar_list and numvar_list must not be empty')

    # Every requested variable must exist in the DataFrame
    for var in [*catvar_list, *numvar_list]:
        if var not in dataframe.columns:
            raise ValueError(f'{var} not in data.columns')

    # Grouping variables: category/object dtype, or at most 10 distinct values
    for var in catvar_list:
        dtype = dataframe[var].dtype
        is_categorical = (isinstance(dtype, pd.CategoricalDtype)
                          or pd.api.types.is_object_dtype(dtype))

        if not is_categorical and dataframe[var].nunique() > 10:
            raise TypeError(f'{var} is not type category or category_length>10')

    # Summarised variables must be numeric
    for var in numvar_list:
        if not pd.api.types.is_numeric_dtype(dataframe[var]):
            raise TypeError(f'{var} is not type numeric')

    # observed=True skips categories that never occur in the data
    grouped = dataframe.groupby(list(catvar_list), observed=True)[list(numvar_list)]

    # Basic statistics: columns are (variable, statistic) pairs
    basic_table = grouped.agg(stats)

    # Quantiles come back as an extra index level; move it to the columns and
    # label it '25%', '50%', '75%'
    quantile_table = grouped.quantile(quartile).unstack(-1)
    quantile_table.columns = pd.MultiIndex.from_tuples(
        [(var, f'{q:.0%}') for var, q in quantile_table.columns]
    )
    quantile_labels = [f'{q:.0%}' for q in quartile]

    # Keep each variable's statistics together, in a fixed order
    summary_table = pd.concat([basic_table, quantile_table], axis=1)
    column_order = pd.MultiIndex.from_product(
        [list(numvar_list), stats + quantile_labels]
    )
    summary_table = summary_table.reindex(columns=column_order)

    return summary_table.T  # transpose so that the variables are in rows

In [None]:
def Cross_summary(data,targets_list,features_list):
    """Display descriptive statistics of numeric targets by categorical features.

    For every variable in ``targets_list`` two tables are shown, each
    preceded by a printed title:

    * summary statistics (minimum, maximum, mean, median, variance,
      standard deviation) with the number of observations per category;
    * percentiles (1%, 25%, 50%, 75%, 99%) with the number of observations
      per category.

    Rows are indexed by (feature name, category). Within a feature the
    categories follow the order declared by the categorical dtype, and only
    categories present in the data are listed.

    Args:
        data: the source pandas DataFrame.
        targets_list: names of the numeric columns to summarise.
        features_list: names of the categorical (dtype 'category') columns
            used for grouping.

    Raises:
        TypeError: if ``data`` is not a pandas DataFrame.
        ValueError: if ``features_list`` is empty, if a target is missing or
            not numeric, or if a feature is missing or not categorical.
    """
    import builtins

    # Validate the inputs
    if not isinstance(data, pd.DataFrame):
        raise TypeError('data is not a pandas.DataFrame')

    if not features_list:
        raise ValueError('features_list is empty')

    for var in targets_list: # continuous / numeric variables
        if not (var in data.columns and pd.api.types.is_numeric_dtype(data[var])):
            raise ValueError(f'{var} not in data.columns or not in dtype.["float","numeric"]')

    for var in features_list: # categorical variables
        if not (var in data.columns and isinstance(data[var].dtype, pd.CategoricalDtype)):
            raise ValueError(f'{var} not in data.columns or not in dtype.["category"]')

    # IPython exposes ``display`` as a builtin; fall back to print elsewhere
    show = getattr(builtins, 'display', print)

    percentiles = [0.01, 0.25, 0.50, 0.75, 0.99]
    stat_columns = ['Minimum', 'Maximum', 'Mean', 'Median', 'Variance',
                    'Standart-deviation', "Observations par Categories"]
    percentile_columns = ['Percentile 1%', 'Percentile 25%', 'Percentile 50%',
                          'Percentile 75%', 'Percentile 99%', "Observations par categories"]

    for score in targets_list:
        stat_parts, percentile_parts = [], []

        for var in features_list:
            # Grouping a categorical column keeps its declared category order;
            # observed=True leaves out categories with no rows
            grouped = data.groupby(var, observed=True)[score]

            # Number of observations per category
            obs = grouped.size()

            # Descriptive statistics for each category
            part_1 = grouped.agg(['min', 'max', 'mean', 'median', 'var', 'std'])
            part_1['obs'] = obs
            part_1.columns = stat_columns

            # Percentiles 1% 25% 50% 75% 99% for each category
            part_2 = grouped.quantile(percentiles).unstack()
            part_2['obs'] = obs
            part_2.columns = percentile_columns

            stat_parts.append(part_1)
            percentile_parts.append(part_2)

        # Stack the per-feature tables under a (feature, category) multi-index
        table_1 = pd.concat(stat_parts, keys=list(features_list))
        table_1.index = table_1.index.set_names(['Caractéristiques', None])
        table_2 = pd.concat(percentile_parts, keys=list(features_list))
        table_2.index = table_2.index.set_names(['Caractéristiques', None])

        # Show both tables with their titles
        print(f"TABLEAU STATISTIQUE DE {score} PAR CARACTERISTIQUES SOCIO-DEMOGRAPHIQUES")
        show(table_1)
        print(f"\n")
        print(f"TABLEAU DES DECILES DE {score} PAR CARACTERISTIQUES SOCIO-DEMOGRAPHIQUES")
        show(table_2)