In [26]:
# Import necessary libraries
import pandas as pd

# Read the CAFC extract (date range and extraction date are encoded in the
# file name) into the working DataFrame.
RAW_CSV_PATH = 'Resources/cafc-open-gouv-database-2021-01-01-to-2025-03-31-extracted-2025-04-01.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [27]:
# Show the column labels of the freshly loaded frame.
df.columns

Index(['Numéro d'identification / Number ID', 'Date Received / Date reçue',
       'Complaint Received Type', 'Type de plainte reçue', 'Country', 'Pays',
       'Province/State', 'Province/État',
       'Fraud and Cybercrime Thematic Categories',
       'Catégories thématiques sur la fraude et la cybercriminalité',
       'Solicitation Method', 'Méthode de sollicitation', 'Gender', 'Genre',
       'Language of Correspondence', 'Langue de correspondance',
       'Victim Age Range / Tranche d'âge des victimes', 'Complaint Type',
       'Type de plainte', 'Number of Victims / Nombre de victimes',
       'Dollar Loss /pertes financières'],
      dtype='object')

In [28]:
# Show the dtype pandas inferred for each column on load.
df.dtypes

Numéro d'identification / Number ID                             int64
Date Received / Date reçue                                     object
Complaint Received Type                                        object
Type de plainte reçue                                          object
Country                                                        object
Pays                                                           object
Province/State                                                 object
Province/État                                                  object
Fraud and Cybercrime Thematic Categories                       object
Catégories thématiques sur la fraude et la cybercriminalité    object
Solicitation Method                                            object
Méthode de sollicitation                                       object
Gender                                                         object
Genre                                                          object
Language of Correspo

In [29]:
# Keep only the English-language columns: every label listed here is the
# French counterpart of an English column that remains in the frame.
french_duplicate_cols = [
    'Type de plainte reçue',
    'Pays',
    'Province/État',
    'Catégories thématiques sur la fraude et la cybercriminalité',
    'Méthode de sollicitation',
    'Genre',
    'Langue de correspondance',
    'Type de plainte',
]
df = df.drop(columns=french_duplicate_cols)

In [30]:
# Convert the dollar-loss column to float.
loss_col = 'Dollar Loss /pertes financières'

# Strip '$' and ',' first; inside a character class '$' is a literal, so it
# needs no escaping.
loss_without_symbols = df[loss_col].replace('[$,]', '', regex=True)
df[loss_col] = loss_without_symbols.astype(float)

In [31]:
# Fill missing values with the 'Unknown' label, but only in non-numeric
# columns. A blanket df.fillna('Unknown') would also write the string into
# numeric columns (e.g. the float 'Dollar Loss' column), silently turning
# them into object dtype and breaking sums/means on them later. Missing
# numeric values are therefore left as NaN.
non_numeric_cols = df.select_dtypes(exclude='number').columns
df = df.fillna(dict.fromkeys(non_numeric_cols, 'Unknown'))

# Display the first few rows of the cleaned dataset
df.head()

Unnamed: 0,Numéro d'identification / Number ID,Date Received / Date reçue,Complaint Received Type,Country,Province/State,Fraud and Cybercrime Thematic Categories,Solicitation Method,Gender,Language of Correspondence,Victim Age Range / Tranche d'âge des victimes,Complaint Type,Number of Victims / Nombre de victimes,Dollar Loss /pertes financières
0,1,2021-01-02,CAFC Website,Canada,Saskatchewan,Merchandise,Other/unknown,Not Available,Not Available,'Not Available / non disponible,Attempt,0,0.0
1,2,2021-01-02,CAFC Website,Not Specified,Not Specified,Merchandise,Internet,Not Available,Not Available,'Not Available / non disponible,Victim,1,1000.0
2,3,2021-01-02,CAFC Website,Canada,Quebec,Identity Fraud,Other/unknown,Male,French,'40 - 49,Victim,1,0.0
3,4,2021-01-02,CAFC Website,Canada,Saskatchewan,Phishing,Email,Male,English,'30 - 39,Victim,1,0.0
4,5,2021-01-02,CAFC Website,Canada,Saskatchewan,Merchandise,Other/unknown,Male,Not Available,'60 - 69,Victim,1,222.73


In [32]:
# Summarise row count, non-null counts and dtypes after the missing-value fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328649 entries, 0 to 328648
Data columns (total 13 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   Numéro d'identification / Number ID            328649 non-null  int64  
 1   Date Received / Date reçue                     328649 non-null  object 
 2   Complaint Received Type                        328649 non-null  object 
 3   Country                                        328649 non-null  object 
 4   Province/State                                 328649 non-null  object 
 5   Fraud and Cybercrime Thematic Categories       328649 non-null  object 
 6   Solicitation Method                            328649 non-null  object 
 7   Gender                                         328649 non-null  object 
 8   Language of Correspondence                     328649 non-null  object 
 9   Victim Age Range / Tranche d'âge des 

In [33]:
# Remove every complaint whose type is recorded as 'Incomplete'.
is_incomplete = df['Complaint Type'] == 'Incomplete'
df = df.drop(index=df.index[is_incomplete.to_numpy()])

In [34]:
# Re-check the row count after removing the 'Incomplete' complaints.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 328176 entries, 0 to 328648
Data columns (total 13 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   Numéro d'identification / Number ID            328176 non-null  int64  
 1   Date Received / Date reçue                     328176 non-null  object 
 2   Complaint Received Type                        328176 non-null  object 
 3   Country                                        328176 non-null  object 
 4   Province/State                                 328176 non-null  object 
 5   Fraud and Cybercrime Thematic Categories       328176 non-null  object 
 6   Solicitation Method                            328176 non-null  object 
 7   Gender                                         328176 non-null  object 
 8   Language of Correspondence                     328176 non-null  object 
 9   Victim Age Range / Tranche d'âge des victi

In [35]:
# Remove rows that carry no usable classification: both the complaint type
# and the thematic category are 'Unknown' (the label used when filling
# missing values).
unknown_type = df['Complaint Type'] == 'Unknown'
unknown_category = df['Fraud and Cybercrime Thematic Categories'] == 'Unknown'
rows_to_remove = df.index[(unknown_type & unknown_category).to_numpy()]
df = df.drop(index=rows_to_remove)

In [36]:
# Re-check the row count after removing rows with unknown type and category.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327560 entries, 0 to 328648
Data columns (total 13 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   Numéro d'identification / Number ID            327560 non-null  int64  
 1   Date Received / Date reçue                     327560 non-null  object 
 2   Complaint Received Type                        327560 non-null  object 
 3   Country                                        327560 non-null  object 
 4   Province/State                                 327560 non-null  object 
 5   Fraud and Cybercrime Thematic Categories       327560 non-null  object 
 6   Solicitation Method                            327560 non-null  object 
 7   Gender                                         327560 non-null  object 
 8   Language of Correspondence                     327560 non-null  object 
 9   Victim Age Range / Tranche d'âge des victi

In [37]:
# Shorten the bilingual headers to their English part only.
english_names = {
    "Numéro d'identification / Number ID": "Number ID",
    "Date Received / Date reçue": "Date Received",
    "Victim Age Range / Tranche d'âge des victimes": "Victim Age Range",
    "Number of Victims / Nombre de victimes": "Number of Victims",
    "Dollar Loss /pertes financières": "Dollar Loss",
}
df = df.rename(columns=english_names)


In [38]:
# Confirm the shortened column names after the rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327560 entries, 0 to 328648
Data columns (total 13 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Number ID                                 327560 non-null  int64  
 1   Date Received                             327560 non-null  object 
 2   Complaint Received Type                   327560 non-null  object 
 3   Country                                   327560 non-null  object 
 4   Province/State                            327560 non-null  object 
 5   Fraud and Cybercrime Thematic Categories  327560 non-null  object 
 6   Solicitation Method                       327560 non-null  object 
 7   Gender                                    327560 non-null  object 
 8   Language of Correspondence                327560 non-null  object 
 9   Victim Age Range                          327560 non-null  object 
 10  Complaint Type           

In [39]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
CLEANED_CSV_PATH = 'Resources/cleaned_data.csv'
df.to_csv(CLEANED_CSV_PATH, index=False)