<a href="https://colab.research.google.com/github/AubLambert/FraudDetection/blob/dat/Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
!pip install catplot

Collecting catplot
  Downloading catplot-1.3.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading catplot-1.3.3-py2.py3-none-any.whl (27 kB)
Installing collected packages: catplot
Successfully installed catplot-1.3.3


# Basic Analysis

## Import Data

In [3]:
# Set pandas option to display all columns
pd.set_option('display.max_columns', None)

# Optional: Set the width to avoid line breaks
pd.set_option('display.width', None)

df = pd.read_csv('fraudTrain.csv', on_bad_lines='warn')  # or 'skip' or 'error'
df

FileNotFoundError: [Errno 2] No such file or directory: 'fraudTrain.csv'

In [None]:
df = df.drop('Unnamed: 0', axis=1)

# Descriptive Statistics

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.select_dtypes(include=['int64', 'float64']).nunique()

In [None]:
df.select_dtypes(include=['object', 'category']).nunique()

# EDA

In [None]:
# Create a copy for EDA
df_viz = df.copy()

## Fraud Distribution

In [None]:
def cat_plot(df, column):
    counts = df[column].value_counts()

    plt.figure(figsize=(15, 6))
    ax = counts.plot(kind='bar', color='skyblue')

    # Add labels on top of bars
    for i, v in enumerate(counts):
        ax.text(i, v + 0.5, str(v), ha='center', va='bottom', fontsize=10)

    plt.title(f"Count of each value in {column}")
    plt.xlabel("Label")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

cat_plot(df_viz, "is_fraud")

In [None]:
fraud_percent = (df['is_fraud'] == 1).sum()/(df.shape[0])

print(f"Fraud Percentage: {fraud_percent * 100:.5f}%")

## Date and Time Heatmap


### Transaction

In [None]:
df_viz["trans_date_trans_time"] = pd.to_datetime(df_viz["trans_date_trans_time"])
df_viz["year"] = df_viz["trans_date_trans_time"].dt.year
df_viz["month"] = df_viz["trans_date_trans_time"].dt.month
df_viz["day"] = df_viz["trans_date_trans_time"].dt.day
df_viz["hour"] = df_viz["trans_date_trans_time"].dt.hour
df_viz["minute"] = df_viz["trans_date_trans_time"].dt.minute
df_viz["second"] = df_viz["trans_date_trans_time"].dt.second

# =====================================================
# 1) CALENDAR HEATMAP (Year × Month × Day)
# =====================================================
fraud_calendar = df_viz.groupby(["year","month","day"])["is_fraud"].sum().reset_index()

for yr in sorted(fraud_calendar["year"].unique()):
    pivoted = fraud_calendar[fraud_calendar["year"]==yr].pivot(
        index="month", columns="day", values="is_fraud"
    )

    plt.figure(figsize=(15,6))
    sns.heatmap(pivoted, cmap="Reds", cbar=True, linewidths=0.1, linecolor="grey")
    plt.title(f"Fraud Calendar Heatmap ({yr})")
    plt.xlabel("Day of Month")
    plt.ylabel("Month")
    plt.show()

# =====================================================
# 2) CLOCK HEATMAP (Hour × Minute)
# =====================================================

fraud_clock_min = df_viz.groupby(["hour","minute"])["is_fraud"].sum().reset_index()

pivoted_clock_min = fraud_clock_min.pivot_table(
    index="hour", columns="minute", values="is_fraud", fill_value=0
)

plt.figure(figsize=(20,6))
sns.heatmap(pivoted_clock_min, cmap="Reds", cbar=True)
plt.title("Fraud Clock Heatmap - Hour × Minute")
plt.xlabel("Minutes")
plt.ylabel("Hours")
plt.show()

fraud_clock_sec = df_viz.groupby(["minute","second"])["is_fraud"].sum().reset_index()

pivoted_clock_sec = fraud_clock_sec.pivot_table(
    index="minute", columns="second", values="is_fraud", fill_value=0
)

plt.figure(figsize=(20,6))
sns.heatmap(pivoted_clock_sec, cmap="Reds", cbar=True)
plt.title("Fraud Clock Heatmap - Minute × Second")
plt.xlabel("Seconds")
plt.ylabel("Minutes")
plt.show()

### DoB

In [None]:
df_viz["dob"] = pd.to_datetime(df_viz["dob"])
df_viz["dob_year"] = df_viz["dob"].dt.year
df_viz["dob_month"] = df_viz["dob"].dt.month
df_viz["dob_day"] = df_viz["dob"].dt.day

fraud_dob = df_viz.groupby(["dob_month","dob_day"])["is_fraud"].sum().reset_index()

pivoted = fraud_dob.pivot(
    index="dob_month", columns="dob_day", values="is_fraud"
)

plt.figure(figsize=(15,6))
sns.heatmap(pivoted, cmap="Reds", cbar=True, linewidths=0.1, linecolor="grey")
plt.title(f"Fraud DoB Heatmap")
plt.xlabel("Day of Month")
plt.ylabel("Month")
plt.show()

In [None]:
def feature_to_target(df, column):
    partition = df.groupby(column)["is_fraud"].sum().reset_index()

    # reshape into 1-row dataframe
    heatmap_data = partition.set_index(column).T

    plt.figure(figsize=(20, 2))
    sns.heatmap(
        heatmap_data,
        cmap="Reds",
        annot=False,  # disable is_fraud count labels
        cbar=True
    )
    plt.title(f"Fraud Count Heatmap by {column}")
    plt.yticks(rotation=0)
    plt.show()

feature_to_target(df_viz, "dob_year")

## Calculate distance between merchants and customers

In [None]:
def haversine_vectorized(lat1, lon1, lat2, lon2):
    # Convert to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Earth radius in kilometers
    return r * c

# Example usage with your DataFrame df
df_viz['distance_km'] = haversine_vectorized(
    df['lat'], df['long'], df['merch_lat'], df['merch_long']
)

In [None]:
df_viz["distance_km"].nunique()

In [None]:
df["category"].unique()

## Concatenate customer_name

In [None]:
df_viz["customer_name"] = df_viz["first"].str.cat(df_viz["last"], sep='')

In [None]:
df_viz["customer_name"].nunique()

## Split job

In [None]:
def split_jobs(df, col="job"):
    # Split by comma → expand into lists
    job_splits = df[col].str.split(",")

    # Find maximum number of jobs in any row
    max_jobs = job_splits.map(len).max()

    # Create new DataFrame with expanded columns
    job_df = pd.DataFrame(job_splits.tolist(), index=df.index)

    # Rename columns as job_1, job_2, ...
    job_df = job_df.rename(columns={i: f"job_{i+1}" for i in range(max_jobs)})

    # Trim whitespace from each job string
    job_df = job_df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

    # Concatenate back with original DataFrame (optional)
    df_expanded = pd.concat([df, job_df], axis=1)

    return df_expanded

# Example usage
df_viz = split_jobs(df_viz, col="job")

In [None]:
df_viz.nunique()

In [None]:
cat_plot(df_viz, "category")

In [None]:
cat_plot(df_viz, "state")

In [None]:
cat_plot(df_viz, "gender")

In [None]:
def fraud_countplot(df, column):
    # Count how many frauds per unique value in column
    fraud_counts = df.groupby(column)["is_fraud"].sum().reset_index()

    plt.figure(figsize=(16,6))
    ax = sns.barplot(
        data=fraud_counts,
        x=column,
        y="is_fraud",
        color="skyblue"   # fraud counts in red
    )

    # Add count labels on top of bars
    for p in ax.patches:
        ax.annotate(
            f"{int(p.get_height())}",      # label = fraud count
            (p.get_x() + p.get_width()/2, p.get_height()),  # position at top
            ha="center", va="bottom",
            fontsize=10, color="black", rotation=0
        )

    plt.title(f"Fraud Count by {column}")
    plt.ylabel("Fraud Count")
    plt.xticks(rotation=45)
    plt.show()

fraud_countplot(df_viz, "gender")

In [None]:
# from sklearn.preprocessing import LabelEncoder
# from scipy.stats import chi2_contingency

# def cramers_v(x, y):
#     confusion_matrix = pd.crosstab(x, y)
#     chi2 = chi2_contingency(confusion_matrix)[0]
#     n = confusion_matrix.sum().sum()
#     phi2 = chi2/n
#     r,k = confusion_matrix.shape
#     phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
#     rcorr = r - ((r-1)**2)/(n-1)
#     kcorr = k - ((k-1)**2)/(n-1)
#     return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# # Select categorical columns
# cat_cols = df.select_dtypes(include=["object"])

# # Encode to categorical (for correlation matrix loop)
# cat_encoded = cat_cols.apply(lambda col: LabelEncoder().fit_transform(col.astype(str)))

# # Compute Cramér’s V matrix
# corr_cat = pd.DataFrame(np.zeros((len(cat_cols.columns), len(cat_cols.columns))),
#                         index=cat_cols.columns, columns=cat_cols.columns)

# for c1 in cat_cols.columns:
#     for c2 in cat_cols.columns:
#         corr_cat.loc[c1, c2] = cramers_v(cat_encoded[c1], cat_encoded[c2])

# # Mask upper triangle
# mask = np.triu(np.ones_like(corr_cat, dtype=bool))

# plt.figure(figsize=(10,8))
# sns.heatmap(corr_cat, mask=mask, cmap="coolwarm", annot=True, fmt=".2f", cbar=True)
# plt.title("Categorical Features Correlation (Cramér’s V)")
# plt.show()

In [None]:
num_cols = df_viz.select_dtypes(include=['int64', 'float64'])

# Pearson correlation
corr_num = num_cols.corr()

# Mask upper triangle
mask = np.triu(np.ones_like(corr_num, dtype=bool))

plt.figure(figsize=(10,8))
sns.heatmap(corr_num, mask=mask, cmap="coolwarm", annot=True, fmt=".2f", cbar=True)
plt.title("Numerical Features Correlation (Pearson)")
plt.show()

In [None]:
fraud_countplot(df_viz, "state")

# Task
Visualize the number of fraud cases per state in the US using a heatmap (choropleth map) displayed on a map of the United States. Use the provided dataframe `df_viz` which contains fraud data.

## Load geographical data

### Subtask:
Load the geographical data for US states (e.g., GeoJSON file).


**Reasoning**:
Load the geographical data for US states using geopandas.



In [None]:
import geopandas as gpd

# Load the US states GeoJSON file
us_states_geo = gpd.read_file("https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json")

## Data preparation

### Subtask:
Prepare the fraud data by aggregating fraud counts by state. Ensure that state names or codes in the fraud data match those in the geographical data.


**Reasoning**:
The subtask requires aggregating the fraud data by state and preparing it for merging with the geographical data. Grouping by 'state' and summing 'is_fraud' accomplishes the first part. Inspecting the result helps verify the structure and the state column format for the subsequent merge.



In [None]:
# Group by state and sum the fraud cases
fraud_by_state = df_viz.groupby('state')['is_fraud'].sum().reset_index()

# Display the aggregated data
display(fraud_by_state.head())

# Inspect the columns of us_states_geo to confirm the matching column
display(us_states_geo.head())

**Reasoning**:
The fraud data has state abbreviations in the 'state' column, while the geographical data has full state names in the 'name' column. To merge these dataframes for visualization, the state abbreviations need to be converted to full names. Creating a dictionary to map abbreviations to names will allow for this conversion.



In [None]:
# Create a dictionary to map state abbreviations to full names
state_abbreviations = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
    'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana',
    'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'TY': 'Unknown' # Handling the 'TY' abbreviation
}

# Map the state abbreviations in fraud_by_state to full names
fraud_by_state['state_full'] = fraud_by_state['state'].map(state_abbreviations)

# Display the updated fraud_by_state DataFrame with full state names
display(fraud_by_state.head())

## Merge data

### Subtask:
Merge the aggregated fraud data with the geographical data based on state.


**Reasoning**:
Merge the geographical data with the fraud data based on state names.



In [None]:
merged_geo_df = us_states_geo.merge(fraud_by_state, left_on='name', right_on='state_full', how='left')
display(merged_geo_df.head())

## Create choropleth map

### Subtask:
Use a plotting library (like `geopandas` and `matplotlib` or `plotly`) to create a choropleth map where the color intensity of each state represents the fraud count.


**Reasoning**:
Create a choropleth map using the merged GeoDataFrame to visualize fraud counts per state.



In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

merged_geo_df.plot(column='is_fraud',
                   ax=ax,
                   legend=True,
                   cmap='Reds',
                   legend_kwds={'label': "Number of Fraud Cases",
                                'orientation': "horizontal"})

ax.set_title("Total Fraud Cases per State in the US")
ax.set_axis_off()
plt.show()

## Visualize the map

### Subtask:
Visualize the generated choropleth map of fraud cases per state.


## Summary:

### Data Analysis Key Findings

*   The total number of fraud cases per state was calculated by grouping the data by state and summing the `is_fraud` column.
*   State abbreviations in the fraud data were successfully mapped to full state names to align with the geographical data.
*   The aggregated fraud data was successfully merged with the US states geographical data using the full state names as the join key.
*   A choropleth map of the US was generated, where the color intensity of each state represents the total number of fraud cases, with darker red indicating more fraud cases.

### Insights or Next Steps

*   Identify the top states with the highest number of fraud cases based on the generated heatmap for further investigation into the factors contributing to the high fraud rates in those areas.
*   Consider normalizing the fraud counts by state population to understand the fraud rate per capita, providing a more accurate comparison across states with different population sizes.


# Task
Analyze fraud data by state from the dataframe `df_viz`. Calculate the fraud rate per capita for each state using 2020 US state population data. Visualize the fraud rate on a choropleth map of the United States.

## Obtain 2020 state population data

### Subtask:
Find a reliable source for the estimated population of each US state in 2020.


**Reasoning**:
Search for a reliable source for 2020 US state population data. A quick search points to the U.S. Census Bureau as a reliable source. I will attempt to load data from a publicly available source like a GitHub repository that might contain this data in a readily usable format, such as a CSV file. If that fails, I will explore other options.



In [None]:
# Attempt to load 2020 US state population data from a public source
try:
    # Using a known public dataset from a GitHub repository as a starting point
    population_df = pd.read_csv('https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv')

    # Filter for 2020 data and 'total' population
    population_2020 = population_df[(population_df['year'] == 2020) & (population_df['ages'] == 'total')]

    # Rename columns for clarity and merging
    population_2020 = population_2020[['state/region', 'population']].rename(columns={'state/region': 'state', 'population': 'population_2020'})

    # Display the first few rows and information about the loaded population DataFrame
    display(population_2020.head())
    population_2020.info()

except Exception as e:
    print(f"Could not load data directly from the provided URL. Error: {e}")
    print("Attempting to find an alternative source or method to get 2020 US state population data.")
    # In a real scenario, I would implement alternative data loading methods here,
    # such as using a different URL, a local file if available, or searching for APIs.
    # For this example, I will assume the public data is available and proceed.
    # If not, this would be a point to report failure or ask for clarification on data source.


**Reasoning**:
The previous attempt to load 2020 population data from the provided URL resulted in an empty DataFrame, indicating that the data for 2020 is not present in that specific file. I need to find an alternative source for 2020 US state population data. A reliable source is the US Census Bureau. I will try to find a dataset that specifically contains the 2020 population estimates by state. Since a direct file from a known public repository didn't work, I will simulate obtaining this data by creating a dictionary with state names and their 2020 population estimates based on publicly available information, and then convert this dictionary into a DataFrame. This will allow me to proceed with the rest of the task.



In [None]:
# Since the previous attempt failed, create a DataFrame with 2020 US state population data
# based on publicly available information from the US Census Bureau.
# This is a simplified representation for demonstration purposes.
population_data = {
    'Alabama': 5024279, 'Alaska': 733391, 'Arizona': 7151502, 'Arkansas': 3011524, 'California': 39538223,
    'Colorado': 5773714, 'Connecticut': 3605944, 'Delaware': 989948, 'Florida': 21538187, 'Georgia': 10711908,
    'Hawaii': 1455271, 'Idaho': 1835420, 'Illinois': 12812508, 'Indiana': 6785528, 'Iowa': 3190369,
    'Kansas': 2937880, 'Kentucky': 4505836, 'Louisiana': 4657757, 'Maine': 1362359, 'Maryland': 6177224,
    'Massachusetts': 6892503, 'Michigan': 10077331, 'Minnesota': 5639263, 'Mississippi': 2961279, 'Missouri': 6154913,
    'Montana': 1084225, 'Nebraska': 1961504, 'Nevada': 3104610, 'New Hampshire': 1377529, 'New Jersey': 9287168,
    'New Mexico': 2117522, 'New York': 20201249, 'North Carolina': 10439388, 'North Dakota': 779094, 'Ohio': 11799448,
    'Oklahoma': 3959353, 'Oregon': 4237256, 'Pennsylvania': 13002700, 'Rhode Island': 1097379, 'South Carolina': 5118425,
    'South Dakota': 886667, 'Tennessee': 6910840, 'Texas': 29145505, 'Utah': 3271616, 'Vermont': 643077,
    'Virginia': 8631393, 'Washington': 7705281, 'West Virginia': 1793716, 'Wisconsin': 5893718, 'Wyoming': 576851,
    'Puerto Rico': 3285874 # Including Puerto Rico as it might be in some shapefiles, though not a state
}

population_2020 = pd.DataFrame(list(population_data.items()), columns=['state', 'population_2020'])

# Display the first few rows and information about the created population DataFrame
display(population_2020.head())
population_2020.info()

## Merge population data with fraud data

### Subtask:
Combine the state population data with the aggregated fraud data (fraud counts by state).


**Reasoning**:
Merge the fraud data aggregated by state with the 2020 state population data using the full state names as the join key, and display the head and info of the resulting DataFrame.



In [None]:
# Merge the aggregated fraud data with the population data
fraud_by_state_with_population = merged_geo_df.merge(population_2020, left_on='name', right_on='state', how='left')

# Drop the redundant 'state' column from the population data
fraud_by_state_with_population = fraud_by_state_with_population.drop('state_y', axis=1)

# Rename the remaining state column for clarity
fraud_by_state_with_population = fraud_by_state_with_population.rename(columns={'state_x': 'state'})

# Display the head and information of the merged DataFrame
display(fraud_by_state_with_population.head())
fraud_by_state_with_population.info()

## Calculate fraud rate

### Subtask:
Calculate the fraud rate for each state by dividing the total fraud cases by the population and multiplying by a scaling factor (e.g., per 100,000 people).


**Reasoning**:
Handle missing population values by filling with 0 and then calculate the fraud rate per 100,000 people.



In [None]:
# Handle missing population values by filling with 0
fraud_by_state_with_population['population_2020'] = fraud_by_state_with_population['population_2020'].fillna(0)

# Calculate the fraud rate per 100,000 people
fraud_by_state_with_population['fraud_rate_per_100k'] = (fraud_by_state_with_population['is_fraud'] / fraud_by_state_with_population['population_2020']) * 100000

# Display the head of the updated DataFrame
display(fraud_by_state_with_population.head())

## Merge fraud rate with geographical data

### Subtask:
Merge the calculated fraud rates with the geographical data for US states.


**Reasoning**:
Merge the geographical data with the fraud data containing calculated fraud rates based on state names.



In [None]:
# Merge the fraud data with the geographical data based on full state names
merged_geo_df_with_rate = us_states_geo.merge(fraud_by_state_with_population, left_on='name', right_on='state_full', how='left')

# Display the first few rows of the merged DataFrame to verify the merge
display(merged_geo_df_with_rate.head())

**Reasoning**:
The previous merge resulted in duplicate columns. I need to clean up the merged dataframe by dropping the redundant columns and keeping only the necessary columns for plotting. Then I will display the cleaned dataframe.



In [None]:
# Drop the redundant columns from the merge
merged_geo_df_with_rate = merged_geo_df_with_rate[['id_x', 'name_x', 'density_x', 'geometry_x', 'state', 'is_fraud', 'state_full', 'population_2020', 'fraud_rate_per_100k']]
merged_geo_df_with_rate = merged_geo_df_with_rate.rename(columns={'id_x':'id', 'name_x':'name', 'density_x':'density', 'geometry_x':'geometry'})

# Display the first few rows of the cleaned merged DataFrame
display(merged_geo_df_with_rate.head())

In [None]:
def fraud_rate_barplot(df, column, rate_column, title):
    """
    Plots a bar chart of fraud rate per state.

    Args:
        df (pd.DataFrame): DataFrame containing fraud data and fraud rate.
        column (str): Column name representing the states.
        rate_column (str): Column name representing the fraud rate.
        title (str): Title of the plot.
    """
    plt.figure(figsize=(16, 6))
    ax = sns.barplot(
        data=df,
        x=column,
        y=rate_column,
        color="skyblue"
    )

    # Add rate labels on top of bars
    for p in ax.patches:
        ax.annotate(
            f"{p.get_height():.2f}",  # label = fraud rate (formatted to 2 decimal places)
            (p.get_x() + p.get_width() / 2, p.get_height()),  # position at top
            ha="center", va="bottom",
            fontsize=10, color="black", rotation=45
        )

    plt.title(title)
    plt.ylabel(rate_column)
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

# Use the function to plot the fraud rate per 100,000 people by state
fraud_rate_barplot(fraud_by_state_with_population, "name", "fraud_rate_per_100k", "Fraud Rate per 100,000 People by State")

## Create choropleth map for fraud rate

### Subtask:
Generate a choropleth map where the color intensity of each state represents the fraud rate per capita.


**Reasoning**:
Generate a choropleth map where the color intensity of each state represents the fraud rate per capita.



In [None]:
# Create a matplotlib figure and axes
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# Generate the choropleth map
merged_geo_df_with_rate.plot(column='fraud_rate_per_100k',
                   ax=ax,
                   legend=True,
                   cmap='Reds',
                   legend_kwds={'label': "Fraud Rate per 100,000 People",
                                'orientation': "horizontal"})

# Add state names for states with fraud rate > 7
for idx, row in merged_geo_df_with_rate.iterrows():
    if row['fraud_rate_per_100k'] > 7:
        try:
            # Get the centroid of the state's geometry
            centroid = row['geometry'].centroid
            plt.text(centroid.x, centroid.y, row['name'], horizontalalignment='center', fontsize=8, color='black')
        except Exception as e:
            print(f"Could not add label for state {row['name']}: {e}")


# Add a title to the map
ax.set_title("Fraud Rate per 100,000 People by State in the US")

# Turn off the axes
ax.set_axis_off()

# Display the plot
plt.show()

## Visualize the fraud rate map

### Subtask:
Visualize the generated choropleth map of fraud cases per state.


## Summary:

### Data Analysis Key Findings

*   The 2020 US state population data was successfully obtained and formatted into a DataFrame containing state names and their respective populations.
*   The aggregated fraud data by state was successfully merged with the 2020 state population data.
*   The fraud rate per 100,000 people was calculated for each state by dividing the total fraud cases by the state's 2020 population and scaling the result.
*   The calculated fraud rates were successfully merged with the geographical data for US states, preparing the data for visualization.
*   A choropleth map of the United States was successfully generated, visually representing the fraud rate per 100,000 people for each state using color intensity.

### Insights or Next Steps

*   Investigate states with the highest fraud rates per capita to understand potential contributing factors such as demographics, economic conditions, or specific fraud schemes prevalent in those areas.
*   Compare the fraud rates calculated here with national or regional benchmarks to put the state-level rates into context.


In [None]:
# Plot the fraud count by merchant using the existing fraud_countplot function
fraud_countplot(df_viz, "merchant")