### PRE PROCESSING

#### Dataset used:
[Pan-Indian Consumer Transaction Dataset](https://www.kaggle.com/datasets/kumarperiya/pan-indian-consumer-transaction-dataset)

This dataset, IndiaTransactMultiFacet2024, offers a broad and detailed panorama of consumer transactions across India, captured comprehensively across various categories including travel, entertainment, fitness & medical, and online shopping. Compiled with a focus on diversity and representativeness, it includes more than 10,000 individual records, each rich with multiple attributes such as transaction type, amount, consumer demographics, and merchant details.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [125]:
df = pd.read_csv('customer_data.csv.csv')

In [None]:
df.head

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
for column in df.columns:
    unique_vals = df[column].unique()
    print(f"Unique values in '{column} : ")
    print(unique_vals)
    print("\n")

In [133]:
# Filling the NaN values in these columns with the median value
# NOTE(review): 'is_fraud' and 'customer_id' look like a label and an identifier
# rather than measurements — confirm that median imputation is intended for them.
columns = ['is_fraud', 'customer_id', 'merch_lat', 'merch_long', 'long', 'lat', 'amt', 'city_pop']

for column in columns:
    median_value = df[column].median()
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df[column] selection: that chained form acts on an intermediate object
    # and does not update df when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(median_value)

In [134]:
# Filling the NaN values in these columns with the mode
columns = ['category', 'gender', 'city', 'state', 'job','first', 'last', 'merchant']

for column in columns:
    # mode() returns a Series (there can be ties); take its first entry.
    most_frequent = df[column].mode()[0]
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df[column] selection: that chained form acts on an intermediate object
    # and does not update df when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(most_frequent)

In [135]:
# Computing the age of the customer on the date of transaction
# Unparseable dates become NaT (errors='coerce') and yield a NaN age below.
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')

birth = df['dob']
txn = df['trans_date_trans_time']

# True where the transaction happens before the birthday in the transaction
# year, i.e. (txn month, txn day) < (birth month, birth day). Comparisons that
# involve a missing date evaluate to False.
birthday_pending = (txn.dt.month < birth.dt.month) | (
    (txn.dt.month == birth.dt.month) & (txn.dt.day < birth.dt.day)
)

# Year difference, minus one when the birthday has not occurred yet. Computed
# column-wise instead of with a row-by-row apply; rows with a missing date end
# up NaN because their year component is NaN.
df['age'] = txn.dt.year - birth.dt.year - birthday_pending.astype(int)

In [136]:
# Replace ages that could not be computed (NaN) with the median age.
median_age = df['age'].median()
# Assign back instead of fillna(inplace=True) on the df['age'] selection, which
# does not update df when pandas Copy-on-Write is active.
df['age'] = df['age'].fillna(median_age)

In [137]:
# Remove the fields that are not used further: transaction id, credit card
# number, street, and the two date fields (age has already been derived from them).
unused_columns = ['trans_id', 'cc_num', 'street', 'dob', 'trans_date_trans_time']
df = df.drop(unused_columns, axis=1)

In [None]:
# Missing values per column after imputation and column removal.
df.isnull().sum()

In [None]:
# First rows of the cleaned frame.
print(df.head())

In [None]:
# Dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Summary statistics of the numeric columns.
print(df.describe())

In [None]:
# Dtype of every remaining column.
print(df.dtypes)

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Remaining column names as a plain list.
print(df.columns.tolist())


### VISUALIZATION

In [None]:
# Box plot of customer age for each transaction category.
plt.figure(figsize=(12, 6))
age_ax = sns.boxplot(x='category', y='age', data=df)
age_ax.set_title('Transaction Category vs Age')
age_ax.set_xlabel('Transaction Category')
age_ax.set_ylabel('Age')
# Tilt the category labels.
plt.xticks(rotation=45)
plt.show()

In [None]:
 '''
fig = px.scatter(df, x='age', y='amt', color='category', hover_data=['city', 'state'], title='Age vs Amount by Category')
fig.show()
'''

In [None]:
'''
fig = px.histogram(df, x='amt', color='category', marginal='box', title='Distribution of Amount by Category')
fig.show()
'''

In [None]:
'''
fig = px.scatter_3d(df, x='age', y='amt', z='city_pop', color='category', title='3D Scatter Plot of Age, Amount, and City Population')
fig.show()
'''

In [None]:
# Transaction counts per category, with one bar per gender.
palette = dict(F='pink', M='blue')
plt.figure(figsize=(12, 6))
gender_ax = sns.countplot(x='category', hue='gender', palette=palette, data=df)
gender_ax.set_title('Transaction Category vs Gender')
gender_ax.set_xlabel('Transaction Category')
gender_ax.set_ylabel('Count')
plt.xticks(rotation=45)
gender_ax.legend(title='Gender')
plt.show()

In [None]:
# Share of the total transaction amount contributed by each category.
category_amounts = df.groupby('category')['amt'].sum()
# One viridis colour per category slice.
slice_colors = sns.color_palette('viridis', len(category_amounts))

plt.figure(figsize=(8, 8))
plt.pie(
    category_amounts,
    labels=category_amounts.index,
    autopct='%1.1f%%',
    colors=slice_colors,
)
plt.title('Amount Distribution by Transaction Category')
plt.show()

In [None]:
# Total transaction amount per state, as a two-column frame (state, amt).
state_amt = df.groupby('state', as_index=False)['amt'].sum()

# Bar chart coloured by the total itself.
bar_options = dict(
    x='state',
    y='amt',
    title='Total Amount by State',
    labels={'amt':'Total Amount', 'state':'State'},
    color='amt',
    color_continuous_scale='Viridis',
)
fig = px.bar(state_amt, **bar_options)
fig.show()

In [None]:
# State x category table holding the summed transaction amount of each pair.
state_category_pivot = pd.pivot_table(
    df,
    values='amt',
    index='state',
    columns='category',
    aggfunc='sum',
)

# Render the table as a heatmap.
fig = px.imshow(
    state_category_pivot,
    labels={'color':'Total Amount'},
    title='Heatmap of Total Amount Spent by State and Transaction Category',
    aspect="auto",
    color_continuous_scale='Viridis',
)
fig.show()

In [None]:
# One hot encoding the columns with categorical data
object_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [None]:
df.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise amount, age and the fraud flag into three *_scaled columns.
# NOTE(review): the following MinMaxScaler cell writes to the same three
# *_scaled columns, so these values are replaced when it runs — confirm which
# scaling is meant to be kept.
raw_cols = ['amt', 'age', 'is_fraud']
scaled_cols = ['amt_scaled', 'age_scaled', 'is_fraud_scaled']

scaler = StandardScaler()
df[scaled_cols] = scaler.fit_transform(df[raw_cols])


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale amount, age and the fraud flag, (re)writing the three
# *_scaled columns from the raw values.
raw_cols = ['amt', 'age', 'is_fraud']
scaled_cols = ['amt_scaled', 'age_scaled', 'is_fraud_scaled']

scaler = MinMaxScaler()
df[scaled_cols] = scaler.fit_transform(df[raw_cols])


In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Seed KMeans so the elbow curve is the same on every run (matching the
# random_state=42 used for the final model).
kmeans = KMeans(random_state=42)
# k=(2, 10) is passed through to yellowbrick as the range of cluster counts to try.
visualizer = KElbowVisualizer(kmeans, k=(2, 10))
# NOTE(review): this fits on every column of df, i.e. the raw and the *_scaled
# versions of amt/age/is_fraud plus all unscaled and one-hot columns — confirm
# that clustering on the full frame (rather than the scaled features only) is intended.
visualizer.fit(df)
visualizer.show()

In [None]:
# Fit a seeded 4-cluster KMeans on the frame and store each row's label.
model = KMeans(n_clusters=4, random_state=42)
df["Cluster"] = model.fit(df).labels_

# Per-cluster profile: mean scaled amount, row count, mean scaled age.
cluster_profile = {
    'amt_scaled': 'mean',
    'is_fraud_scaled': 'count',
    'age_scaled': 'mean',
}
df.groupby("Cluster").agg(cluster_profile).round(2)

In [None]:
# Build a frame with the three scaled attributes, a row ID and the KMeans label.
scaled_attributes = ['age_scaled', 'amt_scaled', 'is_fraud_scaled']
df_new = df[scaled_attributes].copy()
df_new = df_new.assign(ID=df.index, Cluster=model.labels_)

# Reshape to long form: one row per (ID, Cluster, Attribute) with its Value.
df_nor_melt = df_new.reset_index().melt(
    id_vars=['ID', 'Cluster'],
    value_vars=scaled_attributes,
    var_name='Attribute',
    value_name='Value',
)
df_nor_melt.head()

# Line plot of the attribute values, one line per cluster.
sns.lineplot(data=df_nor_melt, x='Attribute', y='Value', hue='Cluster')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation

# Cluster on the scaled attributes only. df_new also carries the row 'ID' and
# the KMeans 'Cluster' label; passing the whole frame would feed both of those
# into the distance computation and into the silhouette score.
af_features = df_new[['age_scaled', 'amt_scaled', 'is_fraud_scaled']]

no_of_clusters = []
preferences = range(-20000,-5000,100)
af_sil_score = [] # silhouette scores

# Try each preference value and record the cluster count and silhouette score.
for p in preferences:
    AF = AffinityPropagation(preference=p, max_iter=200).fit(af_features)
    n_found = len(np.unique(AF.labels_))
    no_of_clusters.append(n_found)
    # silhouette_score requires 2 <= number of labels <= n_samples - 1 and
    # raises otherwise; record NaN for such runs instead of aborting the sweep.
    if 2 <= n_found < len(af_features):
        af_sil_score.append(silhouette_score(af_features, AF.labels_))
    else:
        af_sil_score.append(np.nan)

af_results = pd.DataFrame({
    'preference': list(preferences),
    'clusters': no_of_clusters,
    'sil_score': af_sil_score,
})
# Best-scoring preferences first (NaN scores sort last).
af_results.sort_values(by='sil_score', ascending=False).head()

In [None]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation

# Ignore warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('customer_data.csv.csv')

# Filling the NaN values in numerical columns with the median.
# The filled column is assigned back: fillna(inplace=True) on a df[column]
# selection does not update df when pandas Copy-on-Write is active.
num_columns = ['is_fraud', 'customer_id', 'merch_lat', 'merch_long', 'long', 'lat', 'amt', 'city_pop']
for column in num_columns:
    df[column] = df[column].fillna(df[column].median())

# Filling the NaN values in categorical columns with the mode
cat_columns = ['category', 'gender', 'city', 'state', 'job', 'first', 'last', 'merchant']
for column in cat_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

# Convert dob and transaction date to datetime format (bad values become NaT)
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')

# Calculate age based on the transaction date and dob: year difference, minus
# one when the transaction falls before the birthday in that year.
df['age'] = df.apply(
    lambda row: (row['trans_date_trans_time'].year - row['dob'].year -
                 ((row['trans_date_trans_time'].month, row['trans_date_trans_time'].day) <
                  (row['dob'].month, row['dob'].day)))
    if pd.notnull(row['dob']) and pd.notnull(row['trans_date_trans_time'])
    else np.nan, axis=1
)

# Fill missing age values with the median age
df['age'] = df['age'].fillna(df['age'].median())

# Drop the identifier and raw date columns, as the step-by-step cells do.
# Left in place, the datetime columns reach KMeans below (which needs numeric
# input) and the identifier-like text columns are one-hot encoded as well.
df = df.drop(columns=['trans_id', 'cc_num', 'street', 'dob', 'trans_date_trans_time'])

# One-hot encoding for categorical variables
object_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

# Scaling features
scaler = StandardScaler()
df[['amt_scaled', 'age_scaled', 'is_fraud_scaled']] = scaler.fit_transform(df[['amt', 'age', 'is_fraud']])

# Clustering with KMeans
# NOTE(review): the model is fitted on every column of df (raw, scaled and
# one-hot columns together) — confirm this is intended rather than the scaled
# features only.
kmeans_model = KMeans(n_clusters=4, random_state=42)
df['Cluster'] = kmeans_model.fit_predict(df)

# Summary of the clustering results: mean scaled amount, row count, mean scaled age
cluster_summary = df.groupby('Cluster').agg({
    'amt_scaled': 'mean',
    'is_fraud_scaled': 'count',
    'age_scaled': 'mean'
}).round(2)
# The summary was computed but never shown; print it so the cell reports it.
print(cluster_summary)

# Affinity Propagation clustering and silhouette score calculation
df_new = df[['amt_scaled', 'age_scaled', 'is_fraud_scaled']]
preferences = range(-20000, -5000, 100)
af_sil_score = []
no_of_clusters = []

for p in preferences:
    af_model = AffinityPropagation(preference=p, max_iter=200).fit(df_new)
    n_found = len(np.unique(af_model.labels_))
    no_of_clusters.append(n_found)
    # silhouette_score requires 2 <= number of labels <= n_samples - 1 and
    # raises otherwise; record NaN for such runs instead of aborting the sweep.
    if 2 <= n_found < len(df_new):
        af_sil_score.append(silhouette_score(df_new, af_model.labels_))
    else:
        af_sil_score.append(np.nan)

# Affinity Propagation results as a DataFrame (NaN scores sort last)
af_results = pd.DataFrame({
    'preference': preferences,
    'clusters': no_of_clusters,
    'sil_score': af_sil_score
}).sort_values(by='sil_score', ascending=False)

# Display the top silhouette score results
top_af_results = af_results.head()
print(top_af_results)


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Load the dataset
df = pd.read_csv('customer_data.csv.csv')

# Fill missing values in numeric columns with median.
# The filled column is assigned back: fillna(inplace=True) on a df[column]
# selection does not update df when pandas Copy-on-Write is active.
columns_numeric = ['is_fraud', 'customer_id', 'merch_lat', 'merch_long', 'long', 'lat', 'amt', 'city_pop']
for column in columns_numeric:
    df[column] = df[column].fillna(df[column].median())

# Fill missing values in categorical columns with mode
columns_categorical = ['category', 'gender', 'city', 'state', 'job', 'first', 'last', 'merchant']
for column in columns_categorical:
    df[column] = df[column].fillna(df[column].mode()[0])

# Convert 'dob' and 'trans_date_trans_time' to datetime (bad values become NaT)
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')

# Calculate the age: year difference, minus one when the transaction falls
# before the birthday in that year; NaN when either date is missing.
df['age'] = df.apply(
    lambda row: (row['trans_date_trans_time'].year - row['dob'].year -
                 ((row['trans_date_trans_time'].month, row['trans_date_trans_time'].day) <
                  (row['dob'].month, row['dob'].day)))
    if pd.notnull(row['dob']) and pd.notnull(row['trans_date_trans_time'])
    else np.nan, axis=1
)

# Fill missing ages with the median age
df['age'] = df['age'].fillna(df['age'].median())

# Select relevant columns for clustering (numeric features)
features = ['amt', 'age', 'is_fraud', 'city_pop']

# Normalize the selected features
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[features])

# Apply DBSCAN algorithm
dbscan_model = DBSCAN(eps=0.5, min_samples=5)
df['Cluster'] = dbscan_model.fit_predict(df_scaled)

# Display the resulting clusters and their counts
cluster_counts = df['Cluster'].value_counts()
print("Cluster counts:")
print(cluster_counts)

# Optional: Calculate silhouette score (needs more than one distinct label)
# NOTE(review): DBSCAN labels noise points as -1, and that label is counted
# here like any other cluster — confirm whether noise should be excluded.
if len(set(df['Cluster'])) > 1:
    sil_score = silhouette_score(df_scaled, df['Cluster'])
    print(f"Silhouette Score: {sil_score:.2f}")
else:
    print("Silhouette score cannot be calculated because there is only one cluster.")

# Display sample records from each cluster
for cluster_label in df['Cluster'].unique():
    print(f"\nCluster {cluster_label} Sample Data:")
    print(df[df['Cluster'] == cluster_label].head())


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
df = pd.read_csv('customer_data.csv.csv')

# Fill missing values in numeric columns with median.
# The filled column is assigned back: fillna(inplace=True) on a df[column]
# selection does not update df when pandas Copy-on-Write is active.
columns_numeric = ['is_fraud', 'customer_id', 'merch_lat', 'merch_long', 'long', 'lat', 'amt', 'city_pop']
for column in columns_numeric:
    df[column] = df[column].fillna(df[column].median())

# Fill missing values in categorical columns with mode
columns_categorical = ['category', 'gender', 'city', 'state', 'job', 'first', 'last', 'merchant']
for column in columns_categorical:
    df[column] = df[column].fillna(df[column].mode()[0])

# Convert 'dob' and 'trans_date_trans_time' to datetime (bad values become NaT)
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')

# Calculate the age: year difference, minus one when the transaction falls
# before the birthday in that year; NaN when either date is missing.
df['age'] = df.apply(
    lambda row: (row['trans_date_trans_time'].year - row['dob'].year -
                 ((row['trans_date_trans_time'].month, row['trans_date_trans_time'].day) <
                  (row['dob'].month, row['dob'].day)))
    if pd.notnull(row['dob']) and pd.notnull(row['trans_date_trans_time'])
    else np.nan, axis=1
)

# Fill missing ages with the median age
df['age'] = df['age'].fillna(df['age'].median())

# Select relevant columns for clustering (numeric features)
features = ['amt', 'age', 'is_fraud', 'city_pop']

# Normalize the selected features
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[features])

# Apply K-Means clustering
kmeans_model = KMeans(n_clusters=4, random_state=42)
df['Cluster'] = kmeans_model.fit_predict(df_scaled)

# Display the resulting clusters and their counts
cluster_counts = df['Cluster'].value_counts()
print("Cluster counts:")
print(cluster_counts)

# Calculate silhouette score
sil_score = silhouette_score(df_scaled, df['Cluster'])
print(f"Silhouette Score: {sil_score:.2f}")

# Display sample records from each cluster
for cluster_label in df['Cluster'].unique():
    print(f"\nCluster {cluster_label} Sample Data:")
    print(df[df['Cluster'] == cluster_label].head())

# Visualization
plt.figure(figsize=(12, 6))
sns.scatterplot(x='amt', y='age', hue='Cluster', data=df, palette='viridis', alpha=0.6)
plt.title('K-Means Clusters by Amount and Age')
plt.xlabel('Amount')
plt.ylabel('Age')
plt.legend(title='Cluster')
plt.show()

plt.figure(figsize=(12, 6))
sns.boxplot(x='Cluster', y='amt', data=df, palette='viridis')
plt.title('Transaction Amount by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Transaction Amount')
plt.show()

plt.figure(figsize=(12, 6))
sns.boxplot(x='Cluster', y='age', data=df, palette='viridis')
plt.title('Age Distribution by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Age')
plt.show()

# Save clustering results to JSON file: per-cluster mean/std of each feature
# plus the number of rows in the cluster.
cluster_characteristics = df.groupby('Cluster').agg({
    'amt': ['mean', 'std'],
    'age': ['mean', 'std'],
    'is_fraud': ['mean', 'std'],
    'city_pop': ['mean', 'std'],
    'Cluster': 'count'
}).reset_index()

# Flatten the MultiIndex columns for JSON export. The cluster id column comes
# back from reset_index() as ('Cluster', ''), which joins to 'Cluster_';
# strip('_') removes that dangling separator so the key is plain 'Cluster'.
cluster_characteristics.columns = ['_'.join(col).strip('_') for col in cluster_characteristics.columns.values]
cluster_characteristics.rename(columns={'Cluster_count': 'Count'}, inplace=True)

# Convert DataFrame to dictionary and save as JSON
results_dict = cluster_characteristics.to_dict(orient='records')
with open('cluster_characteristics.json', 'w') as f:
    json.dump(results_dict, f, indent=4)

print("Cluster characteristics saved to 'cluster_characteristics.json'")


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Load the dataset
# (The read was missing here, so this cell silently depended on whatever df a
# previously executed cell had left behind.)
df = pd.read_csv('customer_data.csv.csv')

# Fill missing values in numeric columns with median.
# The filled column is assigned back: fillna(inplace=True) on a df[column]
# selection does not update df when pandas Copy-on-Write is active.
columns_numeric = ['is_fraud', 'customer_id', 'merch_lat', 'merch_long', 'long', 'lat', 'amt', 'city_pop']
for column in columns_numeric:
    df[column] = df[column].fillna(df[column].median())

# Fill missing values in categorical columns with mode
columns_categorical = ['category', 'gender', 'city', 'state', 'job', 'first', 'last', 'merchant']
for column in columns_categorical:
    df[column] = df[column].fillna(df[column].mode()[0])

# Convert 'dob' and 'trans_date_trans_time' to datetime (bad values become NaT)
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')

# Calculate the age: year difference, minus one when the transaction falls
# before the birthday in that year; NaN when either date is missing.
df['age'] = df.apply(
    lambda row: (row['trans_date_trans_time'].year - row['dob'].year -
                 ((row['trans_date_trans_time'].month, row['trans_date_trans_time'].day) <
                  (row['dob'].month, row['dob'].day)))
    if pd.notnull(row['dob']) and pd.notnull(row['trans_date_trans_time'])
    else np.nan, axis=1
)

# Fill missing ages with the median age
df['age'] = df['age'].fillna(df['age'].median())

# Select relevant columns for clustering (numeric features)
features = ['amt', 'age', 'is_fraud', 'city_pop']

# Normalize the selected features
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[features])

# Apply K-Means clustering
kmeans_model = KMeans(n_clusters=4, random_state=42)
df['Cluster'] = kmeans_model.fit_predict(df_scaled)

# Display the resulting clusters and their counts
cluster_counts = df['Cluster'].value_counts()
print("Cluster counts:")
print(cluster_counts)

# Calculate silhouette score
sil_score = silhouette_score(df_scaled, df['Cluster'])
print(f"Silhouette Score: {sil_score:.2f}")

# Display sample records from each cluster
for cluster_label in df['Cluster'].unique():
    print(f"\nCluster {cluster_label} Sample Data:")
    print(df[df['Cluster'] == cluster_label].head())

# Additional Characteristics: mean/std/median/range of each numeric feature,
# the value counts of is_fraud, and the number of rows per cluster.
cluster_characteristics = df.groupby('Cluster').agg({
    'amt': ['mean', 'std', 'median', lambda x: x.max() - x.min()],
    'age': ['mean', 'std', 'median', lambda x: x.max() - x.min()],
    'city_pop': ['mean', 'std', 'median', lambda x: x.max() - x.min()],
    'is_fraud': lambda x: x.value_counts().to_dict(),
    'Cluster': 'count'
}).reset_index()

# Flatten the MultiIndex columns for JSON export. The cluster id column comes
# back from reset_index() as ('Cluster', ''), which joins to 'Cluster_';
# strip('_') removes that dangling separator so the key is plain 'Cluster'.
cluster_characteristics.columns = ['_'.join(col).strip('_') for col in cluster_characteristics.columns.values]
# Give the lambda-derived columns readable names. rename() ignores keys that
# are not present, so the is_fraud entry is harmless if pandas labels that
# column differently.
cluster_characteristics.rename(columns={
    'Cluster_count': 'Count',
    'amt_<lambda_0>': 'amt_range',
    'age_<lambda_0>': 'age_range',
    'city_pop_<lambda_0>': 'city_pop_range',
    'is_fraud_<lambda>': 'is_fraud_counts',
}, inplace=True)

# Convert DataFrame to dictionary and save as JSON
results_dict = cluster_characteristics.to_dict(orient='records')
with open('cluster_characteristics.json', 'w') as f:
    json.dump(results_dict, f, indent=4)

print("Cluster characteristics saved to 'cluster_characteristics.json'")

# Visualization

# Scatter plot
plt.figure(figsize=(12, 6))
sns.scatterplot(x='amt', y='age', hue='Cluster', data=df, palette='viridis', alpha=0.6)
plt.title('K-Means Clusters by Amount and Age')
plt.xlabel('Amount')
plt.ylabel('Age')
plt.legend(title='Cluster')
plt.show()

# Boxplots
plt.figure(figsize=(12, 6))
sns.boxplot(x='Cluster', y='amt', data=df, palette='viridis')
plt.title('Transaction Amount by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Transaction Amount')
plt.show()

plt.figure(figsize=(12, 6))
sns.boxplot(x='Cluster', y='age', data=df, palette='viridis')
plt.title('Age Distribution by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Age')
plt.show()

# Pair Plot
sns.pairplot(df, hue='Cluster', vars=['amt', 'age', 'city_pop'], palette='viridis')
plt.show()

# Histograms (one overlaid histogram per cluster)
plt.figure(figsize=(14, 7))
for cluster_label in df['Cluster'].unique():
    plt.hist(df[df['Cluster'] == cluster_label]['amt'], bins=50, alpha=0.5, label=f'Cluster {cluster_label}')
plt.title('Transaction Amount Distribution by Cluster')
plt.xlabel('Transaction Amount')
plt.ylabel('Frequency')
plt.legend()
plt.show()

plt.figure(figsize=(14, 7))
for cluster_label in df['Cluster'].unique():
    plt.hist(df[df['Cluster'] == cluster_label]['age'], bins=50, alpha=0.5, label=f'Cluster {cluster_label}')
plt.title('Age Distribution by Cluster')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# PCA for dimensionality reduction and visualization
# (three components are computed; only the first two are plotted)
pca = PCA(n_components=3)
df_pca = pca.fit_transform(df_scaled)

plt.figure(figsize=(12, 6))
plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df['Cluster'], cmap='viridis', alpha=0.6)
plt.colorbar(label='Cluster')
plt.title('PCA of Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


In [None]:
import json
import google.generativeai as genai

# Configure Gemini API

# Function to read JSON file and prepare the input for Gemini API
def prepare_gemini_input(json_file_path):
    """Return the contents of a JSON file re-serialised as an indented string.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document dumped back to text with a 4-space indent,
        ready to be embedded in a prompt.
    """
    with open(json_file_path, 'r') as handle:
        return json.dumps(json.load(handle), indent=4)

# Function to interact with the Gemini API
def get_gemini_results(formatted_data, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to describe the customer segments in formatted_data.

    Args:
        formatted_data: Segmentation data as text (the JSON string produced by
            prepare_gemini_input); it is embedded verbatim in the prompt.
        model_name: Name of the Gemini model to query. Defaults to the model
            that was previously hard-coded, so existing calls are unchanged.

    Returns:
        The ``text`` attribute of the response returned by ``generate_content``.
    """
    # Define the prompt
    prompt = f"""
    Here is the detailed customer segmentation data:
    {formatted_data}

    1. Generate a Formal, Professional title for each customer segment. Make sure the titles used are not using puns or captions or anything else. An example of a valid title is "Rural Dwelling Citizen Cluster"
    2. Provide an analysis and detailed summary of each segment, including key characteristics and insights.
    3. Suggest marketing strategies for each segment.
    4. Offer recommendations for targeting and fraud prevention based on the segment characteristics.
    5. Do not give any cluster ID. Just say the cluster number and the title
    6. Also create a segment profile in a paragraph format which include a detailed psychological eval of customers of this segment
    """

    # Initialize Gemini model
    model = genai.GenerativeModel(model_name)

    # Call Gemini API
    response = model.generate_content(prompt)

    return response.text

# Path to the JSON file
json_file_path = 'cluster_characteristics.json'
api_key = 'your_gemini_api_key_here'  # Replace with your actual Gemini API key

# Hand the key to the SDK. Previously api_key was assigned but never used, so
# replacing the placeholder had no effect. While the placeholder is still in
# place nothing is configured here, exactly as before.
if api_key != 'your_gemini_api_key_here':
    genai.configure(api_key=api_key)

# Prepare input for Gemini
formatted_data = prepare_gemini_input(json_file_path)

# Get results from Gemini API
results = get_gemini_results(formatted_data)

# Print results
print("Gemini API Results:")
print(results)

# Optionally, save results to a file.
# An explicit UTF-8 encoding keeps the write from failing on platforms whose
# default encoding cannot represent every character in the generated text.
with open('gemini_results.txt', 'w', encoding='utf-8') as file:
    file.write(results)
