# RFM & Clustering

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the clustered transactions. The plot below reads the 'cluster',
# 'dept_name' and 'total_price' columns.
df = pd.read_csv('data.csv')

# Draw one scatter series per cluster so each cluster gets its own colour
# and its own legend entry.
colors = ['red', 'green', 'blue', 'orange', 'purple', 'brown']
fig, ax = plt.subplots()
for i, cluster in enumerate(df['cluster'].unique()):
    cluster_data = df[df['cluster'] == cluster]
    # Wrap around the colour list so that more than len(colors) clusters
    # reuse colours instead of raising IndexError.
    ax.scatter(cluster_data['dept_name'], cluster_data['total_price'],
               c=colors[i % len(colors)], label=f'Cluster {cluster}')
ax.legend()
ax.set_xlabel('Department Name')
ax.set_ylabel('Total Price')
plt.xticks(rotation=90)  # Rotate the x-axis labels for better readability
plt.show()

The code above assumes the data contains a "cluster" column indicating which cluster each data point belongs to, a "dept_name" column containing the department name for each transaction, and a "total_price" column containing the total price for each transaction.
    
   
The resulting plot will show the total price on the y-axis and the department name on the x-axis, with each cluster represented by a different color or symbol. This will allow you to visually compare the clusters and identify any patterns or differences between them based on the department they belong to.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the clustered transactions. The plot below reads the 'cluster',
# 'dept_name' and 'total_price' columns.
df = pd.read_csv('data.csv')

# Per-cluster colour, marker and point size, keyed by cluster label.
colors = {0: 'red', 1: 'green', 2: 'blue', 3: 'orange', 4: 'purple', 5: 'brown'}
markers = {0: 'o', 1: 's', 2: '^', 3: 'D', 4: 'P', 5: '*'}
sizes = {0: 30, 1: 50, 2: 70, 3: 90, 4: 110, 5: 130}

# Style used for any cluster label missing from the dictionaries above, so
# an unexpected label is still drawn instead of aborting with KeyError.
fallback_color, fallback_marker, fallback_size = 'gray', 'x', 30

# Plot the clusters using a scatter plot with different colors, markers, and sizes
fig, ax = plt.subplots()
for cluster in df['cluster'].unique():
    cluster_data = df[df['cluster'] == cluster]
    ax.scatter(cluster_data['dept_name'], cluster_data['total_price'],
               c=colors.get(cluster, fallback_color),
               marker=markers.get(cluster, fallback_marker),
               s=sizes.get(cluster, fallback_size),
               label=f'Cluster {cluster}')
ax.legend()
ax.set_xlabel('Department Name')
ax.set_ylabel('Total Price')
plt.xticks(rotation=90)  # Rotate the x-axis labels for better readability
plt.show()

The data is assumed to contain a "cluster" column indicating which cluster each data point belongs to, a "customer_id" column containing the customer ID for each transaction, a "dept_name" column containing the department name for each transaction, and a "total_price" column containing the total price for each transaction.

The resulting 3D scatter plot will show the total price on the z-axis, the customer ID on the y-axis, and the department name on the x-axis, with each cluster represented by a different color, marker, and size. This will allow you to see the relationship between the different variables and how they relate to each cluster.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the transactions that back the per-department plots.
df = pd.read_csv('data.csv')

# Apply seaborn's 'darkgrid' theme to every figure drawn below.
sns.set(style='darkgrid')

# Keep only the rows assigned to cluster 0.
in_cluster_zero = df['cluster'] == 0
cluster_data = df[in_cluster_zero]

# Draw one standalone boxplot of total_price for each department that
# occurs in cluster 0.
dept_names = cluster_data['dept_name'].unique()
for department in dept_names:
    dept_mask = cluster_data['dept_name'] == department
    plt.figure(figsize=(10, 6))
    box_ax = sns.boxplot(data=cluster_data[dept_mask], x='dept_name', y='total_price')
    box_ax.set_title(f'Cluster 0 - {department} Department')
    plt.show()


We use the unique() method to get an array of unique dept_name values and then loop over that array using a for loop. For each department, we subset the data to only include transactions for that department within Cluster 0, create a boxplot using seaborn, and set the plot title to include the department name.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the transactions from disk.
df = pd.read_csv('data.csv')

# Mean total_price per (cluster, dept_name) pair, flattened back into
# ordinary columns.
price_by_group = df.groupby(['cluster', 'dept_name'])['total_price']
avg_price_data = price_by_group.mean().reset_index()

sns.set(style='whitegrid')

# One bar chart per cluster: departments on the x-axis, mean price as bar height.
cluster_labels = avg_price_data['cluster'].unique()
for cluster in cluster_labels:
    rows = avg_price_data.loc[avg_price_data['cluster'] == cluster]
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=rows, x='dept_name', y='total_price')
    ax.set(
        title=f'Cluster {cluster} - Average Total Price by Department',
        xlabel='Department',
        ylabel='Average Total Price',
    )
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the transactions from disk.
df = pd.read_csv('data.csv')

# Mean total_price for every (cluster, dept_name) combination.
avg_price_data = (
    df.groupby(['cluster', 'dept_name'])['total_price']
    .mean()
    .reset_index()
)

sns.set(style='whitegrid')

# One point plot per cluster. sort=False keeps the clusters in the order in
# which they appear in avg_price_data.
for cluster, cluster_rows in avg_price_data.groupby('cluster', sort=False):
    plt.figure(figsize=(10, 6))
    ax = sns.pointplot(data=cluster_rows, x='dept_name', y='total_price',
                       hue='dept_name')
    ax.set_title(f'Cluster {cluster} - Average Total Price by Department')
    ax.set_xlabel('Department')
    ax.set_ylabel('Average Total Price')
    plt.show()


We use groupby() to group the original DataFrame df by both cluster and dept_name, and then calculate the average total_price using the mean() method. We store the resulting DataFrame in a variable called avg_price_data, which has columns for cluster, dept_name, and total_price. We then loop through each unique value of cluster in avg_price_data, filter avg_price_data to include only data for that cluster, and create a barplot of the average total price for each department using Seaborn's barplot() function. We also set the x-axis label to "Department" and the y-axis label to "Average Total Price" using the set_xlabel() and set_ylabel() methods on the ax object, respectively.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data into a pandas DataFrame
df = pd.read_csv('data.csv')

# Create a boxplot for each cluster showing the distribution of total price
sns.set(style='whitegrid')
# The figure size is controlled here, on the matplotlib figure.
fig, ax = plt.subplots(figsize=(10, 6))
# `height` is not a boxplot parameter: seaborn forwards unknown keywords to
# matplotlib's boxplot, which rejects it. It has been removed; `width` is the
# relative width of each box.
sns.boxplot(x='cluster', y='total_price', data=df, ax=ax, width=0.5)
ax.set_title('Total Price by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Total Price')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data into a pandas DataFrame
df = pd.read_csv('data.csv')

# Compute the average total price for each department within each cluster
avg_price_data = df.groupby(['cluster', 'dept_name'])['total_price'].mean().reset_index()

# Create a point plot for each department within each cluster showing the average total price
sns.set(style='whitegrid')

figsize = (10, 6)  # Set the same figure size for all the plots

# Largest average across all clusters; used to give every plot the same
# y-range. (This name was previously referenced without being defined.)
max_avg_price = avg_price_data['total_price'].max()

for cluster in avg_price_data['cluster'].unique():
    cluster_data = avg_price_data[avg_price_data['cluster'] == cluster]
    # pointplot has no `figsize` parameter, so the size is set on the
    # matplotlib figure and the axes are handed to seaborn explicitly.
    fig, ax = plt.subplots(figsize=figsize)
    sns.pointplot(x='dept_name', y='total_price', hue='dept_name', data=cluster_data,
                  palette=sns.color_palette('husl', cluster_data['dept_name'].nunique()),
                  ax=ax)
    ax.set_title(f'Cluster {cluster} - Average Total Price by Department')
    ax.set_xlabel('Department')
    ax.set_ylabel('Average Total Price')
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    ax.set_ylim([0, max_avg_price + 2000])
    # Label each point with its department name. Categorical x positions are
    # 0, 1, 2, ... in row order, so the enumerate index is the x coordinate.
    for position, row in enumerate(cluster_data.itertuples(index=False)):
        ax.text(position, row.total_price, row.dept_name, ha='center', va='bottom')
    plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the transactions from disk.
df = pd.read_csv('data.csv')

# Mean total_price for every (cluster, dept_name) combination.
price_by_group = df.groupby(['cluster', 'dept_name'])['total_price']
avg_price_data = price_by_group.mean().reset_index()

sns.set(style='whitegrid')

# One 12x8 point plot per cluster, all sharing a fixed 0-5000 y-range.
for cluster in avg_price_data['cluster'].unique():
    is_current = avg_price_data['cluster'] == cluster
    cluster_data = avg_price_data[is_current]

    # One 'husl' colour per department present in this cluster.
    n_departments = len(cluster_data['dept_name'].unique())
    dept_palette = sns.color_palette('husl', n_departments)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax = sns.pointplot(data=cluster_data, x='dept_name', y='total_price',
                       hue='dept_name', palette=dept_palette, ax=ax)
    ax.set(
        title=f'Cluster {cluster} - Average Total Price by Department',
        xlabel='Department',
        ylabel='Average Total Price',
    )
    # Place the legend outside the axes, at the top-right corner.
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    # Tilt and shrink the department labels; enlarge the price labels.
    ax.tick_params(axis='x', labelrotation=45, labelsize=10)
    ax.tick_params(axis='y', labelsize=12)
    ax.set_ylim(0, 5000)
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data into a pandas DataFrame
df = pd.read_csv('data.csv')

# Compute the average total price for each department within each cluster
avg_price_data = df.groupby(['cluster', 'dept_name'])['total_price'].mean().reset_index()

# Create a point plot for each department within each cluster showing the average total price
sns.set(style='whitegrid')

# Define the class names for the clusters
class_names = ['Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5', 'Class 6']

# Compute the cluster order once and size the palette to it. hue_order lists
# every cluster, so the palette needs one colour per cluster; this also gives
# each cluster the same colour in every figure. (A one-colour palette was
# previously paired with the full hue_order.)
cluster_order = avg_price_data['cluster'].unique()
cluster_palette = sns.color_palette('husl', len(cluster_order))

for i, cluster in enumerate(cluster_order):
    cluster_data = avg_price_data[avg_price_data['cluster'] == cluster]
    plt.figure(figsize=(10, 6))
    ax = sns.pointplot(x='dept_name', y='total_price', hue='cluster', data=cluster_data,
                       palette=cluster_palette, hue_order=cluster_order)
    # Fall back to the raw cluster label if there are more clusters than
    # predefined class names, instead of raising IndexError.
    title_name = class_names[i] if i < len(class_names) else f'Cluster {cluster}'
    ax.set_title(f'{title_name} - Average Total Price by Department')
    ax.set_xlabel('Department')
    ax.set_ylabel('Average Total Price')
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.show()


High spenders: customers who consistently spend a lot across all departments

Bargain hunters: customers who mainly buy from low-priced departments

Occasional shoppers: customers who make infrequent purchases across various departments

Health-conscious: customers who mainly purchase from the health and wellness department

Luxury seekers: customers who frequently purchase from high-end departments such as jewelry or home decor

Pet owners: customers who frequently purchase from the pet care department

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data into a pandas DataFrame
df = pd.read_csv('data.csv')

# Compute the average total price for each department within each cluster
avg_price_data = df.groupby(['cluster', 'dept_name'])['total_price'].mean().reset_index()

# Define the class names for each cluster
class_names = {0: 'Class A', 1: 'Class B', 2: 'Class C', 3: 'Class D', 4: 'Class E', 5: 'Class F'}

# Create a point plot for each department within each cluster showing the average total price
sns.set(style='whitegrid')

figsize = (10, 6)  # Set the same figure size for all the plots

# Largest average across all clusters; used to give every plot the same
# y-range. (This name was previously referenced without being defined.)
max_avg_price = avg_price_data['total_price'].max()

for cluster in avg_price_data['cluster'].unique():
    cluster_data = avg_price_data[avg_price_data['cluster'] == cluster]
    # pointplot has no `figsize` parameter, so the size is set on the
    # matplotlib figure and the axes are handed to seaborn explicitly.
    fig, ax = plt.subplots(figsize=figsize)
    sns.pointplot(x='dept_name', y='total_price', hue='dept_name', data=cluster_data,
                  palette=sns.color_palette('husl', cluster_data['dept_name'].nunique()),
                  ax=ax)
    ax.set_title(f'{class_names[cluster]} - Average Total Price by Department')
    ax.set_xlabel('Department')
    ax.set_ylabel('Average Total Price')
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    ax.set_ylim([0, max_avg_price + 2000])
    # Label each point with its department name. Categorical x positions are
    # 0, 1, 2, ... in row order, so the enumerate index is the x coordinate.
    for position, row in enumerate(cluster_data.itertuples(index=False)):
        ax.text(position, row.total_price, row.dept_name, ha='center', va='bottom')
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data into a pandas DataFrame
df = pd.read_csv('data.csv')

# Define the class names for each cluster
class_names = {0: 'Class A', 1: 'Class B', 2: 'Class C', 3: 'Class D', 4: 'Class E', 5: 'Class F'}

# Count the number of occurrences of each age within each class
age_counts = df.groupby(['age', 'cluster']).size().reset_index(name='count')

# Pivot the data to have one column per cluster and age as index
age_counts_pivot = age_counts.pivot(index='age', columns='cluster', values='count')

# Create a bar plot of age distribution with class names
sns.set(style='whitegrid')

# DataFrame.plot opens its own figure unless it is given an axes, which left
# the 12x6 figure empty and its size unused. Create the axes explicitly and
# pass them in so the chart is drawn at the intended size.
fig, ax = plt.subplots(figsize=(12, 6))
age_counts_pivot.plot(kind='bar', stacked=True, width=0.8, ax=ax)
ax.set_title('Age Distribution by Class')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=0)
# Legend entries follow the pivot's column order, one per cluster.
ax.legend([class_names[cluster] for cluster in age_counts_pivot.columns], loc='upper right')
plt.show()


We first count the number of occurrences of each age within each class using the groupby() function. Then, we pivot the data to have class names as columns and age as the index. This allows us to create a bar plot with stacked bars representing the age distribution for each class. We set the title, x-label, and y-label accordingly. The x-tick labels represent the different age values, and the legend represents the class names.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the customer records from disk.
df = pd.read_csv('data.csv')

sns.set(style='whitegrid')

# Grouped bar chart: one group of bars per gender value, one bar per cluster
# within each group, bar height = number of rows.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='gender', hue='cluster')
ax.set_title('Gender Distribution by Cluster')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.legend(title='Cluster')
plt.show()


Here, the countplot function is used to create a bar chart. The x parameter specifies the variable to be plotted on the x-axis (in this case, 'gender'), and the hue parameter specifies the variable used for grouping the bars (in this case, 'cluster'). The data is passed as the data parameter.

You can customize the plot further by adjusting the figure size (plt.figure(figsize=(10, 6))), setting the title and axis labels, and adding a legend.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the customer records from disk.
df = pd.read_csv('data.csv')

# Display name for each cluster label.
class_names = {0: 'Class A', 1: 'Class B', 2: 'Class C', 3: 'Class D', 4: 'Class E', 5: 'Class F'}

# The ten most frequent job titles over the whole dataset.
title_frequencies = df['job_title'].value_counts()
top_job_titles = title_frequencies.nlargest(10)

# Restrict the data to rows whose job title is one of those ten.
has_top_title = df['job_title'].isin(top_job_titles.index)
df_top_jobs = df[has_top_title]

# Number of rows per (cluster, job_title) pair.
job_title_counts = (
    df_top_jobs.groupby(['cluster', 'job_title'])
    .size()
    .reset_index(name='count')
)

sns.set(style='whitegrid')

# One horizontal bar chart per cluster. sort=False keeps the clusters in the
# order in which they appear in job_title_counts.
for cluster, cluster_rows in job_title_counts.groupby('cluster', sort=False):
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=cluster_rows, x='count', y='job_title', color='steelblue')
    ax.set_title(f'Top 10 Job Titles - Cluster {class_names[cluster]}')
    ax.set_xlabel('Count')
    ax.set_ylabel('Job Title')
    plt.xticks(rotation=45, ha='right')  # Tilt the count-axis tick labels
    plt.show()

We first obtain the top 10 job titles with the highest counts using value_counts() and nlargest(10). Then, we filter the original data to include only the customers with those top job titles. Next, we group the filtered data by cluster and job title and count the occurrences. Finally, we create a bar chart for each cluster, showing the top job titles and their respective counts.

Alternatively, we first group the data by cluster and job title and count the occurrences. Then, we pivot the data to have cluster as the columns and job title as the index. This allows us to create a bar plot with stacked bars representing the job title distribution for each cluster. We set the title, x-label, and y-label accordingly. The x-tick labels represent the different job titles, and the legend represents the cluster names.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the customer records from disk.
# NOTE(review): this cell groups by a 'class_name' column, while the other
# cells use 'cluster' - confirm that data.csv actually provides it.
df = pd.read_csv('data.csv')

# The ten most frequent job titles over the whole dataset.
title_frequencies = df['job_title'].value_counts()
top_job_titles = title_frequencies.nlargest(10)

# Restrict the data to rows whose job title is one of those ten.
has_top_title = df['job_title'].isin(top_job_titles.index)
df_top_jobs = df[has_top_title]

# Number of rows per (class_name, job_title) pair.
job_title_counts = (
    df_top_jobs.groupby(['class_name', 'job_title'])
    .size()
    .reset_index(name='count')
)

# Bar order shared by all charts: the top titles in nlargest() order.
desired_order = list(top_job_titles.index)

sns.set(style='whitegrid')

# One horizontal bar chart per class, bars arranged in desired_order.
for class_name in job_title_counts['class_name'].unique():
    class_rows = job_title_counts.loc[job_title_counts['class_name'] == class_name]
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=class_rows, x='count', y='job_title',
                     order=desired_order, color='steelblue')
    ax.set_title(f'Top 10 Job Titles - {class_name}')
    ax.set_xlabel('Count')
    ax.set_ylabel('Job Title')
    plt.xticks(rotation=45, ha='right')  # Tilt the count-axis tick labels
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the customer records from disk.
# NOTE(review): this cell groups by a 'class_name' column, while the other
# cells use 'cluster' - confirm that data.csv actually provides it.
df = pd.read_csv('data.csv')

# The ten most frequent job titles over the whole dataset.
top_job_titles = df['job_title'].value_counts().nlargest(10)

# Restrict the data to rows whose job title is one of those ten.
keep_rows = df['job_title'].isin(top_job_titles.index)
df_top_jobs = df[keep_rows]

# Number of rows per (class_name, job_title) pair.
pair_sizes = df_top_jobs.groupby(['class_name', 'job_title']).size()
job_title_counts = pair_sizes.reset_index(name='count')

sns.set(style='whitegrid')

# One horizontal bar chart per class, bars in the order the rows appear.
class_labels = job_title_counts['class_name'].unique()
for class_name in class_labels:
    class_rows = job_title_counts[job_title_counts['class_name'] == class_name]
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=class_rows, x='count', y='job_title', color='steelblue')
    ax.set_title(f'Top 10 Job Titles - {class_name}')
    ax.set_xlabel('Count')
    ax.set_ylabel('Job Title')
    plt.xticks(rotation=45, ha='right')  # Tilt the count-axis tick labels
    plt.show()