In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load the customer table; the path is relative to the notebook's working directory.
customers_csv = 'Customers.csv'
df_customers = pd.read_csv(customers_csv)

# Plain-text preview of the frame (pandas truncates long/wide frames when printing).
print(df_customers)

       CustomerKey Prefix  FirstName LastName              Unnamed: 4  \
0            11000    MR.        JON     YANG            MR. JON YANG   
1            11001    MR.     EUGENE    HUANG        MR. EUGENE HUANG   
2            11002    MR.      RUBEN   TORRES        MR. RUBEN TORRES   
3            11003    MS.    CHRISTY      ZHU         MS. CHRISTY ZHU   
4            11004   MRS.  ELIZABETH  JOHNSON  MRS. ELIZABETH JOHNSON   
...            ...    ...        ...      ...                     ...   
18143        29479    MR.      TOMMY     TANG          MR. TOMMY TANG   
18144        29480   MRS.       NINA     RAJI          MRS. NINA RAJI   
18145        29481    MR.       IVAN     SURI           MR. IVAN SURI   
18146        29482    MR.    CLAYTON    ZHANG       MR. CLAYTON ZHANG   
18147        29483    MR.      JÉSUS  NAVARRO       MR. JÉSUS NAVARRO   

      Unnamed: 5 Unnamed: 6   BirthDate  Unnamed: 8 MaritalStatus Gender  \
0           YANG       YANG    4/8/1966        

In [8]:
# Convert AnnualIncome to a numeric column.
#
# Coercing the raw column directly turned every row into NaN, i.e. none of the
# stored values parse as numbers as-is. Presumably they carry currency
# formatting such as "$90,000" -- TODO confirm against Customers.csv.
# The formatting characters are therefore stripped *before* conversion so the
# original values are not destroyed by errors='coerce'.
#
# Going through str also makes the cell safe to re-run: already-numeric values
# round-trip unchanged and missing values stay NaN.
income_text = (
    df_customers['AnnualIncome']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .str.strip()
)
df_customers['AnnualIncome'] = pd.to_numeric(income_text, errors='coerce')

print(df_customers['AnnualIncome'])

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
18143   NaN
18144   NaN
18145   NaN
18146   NaN
18147   NaN
Name: AnnualIncome, Length: 18148, dtype: float64


In [10]:
from datetime import datetime
# Derive each customer's age in whole years from BirthDate.
#
# Calendar arithmetic is used instead of `days // 365`: dividing by 365 ignores
# leap days, so the computed age ticks over before the actual birthday.
#
# NOTE(review): the date strings are parsed with pandas' default inference;
# pass an explicit `format=` once the file's date layout is confirmed.
birth_dates = pd.to_datetime(df_customers['BirthDate'])
today = pd.Timestamp.now()

# True where this year's birthday has not happened yet.
birthday_pending = (birth_dates.dt.month > today.month) | (
    (birth_dates.dt.month == today.month) & (birth_dates.dt.day > today.day)
)
df_customers['Age'] = today.year - birth_dates.dt.year - birthday_pending.astype(int)

print(df_customers['Age'])

0        57
1        58
2        58
3        56
4        55
         ..
18143    65
18144    63
18145    64
18146    65
18147    64
Name: Age, Length: 18148, dtype: int64


  df_customers['Age'] = (datetime.now() - pd.to_datetime(df_customers['BirthDate'])).dt.days // 365


In [12]:
# Number of customers per MaritalStatus, saved as a PNG in the working directory.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_customers, x='MaritalStatus', ax=ax)
ax.set_title('Count of Customers by MaritalStatus')
fig.savefig('marital_status_count.png')
plt.close(fig)  # release the figure once it is on disk
print('marital_status_count.png')

marital_status_count.png


In [13]:
# Share of customers per Gender, drawn as a pie and saved as a PNG.
fig, ax = plt.subplots(figsize=(8, 8))
gender_counts = df_customers['Gender'].value_counts()
gender_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Proportion of Customers by Gender')
ax.set_ylabel('')  # a pie has no meaningful y-axis label
fig.savefig('gender_proportion.png')
plt.close(fig)
print('gender_proportion.png')

gender_proportion.png


In [14]:
# Distribution of AnnualIncome in 20 bins, saved as a PNG.
# No explicit figure is created here, so pandas draws on the current/default figure.
income_ax = df_customers['AnnualIncome'].hist(bins=20)
income_ax.set_title('Distribution of AnnualIncome')
income_ax.set_xlabel('AnnualIncome')
income_ax.set_ylabel('Frequency')
income_ax.figure.savefig('annual_income_distribution.png')
plt.close(income_ax.figure)
print('annual_income_distribution.png')

annual_income_distribution.png


In [15]:
# Mean TotalChildren for each EducationLevel, drawn as vertical bars and saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 6))
children_by_education = df_customers.groupby('EducationLevel')['TotalChildren'].mean()
children_by_education.plot(kind='bar', ax=ax)
ax.set_title('Average Number of TotalChildren by EducationLevel')
ax.set_xlabel('EducationLevel')
ax.set_ylabel('Average TotalChildren')
fig.savefig('total_children_education_level.png')
plt.close(fig)
print('total_children_education_level.png')

total_children_education_level.png


In [16]:
# Mean AnnualIncome for each Occupation, drawn as vertical bars and saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 6))
income_by_occupation = df_customers.groupby('Occupation')['AnnualIncome'].mean()
income_by_occupation.plot(kind='bar', ax=ax)
ax.set_title('Average AnnualIncome by Occupation')
ax.set_xlabel('Occupation')
ax.set_ylabel('Average AnnualIncome')
fig.savefig('average_annual_income_occupation.png')
plt.close(fig)
print('average_annual_income_occupation.png')

average_annual_income_occupation.png


In [17]:
# Share of customers per Occupation, drawn as a pie and saved as a PNG.
fig, ax = plt.subplots(figsize=(10, 10))
occupation_counts = df_customers['Occupation'].value_counts()
occupation_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Count of Customers in Each Occupation Category')
ax.set_ylabel('')  # a pie has no meaningful y-axis label
fig.savefig('occupation_category_count.png')
plt.close(fig)
print('occupation_category_count.png')

occupation_category_count.png


In [18]:
# Distribution of customer Age in 20 bins, saved as a PNG.
fig, ax = plt.subplots(figsize=(10, 6))
df_customers['Age'].hist(bins=20, ax=ax)
ax.set_title('Age Distribution of Customers')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
fig.savefig('age_distribution.png')
plt.close(fig)
print('age_distribution.png')

age_distribution.png


In [19]:
# Convert BirthDate to datetime (unparseable values become NaT) and compute
# each customer's age in whole years.
#
# The age is derived with calendar arithmetic rather than
# `.astype('<m8[Y]')`: casting a timedelta Series to a year unit is rejected
# by pandas >= 2.0 (only s/ms/us/ns resolutions are supported), and a "year"
# timedelta is an average-length year, not a calendar one.
now = pd.Timestamp('now')
df_customers['BirthDate'] = pd.to_datetime(df_customers['BirthDate'], errors='coerce')
birth_dates = df_customers['BirthDate']

# True where this year's birthday has not happened yet (False for NaT rows,
# whose age ends up NaN through the year subtraction below).
birthday_pending = (birth_dates.dt.month > now.month) | (
    (birth_dates.dt.month == now.month) & (birth_dates.dt.day > now.day)
)

# Cast to float64 so the column dtype is the same whether or not NaT rows exist.
df_customers['Age'] = (
    now.year - birth_dates.dt.year - birthday_pending.astype(int)
).astype('float64')
print(df_customers['Age'])

0        57.0
1        58.0
2        58.0
3        56.0
4        55.0
         ... 
18143    65.0
18144    63.0
18145    64.0
18146    65.0
18147    64.0
Name: Age, Length: 18148, dtype: float64


  df_customers['BirthDate'] = pd.to_datetime(df_customers['BirthDate'], errors='coerce')


In [20]:
# Clean AnnualIncome by removing currency symbols / thousands separators and
# converting to float.
#
# The values are re-read from the CSV as text instead of being taken from
# df_customers: the in-memory column has already been coerced to numeric (all
# NaN) by an earlier cell, so stripping symbols from it can no longer recover
# anything. Deriving from the raw file also makes this cell idempotent.
# Assignment aligns on the index, so rows keep their original positions.
# NOTE(review): assumes the raw values look like "$90,000" -- confirm against the file.
raw_income = pd.read_csv('Customers.csv', usecols=['AnnualIncome'], dtype=str)['AnnualIncome']

# Raw-string pattern: the previous '[\$,]' literal contained an invalid escape
# sequence; inside a character class `$` needs no escaping.
income_text = raw_income.str.replace(r'[$,]', '', regex=True).str.strip()
df_customers['AnnualIncome'] = pd.to_numeric(income_text, errors='coerce').astype(float)
print(df_customers['AnnualIncome'])

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
18143   NaN
18144   NaN
18145   NaN
18146   NaN
18147   NaN
Name: AnnualIncome, Length: 18148, dtype: float64


In [21]:
# Average AnnualIncome per 10-year age band.
#
# The band edges start from the original 0..90 range but are extended upward
# when the data contains older customers; with fixed edges, pd.cut assigns NaN
# to any age above 90 and groupby then silently drops those rows.
AGE_BAND_WIDTH = 10
oldest_age = df_customers['Age'].max()
if pd.isna(oldest_age):
    top_edge = 90  # no usable ages: fall back to the original edges
else:
    top_edge = max(90, int(np.ceil(oldest_age / AGE_BAND_WIDTH)) * AGE_BAND_WIDTH)

age_bands = pd.cut(
    df_customers['Age'],
    bins=np.arange(0, top_edge + AGE_BAND_WIDTH, AGE_BAND_WIDTH),
)

# observed=False keeps empty bands in the result (as before) and states the
# choice explicitly, since the default for categorical groupers is changing
# in newer pandas releases.
age_income = (
    df_customers
    .groupby(age_bands, observed=False)['AnnualIncome']
    .mean()
    .reset_index()
)
print(age_income)

        Age  AnnualIncome
0   (0, 10]           NaN
1  (10, 20]           NaN
2  (20, 30]           NaN
3  (30, 40]           NaN
4  (40, 50]           NaN
5  (50, 60]           NaN
6  (60, 70]           NaN
7  (70, 80]           NaN
8  (80, 90]           NaN
