In [1]:
import warnings
warnings.filterwarnings("ignore")   #to avoid warnings

import pandas as pd
from scipy.stats import norm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

In [2]:
df = pd.read_csv("C:\\Users\\91863\\Desktop\\ML\\data\\CardioGoodFitness.csv")
df.head()

Unnamed: 0,Product,Age,Gender,Education,MaritalStatus,Usage,Fitness,Income,Miles
0,TM195,18,Male,14,Single,3,4,29562,112
1,TM195,19,Male,15,Single,2,3,31836,75
2,TM195,19,Female,14,Partnered,4,3,30699,66
3,TM195,19,Male,12,Single,3,3,32973,85
4,TM195,20,Male,13,Partnered,4,2,35247,47


# Data Description

product = Categorical unordered

Age = Numerical ordinal,continous

Gender = Categorical unordered

Education = Numeric ordinal, discrete

MaritalStatus = Categorical unordered

Usages = Numeric ordinal,discrete

Fitness = Numerical ordinal, discrete

Income = Numeric ordinal, continous

In [3]:
df.shape

(180, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product        180 non-null    object
 1   Age            180 non-null    int64 
 2   Gender         180 non-null    object
 3   Education      180 non-null    int64 
 4   MaritalStatus  180 non-null    object
 5   Usage          180 non-null    int64 
 6   Fitness        180 non-null    int64 
 7   Income         180 non-null    int64 
 8   Miles          180 non-null    int64 
dtypes: int64(6), object(3)
memory usage: 12.8+ KB


In [5]:
df.describe()

Unnamed: 0,Age,Education,Usage,Fitness,Income,Miles
count,180.0,180.0,180.0,180.0,180.0,180.0
mean,28.788889,15.572222,3.455556,3.311111,53719.577778,103.194444
std,6.943498,1.617055,1.084797,0.958869,16506.684226,51.863605
min,18.0,12.0,2.0,1.0,29562.0,21.0
25%,24.0,14.0,3.0,3.0,44058.75,66.0
50%,26.0,16.0,3.0,3.0,50596.5,94.0
75%,33.0,16.0,4.0,4.0,58668.0,114.75
max,50.0,21.0,7.0,5.0,104581.0,360.0


In [6]:
#check NaN value
df.isna().sum()

Product          0
Age              0
Gender           0
Education        0
MaritalStatus    0
Usage            0
Fitness          0
Income           0
Miles            0
dtype: int64

In [7]:
#visualize the quantities of unique values in each features
unique = {}
for col in df:
    unique[df[col].name] = df[col].nunique()
    
unique_val = pd.DataFrame.from_dict(unique, orient = 'index')
unique_val.columns = ['count']
unique_val

Unnamed: 0,count
Product,3
Age,32
Gender,2
Education,8
MaritalStatus,2
Usage,6
Fitness,5
Income,62
Miles,37


In [10]:
#playing around to find the total amount of income per gender
data_count = df.groupby(['Product', 'Gender'])['Income'].sum()
data_count_i = data_count.reset_index()

data_count_bar = data_count_i.groupby(['Gender'])['Income'].sum()
data_count_bar

Gender
Female    3786997
Male      5882527
Name: Income, dtype: int64

In [12]:
#Playing around to find the total count of male and female per gender.
data_count = df.groupby(['Product','Gender'])['Income'].count()
data_count_i = data_count.reset_index()
data_count_i.columns = ['Product', 'Gender', 'Count']
data_count_i

data_count_bar_sum = data_count_i.groupby(['Gender'])['Count'].sum()
data_count_bar_sum

Gender
Female     76
Male      104
Name: Count, dtype: int64

In [13]:
#Female average income
data_count_bar['Female']/data_count_bar_sum['Female']

49828.90789473684

In [14]:
#male average income
data_count_bar['Female']/data_count_bar_sum['Female']

49828.90789473684

## Understanding customer profile and performing univariate and multi-variate analysis

In [18]:
#Filtering the data based on products types, building a dictionary with it .
product_TM195 = df[df['Product'] == 'TM195']
product_TM498 = df[df['Product'] == 'TM498']
product_TM798 = df[df['Product'] == 'TM798']

dict_products = {'TM198': product_TM195, 'TM498' : product_TM498, 'TM798' : product_TM798}

In [19]:
for items in dict_products:
    print(items,'\n')
    print(dict_products[items].groupby(['Gender'])['Gender'].count())
    print('\n')
    

TM198 

Gender
Female    40
Male      40
Name: Gender, dtype: int64


TM498 

Gender
Female    29
Male      31
Name: Gender, dtype: int64


TM798 

Gender
Female     7
Male      33
Name: Gender, dtype: int64


