In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

data = pd.read_csv('dataset.csv')
data.info()
pd.options.display.max_rows = None
pd.options.display.max_columns = None



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1026 entries, 0 to 1025
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                1026 non-null   int64 
 1   Marital Status    1026 non-null   object
 2   Gender            1026 non-null   object
 3    Income           1025 non-null   object
 4   Children          1026 non-null   object
 5   Education         1017 non-null   object
 6   Occupation        1024 non-null   object
 7   Home Owner        1026 non-null   object
 8   Cars              1026 non-null   int64 
 9   Commute Distance  1026 non-null   object
 10  Region            1024 non-null   object
 11  Age               1026 non-null   int64 
 12  Purchased Bike    1026 non-null   object
dtypes: int64(3), object(10)
memory usage: 104.3+ KB


In [3]:
data.sample(10)

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
721,13287,S,M,"$110,000",4,Bachelors,Management,Yes,4,5-10 Miles,North America,42,Yes
195,25559,S,M,"$20,000",0,Bachelors,Clerical,Yes,0,0-1 Miles,Pacific,25,Yes
244,19057,M,F,"$120,000",3,Bachelors,Management,No,2,10+ Miles,Europe,52,Yes
344,17848,S,M,"$30,000",0,Partial College,Clerical,No,1,2-5 Miles,Europe,31,Yes
214,25553,M,M,"$30,000",1,Bachelors,Clerical,Yes,0,0-1 Miles,Europe,65,Yes
63,16185,S,M,"$60,000",4,Bachelors,Professional,Yes,3,10+ Miles,Pacific,41,No
938,27740,M,F,"$40,000",0,High School,Skilled Manual,Yes,2,5-10 Miles,North America,27,No
633,22088,M,F,"$130,000",1,Bachelors,Management,Yes,2,0-1 Miles,North America,45,Yes
625,22127,M,M,"$60,000",3,Graduate Degree,Management,Yes,2,1-2 Miles,North America,67,No
308,28758,M,M,"$40,000",2,Partial College,Clerical,Yes,1,1-2 Miles,Europe,35,Yes


Exploring the data 

In [4]:
# the first 5 rows of the data
print(data.head()) 

# summary information about the data including data types and non-null values
print(data.info())


# the column names of the data
print(data.columns)

# descripe statistics of the data
print(data.describe(), "\n")

      ID Marital Status Gender    Income  Children        Education  \
0  12496              M      F   $40,000         1        Bachelors   
1  24107              M      M   $30,000         3  Partial College   
2  14177        married   Male   $80,000         5  Partial College   
3  24381              S   Male   $70,000         0        Bachelors   
4  25597              S      M        NaN        0        Bachelors   

       Occupation Home Owner  Cars Commute Distance   Region  Age  \
0  Skilled Manual        Yes     0        0-1 Miles   Europe   42   
1        Clerical        Yes     1        0-1 Miles   Europe   43   
2    Professional         No     2        2-5 Miles   Europe   60   
3    Professional        Yes     1       5-10 Miles  Pacific   41   
4        Clerical         No     0        0-1 Miles   Europe   36   

  Purchased Bike  
0              N  
1             No  
2             No  
3              Y  
4            Yes  
<class 'pandas.core.frame.DataFrame'>
RangeI

In [5]:
# Function to fix 'Marital Status' column values

def fix_Marital(status):
    return "married" if str(status).strip().lower() in ["m", "married"] else "single"


data["Marital Status"]=data["Marital Status"].apply(fix_Marital) # apply the function to the column
# check the unique values in the 'Marital Status' column
print(data["Marital Status"].unique())

# we will remove $ and , from the 'Income' column
data[' Income '] = data[' Income '].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False)
# we must convert the 'Income' column to numeric type for analysis after cleaning the data not before

# convert income data type to numeric 
data[' Income '] = pd.to_numeric(data[' Income '], errors='coerce')

print (data[' Income '].head(10)) # check the first 5 rows of the 'Income' column

['married' 'single']
0     40000.0
1     30000.0
2     80000.0
3     70000.0
4         NaN
5     10000.0
6    160000.0
7     40000.0
8     20000.0
9    120000.0
Name:  Income , dtype: float64


In [6]:
data.sample(3) # check the data types of the columns after cleaning

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
689,16725,married,M,30000.0,0,High School,Skilled Manual,Yes,2,5-10 Miles,North America,26,No
90,26886,single,F,30000.0,0,Partial College,Clerical,No,1,0-1 Miles,Europe,29,Yes
638,18949,single,M,70000.0,0,Graduate Degree,Management,Yes,2,5-10 Miles,North America,74,Yes


In [7]:
# Function to standardize 'Children' column values
def fix_Children(a):
    lookup = {
        "zero": 0,          "0": 0,
        "one": 1,           "1": 1,
        "two": 2,           "2": 2,
        "three": 3,         "3": 3,
        "four": 4,          "4": 4,
        "five": 5,           "5": 5,
        "six": 6,             "6": 6
    }
    return lookup.get(str(a).strip().lower(), 0) # default to 0 if not found

    
data['Children' ] = data['Children'].apply(fix_Children).astype(int)

print(data['Children'].unique()) # check the unique values in the 'Children' column




[1 3 5 0 2 4]


In [8]:
data.sample(5) # check the first 10 rows of the data after cleaning

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
1022,19273,married,F,20000.0,2,Partial College,Manual,Yes,0,0-1 Miles,Europe,63,No
526,15382,married,F,110000.0,1,Bachelors,Management,Yes,2,1-2 Miles,North America,44,No
737,18504,married,M,70000.0,2,Partial High School,Skilled Manual,No,2,1-2 Miles,North America,49,No
985,13920,single,F,50000.0,4,Bachelors,Skilled Manual,Yes,2,0-1 Miles,North America,42,No
922,16895,married,F,40000.0,3,Partial College,Professional,No,2,1-2 Miles,North America,54,Yes


In [9]:

# Function to standardize 'Purchased Bike' column values
def fix_PurchasedBike(a):
    a_lower = str(a).lower().strip()  # Convert to string, lowercase, and strip whitespace
    if a_lower in ["y", "yes"]:
        return 1
    else:
        return 0  # Default to 0 for any other value even if there is nothing in the cell

data['Purchased Bike'] = data['Purchased Bike'].apply(fix_PurchasedBike)





In [10]:
data.sample(20) # check the first 10 rows of the data after cleaning

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
80,20828,married,F,30000.0,4,Graduate Degree,Clerical,Yes,0,0-1 Miles,Europe,45,1
791,18363,married,M,40000.0,0,High School,Skilled Manual,Yes,2,5-10 Miles,North America,28,1
528,11935,single,F,30000.0,0,Partial College,Skilled Manual,Yes,1,5-10 Miles,North America,28,0
93,15608,single,F,30000.0,0,Partial College,Clerical,No,1,2-5 Miles,Europe,33,0
510,18613,single,M,70000.0,0,Bachelors,Professional,No,1,2-5 Miles,North America,37,1
671,22252,single,F,60000.0,1,Graduate Degree,Professional,Yes,0,2-5 Miles,North America,36,1
335,24369,married,M,80000.0,5,Graduate Degree,Management,No,2,0-1 Miles,Pacific,39,0
1010,27183,single,M,40000.0,2,Partial College,Clerical,Yes,1,1-2 Miles,Europe,35,1
810,20376,single,F,70000.0,3,Graduate Degree,Management,Yes,2,5-10 Miles,North America,52,1
267,13133,single,M,100000.0,5,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,47,1


In [11]:

# Function to standardize 'Home Owner' column values
def fix_home_owner(a):
    a_lower = a.lower().strip()  # Convert to lowercase and strip whitespace
    if a_lower in ["y", "yes", "true", "1"]:
        return 1
    else:
        return 0

data['Home Owner'] = data['Home Owner'].apply(fix_home_owner).astype(int)
data.sample(10) # check the first 10 rows of the data after cleaning



Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
207,28729,single,F,20000.0,0,Partial High School,Manual,1,2,1-2 Miles,Europe,26,1
736,19634,married,M,40000.0,0,High School,Skilled Manual,1,1,5-10 Miles,North America,31,0
338,15926,single,F,120000.0,3,High School,Professional,1,4,5-10 Miles,Europe,50,1
490,11738,married,M,60000.0,4,Bachelors,Professional,1,0,2-5 Miles,North America,46,0
974,17462,married,M,70000.0,3,Graduate Degree,Management,1,2,5-10 Miles,North America,53,1
138,24273,married,F,20000.0,2,Partial High School,Clerical,1,2,5-10 Miles,Pacific,55,1
474,27585,married,F,90000.0,2,Bachelors,Professional,0,0,0-1 Miles,Pacific,36,1
596,20343,married,F,90000.0,4,Partial College,Professional,1,1,1-2 Miles,North America,45,0
578,15313,married,M,60000.0,4,Bachelors,Management,1,2,2-5 Miles,North America,59,0
233,24174,married,M,20000.0,0,Bachelors,Clerical,1,0,0-1 Miles,Pacific,27,1


In [12]:
data.sample(10) # check the first 10 rows of the data after cleaning

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
808,17668,single,M,30000.0,2,High School,Skilled Manual,1,2,1-2 Miles,North America,50,1
119,12871,single,F,30000.0,0,Partial College,Clerical,0,1,2-5 Miles,Europe,29,0
605,17458,single,M,70000.0,3,High School,Professional,1,0,5-10 Miles,North America,52,1
899,28192,married,F,70000.0,5,Graduate Degree,Professional,1,3,10+ Miles,North America,46,0
162,29181,single,F,60000.0,2,Bachelors,Professional,0,1,0-1 Miles,Pacific,38,1
592,18391,single,F,80000.0,5,Partial College,Professional,1,2,5-10 Miles,North America,44,0
1007,20870,single,F,10000.0,2,High School,Manual,1,1,0-1 Miles,Europe,38,1
537,14900,married,F,40000.0,1,Partial College,Clerical,1,1,1-2 Miles,North America,49,1
552,14417,single,M,60000.0,3,High School,Professional,1,2,10+ Miles,North America,54,1
503,20339,married,F,130000.0,1,Bachelors,Management,1,4,2-5 Miles,North America,44,1


In [13]:
# Function to standardize 'Gender' column values
def fix_gender(a):
    a_lower = a.lower().strip()  # Convert to lowercase and strip whitespace
    if a_lower in ["m", "male"]:
        return "male"
    else:
        return "female"

# Applying the fix_gender function to 'Gender' column
data['Gender'] = data['Gender'].apply(fix_gender)

In [14]:
data.sample(5)

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
398,27771,single,male,30000.0,1,Bachelors,Clerical,1,1,1-2 Miles,Europe,39,1
958,21940,married,male,90000.0,5,Graduate Degree,Professional,1,0,0-1 Miles,North America,47,1
279,16390,single,male,30000.0,1,Bachelors,Clerical,0,0,0-1 Miles,Europe,38,1
456,26385,single,male,120000.0,3,High School,Professional,0,4,5-10 Miles,Europe,50,0
170,17203,married,female,130000.0,4,Partial College,Professional,1,4,5-10 Miles,Europe,61,1


In [15]:
# Filling missing values with the mode (most common value) of each column
for col in data.columns:
     data[col] = data[col].fillna(data[col].mode()[0])

# Counting the number of duplicated rows in the dataframe
print("Number of duplicated data:", data.duplicated().sum(), "\n")



Number of duplicated data: 21 



In [16]:


# Removing duplicated rows from the dataframe
data = data.drop_duplicates()

#  information about the dataframe after removing duplicates
print(data.info())




<class 'pandas.core.frame.DataFrame'>
Index: 1005 entries, 0 to 1024
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1005 non-null   int64  
 1   Marital Status    1005 non-null   object 
 2   Gender            1005 non-null   object 
 3    Income           1005 non-null   float64
 4   Children          1005 non-null   int64  
 5   Education         1005 non-null   object 
 6   Occupation        1005 non-null   object 
 7   Home Owner        1005 non-null   int64  
 8   Cars              1005 non-null   int64  
 9   Commute Distance  1005 non-null   object 
 10  Region            1005 non-null   object 
 11  Age               1005 non-null   int64  
 12  Purchased Bike    1005 non-null   int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 109.9+ KB
None


In [17]:
print(data.duplicated().sum()) # check the number of duplicated rows after removing duplicates

0


In [18]:

# 1. What is the average income of the employees  in this dataset?
avg_income = data[' Income '].mean()
print("Average income: ",avg_income)


# 2. Which percentage of employees earn more than $50,000 ?
percentage=(data[' Income '] > 50000).sum() 


print("percentage of employees earn more than $50,000: ",percentage/len(data) *100,"% \n")




# 3. Which percentage of employees have purchased a bike?
percentage=(data['Purchased Bike']==1).sum() / len(data) * 100
print("percentage of employees have purchased a bike: ",percentage,"% \n")


Average income:  56218.90547263682
percentage of employees earn more than $50,000:  52.537313432835816 % 

percentage of employees have purchased a bike:  48.159203980099505 % 



In [19]:

# 4. What is the most common occupation in this dataset?
common_occupation = data['Occupation'].mode()
print("Most common occupation: ",common_occupation,"\n")



# 5. How many employees in this dataset have no children?
num_no_children = (data['Children'] == 0).sum()
print("Number of  with no children:",num_no_children,"\n")



# 6. What is the average number of cars owned by employeesls in this dataset?
avg_num_cars = data['Cars'].mean()
print("Average number of cars: ",avg_num_cars,"\n")


Most common occupation:  0    Professional
Name: Occupation, dtype: object 

Number of  with no children: 282 

Average number of cars:  1.4398009950248756 



In [20]:
data.sample(10) # check the first 10 rows of the data after cleaning

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
389,13122,married,female,80000.0,0,Bachelors,Professional,1,1,1-2 Miles,Pacific,41,1
557,24725,married,female,40000.0,3,Partial College,Clerical,1,0,1-2 Miles,North America,31,0
252,12666,single,male,60000.0,0,Bachelors,Professional,0,4,2-5 Miles,Pacific,31,0
535,23893,married,male,50000.0,3,Bachelors,Skilled Manual,1,3,10+ Miles,North America,41,0
619,15814,single,female,40000.0,0,High School,Skilled Manual,1,1,5-10 Miles,North America,30,0
403,17882,married,male,20000.0,1,Graduate Degree,Clerical,1,0,0-1 Miles,Europe,44,0
655,26236,married,female,40000.0,3,Partial College,Clerical,1,1,0-1 Miles,North America,31,0
394,17793,married,female,40000.0,0,Bachelors,Clerical,1,0,0-1 Miles,Europe,38,1
298,14189,married,female,90000.0,4,High School,Professional,0,2,2-5 Miles,Europe,54,1
262,28468,married,female,10000.0,2,Partial College,Manual,1,0,1-2 Miles,Europe,51,0


In [21]:

# 7. How many employees in this dataset live in the Pacific region?
num_pacific_region = (data['Region'] == 'Pacific').sum()
print("Number of employees live in  Pacific region: ",num_pacific_region,"\n")


# 8. What is the average age  in this dataset?
print("the average age :",data['Age'].mean(),"\n")


a=0

    
for i,n in zip(data["Home Owner"] , data["Commute Distance"]):
    if (n == '5-10 Miles') and (i == 1):
        a+=1

#9. what is the percentage in this dataset who have a commute distance of 5-10 miles and own a home?
print("the percentage in this dataset who have a commute distance of 5-10 miles and own a home: ",a/len(data["Home Owner"])*100,"%\n")


Number of employees live in  Pacific region:  193 

the average age : 44.14029850746269 

the percentage in this dataset who have a commute distance of 5-10 miles and own a home:  15.522388059701491 %



In [22]:

#10. What is the most common commute distance in this dataset?
print("the most common commute distance",data["Commute Distance"].mode().iloc[0],"\n") 


# 11. What is the most common gender in this dataset?

print("the most common commute gender:",data["Gender"].mode().iloc[0],"\n")

#12.what is the average income of male employees
male_income_mean = data[data['Gender']=='male'][' Income '].mean()
print("the average income of male individuals:",male_income_mean,"\n")


the most common commute distance 0-1 Miles 

the most common commute gender: male 

the average income of male individuals: 57957.198443579764 



In [23]:

#13.what is the average income of female employees
female_income_mean = data[data['Gender']=='female'][' Income '].mean()
print("the average income of female individuals:",female_income_mean,"\n")


#14. What is the percentage of male employees in this dataset?
percentage_male = (data['Gender'] =="male").sum() / len(data) * 100
print("the percentage of male :",percentage_male,"% \n")


#15.What is the percentage of female employees in this dataset?
print("the percentage of female :",100-percentage_male,"% \n")

the average income of female individuals: 54399.18533604888 

the percentage of male : 51.14427860696517 % 

the percentage of female : 48.85572139303483 % 



In [24]:

#16.How many employee in this dataset have 2 or more cars and income <50000?
print("employee in this dataset have 2 or more cars and income <50000: ",((data['Cars'] >= 2) & (data[' Income '] < 50000)).sum(),"employee\n")

#17.What is the percentage of indiviuals in this dataset who are home owners and have purchased a bike?
percentage=(((data["Home Owner"]==1)& (data["Purchased Bike"]==1)).sum())/len(data) *100
print("percentage of individuals in this dataset who are home owners and have purchased a bike : ",percentage,"%\n")

#18.What is the highest income in this dataset?
print("highest income: ",data[" Income "].max(),"\n")


#19.How many employee in this dataset have a partial college education?

print("partial college education: ",(data['Education']=="Partial College").sum(),"employee \n")

employee in this dataset have 2 or more cars and income <50000:  177 employee

percentage of individuals in this dataset who are home owners and have purchased a bike :  32.43781094527363 %

highest income:  170000.0 

partial college education:  264 employee 



In [25]:
#20.How many employee in this dataset are over 50 years old?
employe=(data["Age"]>50).sum()
print("employee in this dataset are over 50 years old : ",employe," employee\n")
#21.What is the percentage of male employees in over 50 years old?
employe=((data["Age"]>50) & (data["Gender"]==1)).sum()
print(" number of male employees in over 50 years old: ",employe,"employee \n")

#22.How many employee in this dataset have a skilled manual occupation?
employe=(data["Occupation"]=="Skilled Manual").sum()
print("employee in this dataset have a skilled manual occupation",employe,"employee \n")


#23.what is the ID of all rows with the highest income?
max_income = data[' Income '].max()
id = data.loc[data[' Income '] == max_income, 'ID']
print("id for the highest income: ",list(id),"\n")

employee in this dataset are over 50 years old :  277  employee

 number of male employees in over 50 years old:  0 employee 

employee in this dataset have a skilled manual occupation 255 employee 

id for the highest income:  [11434, 16009] 



In [26]:

#24.How many employee in this dataset have a graduate degree?
num_graduate = ((data['Education'] == 'Graduate') | (data['Education'] == 'Bachelors')).sum()
print("employee in this dataset have a graduate degree: ",num_graduate,"employee \n")
 
#25.what is the avrege income in Europe?
avg=data.loc[data['Region'] == 'Europe', ' Income '].mean()
print("the avrege income in Europe:",avg,"$ \n")


#26.what is the avrege income in pacific?
avg=data.loc[data['Region'] == 'Pacific', ' Income '].mean()
print("the avrege income in pacific:",avg,"$ \n")


employee in this dataset have a graduate degree:  314 employee 

the avrege income in Europe: 40728.47682119205 $ 

the avrege income in pacific: 63575.12953367876 $ 



In [27]:

#27.What is the most common Marital Status in dataset?
status=data['Marital Status'].mode().iloc[0]
print("most common marital status: ",status,"\n")


#28.what is the avrege income for single ?
avg_status=data[data["Marital Status"]=="single"] [" Income "].mean()
print("the avrege income for single:",avg_status,"$\n")



#28.what is the avrege income for married ?
avg_status=data[data["Marital Status"]=="married"] [" Income "].mean()

print("the avrege income for married:",avg_status,"$\n")



most common marital status:  married 

the avrege income for single: 53755.36480686695 $

the avrege income for married: 58348.794063079775 $



In [28]:



# Calculate mean income grouped by number of cars
income_by_cars = data.groupby("Cars")[" Income "].mean()

# Plotting the bar chart
fig = px.bar(x=income_by_cars.index, y=income_by_cars.values, labels={'x': 'Number of Cars', 'y': 'Mean Income'}, title="Mean Income by Number of Cars Owned", color_discrete_sequence=['skyblue'])
fig.show()



income_by_cars = data.groupby("Cars")[' Income '].mean()
fig = px.bar(x = income_by_cars.index , y = income_br_cars.values , labels = { 'x' : 'cars number ' , 'y' : 'mean income' } )

In [29]:
fig2 = px.box(data, x=" Income ", title="Income Distribution", color_discrete_sequence=["blue"])
fig2.update_layout(xaxis_title="Income", yaxis_title="Count")
fig2.show()

In [30]:
# Box plot for age using plotly express
fig = px.box(data, y="Age", title="Age Distribution", color_discrete_sequence=["red"])
fig.update_layout(yaxis_title="Age", xaxis_title="Value")
fig.show()


In [31]:
x = pd.pivot_table(index='Education', columns='Occupation', values='ID', aggfunc='count', data=data)
print(x)
fig = px.imshow(x, labels=dict(x="Occupation", y="Education", color="Count"), title="Education Level by Occupation", text_auto=True)
fig.show()


Occupation           Clerical  Management  Manual  Professional  \
Education                                                         
Bachelors                51.0        98.0     5.0          94.0   
Graduate Degree          23.0        58.0     5.0          45.0   
High School               4.0        11.0    45.0          56.0   
Partial College          77.0         5.0    35.0          79.0   
Partial High School      24.0         NaN    31.0           4.0   

Occupation           Skilled Manual  
Education                            
Bachelors                      66.0  
Graduate Degree                40.0  
High School                    64.0  
Partial College                68.0  
Partial High School            17.0  


In [32]:
# Plotting the count of each occupation
fig = px.bar(data, x="Occupation", title="Count of Each Occupation", color_discrete_sequence=["purple"])
fig.update_layout(xaxis_title="Occupation", yaxis_title="Count")
fig.show()



In [33]:
# Create a heatmap for correlation using plotly express    #### imshow = heatmap
corr = data.corr(numeric_only=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='Viridis', title="Correlation Heatmap")
fig.update_layout(xaxis_title="Features", yaxis_title="Features")
fig.show()


In [34]:
fig = px.scatter(data, x="Age", y=" Income ", title="Scatter Plot of Age vs. Income", labels={"Age": "Age", " Income ": "Income"} , color_discrete_sequence=["green"])
fig.show()


Demographic Distribution

🔹 توزيع العمر، النوع، الحالة الاجتماعية

In [35]:
print("Age Distribution:\n", data['Age'].describe())
print("\nGender Distribution:\n", data['Gender'].value_counts(normalize=True) * 100)
print("\nMarital Status Distribution:\n", data['Marital Status'].value_counts(normalize=True) * 100)



Age Distribution:
 count    1005.000000
mean       44.140299
std        11.371892
min        25.000000
25%        35.000000
50%        43.000000
75%        52.000000
max        89.000000
Name: Age, dtype: float64

Gender Distribution:
 Gender
male      51.144279
female    48.855721
Name: proportion, dtype: float64

Marital Status Distribution:
 Marital Status
married    53.631841
single     46.368159
Name: proportion, dtype: float64


🔹  number of Homeowners vs Non-homeowners:



In [36]:
print("\nHome Owner Count:\n", data['Home Owner'].value_counts())
print("\nHome Owner Percentage:\n", data['Home Owner'].value_counts(normalize=True) * 100)



Home Owner Count:
 Home Owner
1    685
0    320
Name: count, dtype: int64

Home Owner Percentage:
 Home Owner
1    68.159204
0    31.840796
Name: proportion, dtype: float64


🔹 Average income by region:


In [37]:
print("\nAverage Income by Region:\n", data.groupby('Region')[' Income '].mean())


Average Income by Region:
 Region
Europe           40728.476821
North America    62607.843137
Pacific          63575.129534
Name:  Income , dtype: float64


In [38]:
print("\nMost Common Education Level:\n", data['Education'].value_counts().head(1))
print("\nMost Common Occupation:\n", data['Occupation'].value_counts().head(1))
print("\nAverage Number of Cars Owned by Income Level:\n", data.groupby(' Income ')['Cars'].mean())




Most Common Education Level:
 Education
Bachelors    314
Name: count, dtype: int64

Most Common Occupation:
 Occupation
Professional    278
Name: count, dtype: int64

Average Number of Cars Owned by Income Level:
  Income 
10000.0     1.041096
20000.0     1.133333
30000.0     1.156716
40000.0     1.129032
50000.0     0.875000
60000.0     1.263158
70000.0     1.483871
80000.0     1.666667
90000.0     1.589744
100000.0    2.724138
110000.0    3.250000
120000.0    3.166667
130000.0    2.937500
150000.0    3.500000
160000.0    3.000000
170000.0    2.000000
Name: Cars, dtype: float64


🔹 Commute Distance and its variability by region


In [39]:
print("\nMost Common Commute Distance:\n", data['Commute Distance'].value_counts().head(1))
print("\nCommute Distance Distribution by Region:\n", data.groupby('Region')['Commute Distance'].agg(lambda x: x.value_counts().index[0]))
#.agg means to apply a function to each group in region , lambda means to apply a function to each group in region , x.value_counts().index[0] means to get the most common value in each group


Most Common Commute Distance:
 Commute Distance
0-1 Miles    368
Name: count, dtype: int64

Commute Distance Distribution by Region:
 Region
Europe            0-1 Miles
North America     0-1 Miles
Pacific          5-10 Miles
Name: Commute Distance, dtype: object


In [40]:
print("\nIncome Stats:\n", data[' Income '].agg(['mean', 'median', 'std']))
print("\nAge Stats:\n", data['Age'].agg(['mean', 'median', 'std']))
print("\nChildren Stats:\n", data['Children'].agg(['mean', 'median', 'std']))



Income Stats:
 mean      56218.905473
median    60000.000000
std       30806.422999
Name:  Income , dtype: float64

Age Stats:
 mean      44.140299
median    43.000000
std       11.371892
Name: Age, dtype: float64

Children Stats:
 mean      1.893532
median    2.000000
std       1.626354
Name: Children, dtype: float64


🔹 How many have more than three children? And what is their average age?


In [41]:
more_than_three = data[data['Children'] > 3] 
print("\nEmployees with more than 3 children:\n",len(more_than_three))
print("Average age of people with more than 3 children:", more_than_three['Age'].mean())



Employees with more than 3 children:
 207
Average age of people with more than 3 children: 50.47826086956522
