# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [2]:
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user"
users = pd.read_csv(url, sep="|")

In [3]:
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Step 4. Discover what is the mean age per occupation

In [4]:
users.groupby("occupation").agg({"age": "mean"})

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
administrator,38.746835
artist,31.392857
doctor,43.571429
educator,42.010526
engineer,36.38806
entertainment,29.222222
executive,38.71875
healthcare,41.5625
homemaker,32.571429
lawyer,36.75


### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [5]:
users.gender.value_counts()

gender
M    670
F    273
Name: count, dtype: int64

In [6]:
def ratio(ser):
    return (ser.value_counts()[0] / ser.value_counts().sum()) * 100

In [7]:
users.groupby("occupation").agg(male_ratio=("gender", ratio)).sort_values(
    by="male_ratio", ascending=False
)

Unnamed: 0_level_0,male_ratio
occupation,Unnamed: 1_level_1
doctor,100.0
engineer,97.014925
technician,96.296296
retired,92.857143
programmer,90.909091
executive,90.625
scientist,90.322581
entertainment,88.888889
homemaker,85.714286
lawyer,83.333333


In [8]:
users.groupby("occupation").gender.agg(
    lambda grp_: (grp_.value_counts()[0] / grp_.value_counts().sum()) * 100
).sort_values(ascending=False)

occupation
doctor           100.000000
engineer          97.014925
technician        96.296296
retired           92.857143
programmer        90.909091
executive         90.625000
scientist         90.322581
entertainment     88.888889
homemaker         85.714286
lawyer            83.333333
salesman          75.000000
educator          72.631579
student           69.387755
healthcare        68.750000
other             65.714286
marketing         61.538462
writer            57.777778
librarian         56.862745
none              55.555556
administrator     54.430380
artist            53.571429
Name: gender, dtype: float64

### Step 6. For each occupation, calculate the minimum and maximum ages

In [9]:
users.groupby("occupation").age.agg(["min", "max"])

Unnamed: 0_level_0,min,max
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [10]:
users.groupby(["occupation", "gender"]).age.agg("mean")

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
educator       F         39.115385
               M         43.101449
engineer       F         29.500000
               M         36.600000
entertainment  F         31.000000
               M         29.000000
executive      F         44.000000
               M         38.172414
healthcare     F         39.818182
               M         45.400000
homemaker      F         34.166667
               M         23.000000
lawyer         F         39.500000
               M         36.200000
librarian      F         40.000000
               M         40.000000
marketing      F         37.200000
               M         37.875000
none           F         36.500000
               M         18.600000
other          F         35.472222
               M         34.028986
programmer     F         32.16666

### Step 8.  For each occupation present the percentage of women and men

In [11]:
count_df = users.groupby("occupation").agg(count=("gender", "count"))

In [12]:
count_df.head()

Unnamed: 0_level_0,count
occupation,Unnamed: 1_level_1
administrator,79
artist,28
doctor,7
educator,95
engineer,67


In [13]:
gender_count_df = users.groupby("occupation").agg(count=("gender", "value_counts"))

# since the .div() function will align both the indexes and columns we need to have the same column name

In [14]:
gender_count_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
occupation,gender,Unnamed: 2_level_1
administrator,M,43
administrator,F,36
artist,M,15
artist,F,13
doctor,M,7


In [15]:
gender_count_df.assign(
    percentage=gender_count_df.div(count_df, level="occupation") * 100
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percentage
occupation,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
administrator,M,43,54.43038
administrator,F,36,45.56962
artist,M,15,53.571429
artist,F,13,46.428571
doctor,M,7,100.0
educator,M,69,72.631579
educator,F,26,27.368421
engineer,M,65,97.014925
engineer,F,2,2.985075
entertainment,M,16,88.888889
