# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [None]:
users = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user', sep='|')
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [None]:
users.dtypes

user_id        int64
age            int64
gender        object
occupation    object
zip_code      object
dtype: object

### Step 4. Discover what is the mean age per occupation

In [None]:
users.groupby('occupation').age.mean().sort_values().round(1)

occupation
student          22.1
none             26.6
entertainment    29.2
artist           31.4
homemaker        32.6
programmer       33.1
technician       33.1
other            34.5
scientist        35.5
salesman         35.7
writer           36.3
engineer         36.4
lawyer           36.8
marketing        37.6
executive        38.7
administrator    38.7
librarian        40.0
healthcare       41.6
educator         42.0
doctor           43.6
retired          63.1
Name: age, dtype: float64

### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [None]:
users.assign(
    male = lambda x: x.gender == 'M'
).groupby('occupation')\
.male.mean()\
.sort_values(ascending = False).round(2)

occupation
doctor           1.00
engineer         0.97
technician       0.96
retired          0.93
programmer       0.91
executive        0.91
scientist        0.90
entertainment    0.89
lawyer           0.83
salesman         0.75
educator         0.73
student          0.69
other            0.66
marketing        0.62
writer           0.58
none             0.56
administrator    0.54
artist           0.54
librarian        0.43
healthcare       0.31
homemaker        0.14
Name: male, dtype: float64

In [None]:
# my attempt at their solution

def gender_to_numeric(aseries):
  if aseries == 'M':
    return 1
  else:
    return 0

users_temp = users # copy existing df
users_temp['gender_n'] = users['gender'].apply(gender_to_numeric) # add numeric column for gender
users_temp.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,gender_n
0,1,24,M,technician,85711,1
1,2,53,F,other,94043,0
2,3,23,M,writer,32067,1
3,4,24,M,technician,43537,1
4,5,33,F,other,15213,0


In [None]:
users_temp.groupby('occupation')['gender_n'].mean().sort_values(ascending=False).round(2)

occupation
doctor           1.00
engineer         0.97
technician       0.96
retired          0.93
programmer       0.91
executive        0.91
scientist        0.90
entertainment    0.89
lawyer           0.83
salesman         0.75
educator         0.73
student          0.69
other            0.66
marketing        0.62
writer           0.58
none             0.56
administrator    0.54
artist           0.54
librarian        0.43
healthcare       0.31
homemaker        0.14
Name: gender_n, dtype: float64

### Step 6. For each occupation, calculate the minimum and maximum ages

In [None]:
users.groupby('occupation')['age'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [None]:
users.groupby(['occupation', 'gender'])['age'].mean().round(1)

occupation     gender
administrator  F         40.6
               M         37.2
artist         F         30.3
               M         32.3
doctor         M         43.6
educator       F         39.1
               M         43.1
engineer       F         29.5
               M         36.6
entertainment  F         31.0
               M         29.0
executive      F         44.0
               M         38.2
healthcare     F         39.8
               M         45.4
homemaker      F         34.2
               M         23.0
lawyer         F         39.5
               M         36.2
librarian      F         40.0
               M         40.0
marketing      F         37.2
               M         37.9
none           F         36.5
               M         18.6
other          F         35.5
               M         34.0
programmer     F         32.2
               M         33.2
retired        F         70.0
               M         62.5
salesman       F         27.0
               M  

### Step 8.  For each occupation present the percentage of women and men

In [None]:
users.assign(
   male = users.gender == 'M',
   female = users.gender == 'F' 
) \
.groupby('occupation') \
[['male', 'female']] \
.mean().round(2)

Unnamed: 0_level_0,male,female
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,0.54,0.46
artist,0.54,0.46
doctor,1.0,0.0
educator,0.73,0.27
engineer,0.97,0.03
entertainment,0.89,0.11
executive,0.91,0.09
healthcare,0.31,0.69
homemaker,0.14,0.86
lawyer,0.83,0.17


In [None]:
# Their approach
# create a data frame and apply count to gender
gender_ocup = users.groupby(['occupation', 'gender']).agg({'gender': 'count'})
gender_ocup.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,gender
occupation,gender,Unnamed: 2_level_1
administrator,F,36
administrator,M,43
artist,F,13
artist,M,15
doctor,M,7
educator,F,26
educator,M,69
engineer,F,2
engineer,M,65
entertainment,F,2


In [None]:
# create a DataFrame and apply count for each occupation
occup_count = users.groupby(['occupation']).agg('count')
occup_count.head(10)

Unnamed: 0_level_0,user_id,age,gender,zip_code,gender_n
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
administrator,79,79,79,79,79
artist,28,28,28,28,28
doctor,7,7,7,7,7
educator,95,95,95,95,95
engineer,67,67,67,67,67
entertainment,18,18,18,18,18
executive,32,32,32,32,32
healthcare,16,16,16,16,16
homemaker,7,7,7,7,7
lawyer,12,12,12,12,12


In [None]:
# divide the gender_ocup per the occup_count and multiply per 100
occup_gender = gender_ocup.div(occup_count, level = "occupation") * 100
occup_gender.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,gender,gender_n,user_id,zip_code
occupation,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
administrator,F,,45.56962,,,
administrator,M,,54.43038,,,
artist,F,,46.428571,,,
artist,M,,53.571429,,,
doctor,M,,100.0,,,


In [None]:
# present all rows from the 'gender column'
occup_gender.loc[: , 'gender']

occupation     gender
administrator  F          45.569620
               M          54.430380
artist         F          46.428571
               M          53.571429
doctor         M         100.000000
educator       F          27.368421
               M          72.631579
engineer       F           2.985075
               M          97.014925
entertainment  F          11.111111
               M          88.888889
executive      F           9.375000
               M          90.625000
healthcare     F          68.750000
               M          31.250000
homemaker      F          85.714286
               M          14.285714
lawyer         F          16.666667
               M          83.333333
librarian      F          56.862745
               M          43.137255
marketing      F          38.461538
               M          61.538462
none           F          44.444444
               M          55.555556
other          F          34.285714
               M          65.714286
progra