# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [3]:
# Location of the pipe-delimited user table.
USERS_URL = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'

# Step 3 asks for the dataset itself to live in `users`, so bind the loaded
# DataFrame (not the URL string) to that name.
users = pd.read_csv(USERS_URL, sep='|')

# The remaining cells refer to the table as `df`; keep that name as an alias
# of the same object so they continue to work unchanged.
df = users

print(df.head(25))

    user_id  age gender     occupation zip_code
0         1   24      M     technician    85711
1         2   53      F          other    94043
2         3   23      M         writer    32067
3         4   24      M     technician    43537
4         5   33      F          other    15213
5         6   42      M      executive    98101
6         7   57      M  administrator    91344
7         8   36      M  administrator    05201
8         9   29      M        student    01002
9        10   53      M         lawyer    90703
10       11   39      F          other    30329
11       12   28      F          other    06405
12       13   47      M       educator    29206
13       14   45      M      scientist    55106
14       15   49      F       educator    97301
15       16   21      M  entertainment    10309
16       17   30      M     programmer    06355
17       18   35      F          other    37212
18       19   40      M      librarian    02138
19       20   42      F      homemaker  

### Step 4. Discover the mean age per occupation

In [4]:
# Group the ages by occupation, then average each group and keep two
# decimals so the printed table stays compact.
ages_by_occupation = df.groupby('occupation')['age']
mean_age_per_occupation = ages_by_occupation.mean().round(2)

print(mean_age_per_occupation)

occupation
administrator    38.75
artist           31.39
doctor           43.57
educator         42.01
engineer         36.39
entertainment    29.22
executive        38.72
healthcare       41.56
homemaker        32.57
lawyer           36.75
librarian        40.00
marketing        37.62
none             26.56
other            34.52
programmer       33.12
retired          63.07
salesman         35.67
scientist        35.55
student          22.08
technician       33.15
writer           36.31
Name: age, dtype: float64


### Step 5. Discover the male ratio per occupation and sort it from highest to lowest

In [10]:
# Number of users per occupation, one column per gender; an occupation with
# no users of a given gender gets 0 instead of NaN.
gender_counts = df.groupby(['occupation', 'gender']).size().unstack(fill_value=0)

# Fraction of men among each occupation's users. This is kept as its own
# Series rather than written back into `gender_counts`, so that table remains
# a pure F/M counts table for any later cell that reuses it.
male_ratio = (
    gender_counts['M'] / (gender_counts['M'] + gender_counts['F'])
).rename('male_ratio')

# Highest male share first.
male_ratio_sorted = male_ratio.sort_values(ascending=False)

# Flatten to a two-column frame with presentation-friendly headers.
male_ratio_df = male_ratio_sorted.rename_axis('Occupation').reset_index(name='Male Ratio')

print(male_ratio_df)

       Occupation  Male Ratio
0          doctor    1.000000
1        engineer    0.970149
2      technician    0.962963
3         retired    0.928571
4      programmer    0.909091
5       executive    0.906250
6       scientist    0.903226
7   entertainment    0.888889
8          lawyer    0.833333
9        salesman    0.750000
10       educator    0.726316
11        student    0.693878
12          other    0.657143
13      marketing    0.615385
14         writer    0.577778
15           none    0.555556
16  administrator    0.544304
17         artist    0.535714
18      librarian    0.431373
19     healthcare    0.312500
20      homemaker    0.142857


### Step 6. For each occupation, calculate the minimum and maximum ages

In [11]:
# Youngest and oldest user in each occupation, built as a single chain:
# aggregate, relabel the columns and index for display, then move the
# occupation out of the index into a regular column.
age_ranges = (
    df.groupby('occupation')['age']
    .agg(['min', 'max'])
    .rename(columns={'min': 'Minimum Age', 'max': 'Maximum Age'})
    .rename_axis('Occupation')
    .reset_index()
)

print(age_ranges)

       Occupation  Minimum Age  Maximum Age
0   administrator           21           70
1          artist           19           48
2          doctor           28           64
3        educator           23           63
4        engineer           22           70
5   entertainment           15           50
6       executive           22           69
7      healthcare           22           62
8       homemaker           20           50
9          lawyer           21           53
10      librarian           23           69
11      marketing           24           55
12           none           11           55
13          other           13           64
14     programmer           20           63
15        retired           51           73
16       salesman           18           66
17      scientist           23           55
18        student            7           42
19     technician           21           55
20         writer           18           60


### Step 7. For each combination of occupation and gender, calculate the mean age

In [13]:
# Ages grouped by every (occupation, gender) pair present in the data.
ages_by_occupation_gender = df.groupby(['occupation', 'gender'])['age']

# Mean age per pair, rounded to two decimals.
mean_age = ages_by_occupation_gender.mean().round(2)

# Turn the two index levels into ordinary columns with display headers.
mean_age_df = (
    mean_age
    .rename_axis(['Occupation', 'Gender'])
    .reset_index(name='Mean Age')
)

print(mean_age_df)

       Occupation Gender  Mean Age
0   administrator      F     40.64
1   administrator      M     37.16
2          artist      F     30.31
3          artist      M     32.33
4          doctor      M     43.57
5        educator      F     39.12
6        educator      M     43.10
7        engineer      F     29.50
8        engineer      M     36.60
9   entertainment      F     31.00
10  entertainment      M     29.00
11      executive      F     44.00
12      executive      M     38.17
13     healthcare      F     39.82
14     healthcare      M     45.40
15      homemaker      F     34.17
16      homemaker      M     23.00
17         lawyer      F     39.50
18         lawyer      M     36.20
19      librarian      F     40.00
20      librarian      M     40.00
21      marketing      F     37.20
22      marketing      M     37.88
23           none      F     36.50
24           none      M     18.60
25          other      F     35.47
26          other      M     34.03
27     programmer   

### Step 8. For each occupation, present the percentage of women and men

In [16]:
# Users per occupation and gender. Computed inside this cell so it runs on a
# fresh kernel and is unaffected by columns other cells may have added to
# their own count tables; selecting ['F', 'M'] fixes the column order that
# the labels below rely on.
occupation_gender_counts = (
    df.groupby(['occupation', 'gender'])
    .size()
    .unstack(fill_value=0)[['F', 'M']]
)

# Each gender's share of its occupation, as a percentage with one decimal.
# Dividing along axis=0 aligns the per-occupation totals with the rows.
gender_pcts = (
    occupation_gender_counts
    .div(occupation_gender_counts.sum(axis=1), axis=0)
    .mul(100)
    .round(1)
)

# Counts and percentages side by side, one row per occupation.
result = pd.concat([occupation_gender_counts, gender_pcts], axis=1)
result.columns = ['F_count', 'M_count', '% Female', '% Male']
result = result.reset_index()
print(result)

       occupation  F_count  M_count  % Female  % Male
0   administrator       36       43      45.6    54.4
1          artist       13       15      46.4    53.6
2          doctor        0        7       0.0   100.0
3        educator       26       69      27.4    72.6
4        engineer        2       65       3.0    97.0
5   entertainment        2       16      11.1    88.9
6       executive        3       29       9.4    90.6
7      healthcare       11        5      68.8    31.2
8       homemaker        6        1      85.7    14.3
9          lawyer        2       10      16.7    83.3
10      librarian       29       22      56.9    43.1
11      marketing       10       16      38.5    61.5
12           none        4        5      44.4    55.6
13          other       36       69      34.3    65.7
14     programmer        6       60       9.1    90.9
15        retired        1       13       7.1    92.9
16       salesman        3        9      25.0    75.0
17      scientist        3  