# Occupation 数据集

### Step 1. 导入库

In [1]:
import numpy as np
import pandas as pd
import ssl
# Globally disable HTTPS certificate verification for this process.
# NOTE(review): this swaps the default HTTPS context factory for the unverified one,
# so HTTPS requests made through the default context are no longer authenticated
# (presumably including the dataset download later in this notebook).
# Prefer fixing the local CA bundle; keep this only as a local workaround.
ssl._create_default_https_context = ssl._create_unverified_context

### Step 2. 导入数据，命名为users，并使用user_id作为索引： [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user)

In [2]:
# Source: pipe-delimited user table from the DAT8 course repository.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'
# index_col makes user_id the row index while parsing, so no separate in-place
# set_index call is needed and `users` is bound exactly once in this cell.
users = pd.read_csv(url, sep='|', index_col='user_id')

### Step 3. 查看数据

In [3]:
# Structural overview of the frame. Only the last expression (dtypes) is rendered
# as the cell output; the trailing comments record what the other expressions return.
users.shape # (943, 4)
users.index # Int64Index([  1,   2,   3, ..., 939, 940, 941, 942, 943], dtype='int64', name='user_id', length=943)
users.columns # Index(['age', 'gender', 'occupation', 'zip_code'], dtype='object') -- user_id is the index, not a column
users.dtypes

age            int64
gender        object
occupation    object
zip_code      object
dtype: object

In [4]:
# Print a concise summary: index range, per-column non-null counts and dtypes, memory usage.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Data columns (total 4 columns):
age           943 non-null int64
gender        943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB


In [5]:
# Show the last 5 rows of the frame.
users.tail(5)

user_id,age,gender,occupation,zip_code
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209
943,22,M,student,77841


In [6]:
# Summary statistics at three scopes; only the last expression is rendered as output.
users.describe()                # numeric columns only (the default)
users.describe(include='all')   # every column, numeric or not
users['occupation'].describe()  # the occupation column on its own

count         943
unique         21
top       student
freq          196
Name: occupation, dtype: object

### Step 4. 一共多少种occupation?

In [48]:
# Number of distinct occupation labels.
users['occupation'].nunique()

21

### Step 5. 出现频率最高的occupation?

In [50]:
# value_counts orders labels by descending frequency, so the first row is the most common occupation.
users['occupation'].value_counts().iloc[:1]

student    196
Name: occupation, dtype: int64

### Step 6. 数据集的平均年龄?

In [51]:
# Mean age over all users.
users['age'].mean()

34.05196182396607

### Step 7. 不同occupation的平均年龄

In [52]:
# Mean age within each occupation group.
users.groupby('occupation')['age'].mean()

occupation
administrator    38.746835
artist           31.392857
doctor           43.571429
educator         42.010526
engineer         36.388060
entertainment    29.222222
executive        38.718750
healthcare       41.562500
homemaker        32.571429
lawyer           36.750000
librarian        40.000000
marketing        37.615385
none             26.555556
other            34.523810
programmer       33.121212
retired          63.071429
salesman         35.666667
scientist        35.548387
student          22.081633
technician       33.148148
writer           36.311111
Name: age, dtype: float64

### Step 8.  不同occupation男性与女性的比例

In [53]:
# Share of each gender within every occupation (normalize=True turns counts into proportions per group).
users.groupby('occupation')['gender'].value_counts(normalize=True)

occupation     gender
administrator  M         0.544304
               F         0.455696
artist         M         0.535714
               F         0.464286
doctor         M         1.000000
educator       M         0.726316
               F         0.273684
engineer       M         0.970149
               F         0.029851
entertainment  M         0.888889
               F         0.111111
executive      M         0.906250
               F         0.093750
healthcare     F         0.687500
               M         0.312500
homemaker      F         0.857143
               M         0.142857
lawyer         M         0.833333
               F         0.166667
librarian      F         0.568627
               M         0.431373
marketing      M         0.615385
               F         0.384615
none           M         0.555556
               F         0.444444
other          M         0.657143
               F         0.342857
programmer     M         0.909091
               F         0.090909

### Step 9. 不同occupation男性的比例，并降序排列

In [10]:
# Head-count for every (occupation, gender) pair.
occupation_gender_size = users.groupby(['occupation', 'gender']).size()
# Head-count for every occupation.
occupation_size = users.groupby('occupation').size()
# Share of each gender inside its occupation (pair count divided by occupation count).
occupation_gender_rate = occupation_gender_size / occupation_size
# Take the male cross-section of the (occupation, gender) index and list the five
# occupations with the highest male share.
occupation_gender_rate.xs('M', level='gender').sort_values(ascending=False).head()

occupation
doctor        1.000000
engineer      0.970149
technician    0.962963
retired       0.928571
programmer    0.909091
dtype: float64

In [11]:
# Same ratio, obtained directly: normalized value counts of gender inside each occupation.
occupation_gender_rate = users.groupby('occupation')['gender'].value_counts(normalize=True)
# Male cross-section, highest male share first, top five.
occupation_gender_rate.xs('M', level='gender').sort_values(ascending=False).head()

occupation
doctor        1.000000
engineer      0.970149
technician    0.962963
retired       0.928571
programmer    0.909091
Name: gender, dtype: float64

In [12]:
# Encode gender as 1 for 'M' and 0 otherwise, so a per-group mean equals the male share.
# NOTE: this adds a 'gender_int' column to `users` in place; outputs of cells that ran
# earlier show the frame without it.
users['gender_int'] = np.where(users.gender=='M', 1, 0)
# The mean of a 0/1 column is sum / group size, so one groupby replaces the original two.
result = users.groupby('occupation')['gender_int'].mean()
# Five occupations with the highest male share.
result.sort_values(ascending=False).head()

occupation
doctor        1.000000
engineer      0.970149
technician    0.962963
retired       0.928571
programmer    0.909091
Name: gender_int, dtype: float64

### Step 10. 不同occupation年龄的最大值和最小值


In [13]:
# Youngest and oldest age per occupation (first five occupations shown).
# The string aliases 'min'/'max' select the built-in group-wise reductions; passing the
# NumPy callables np.min/np.max to agg is deprecated and emits a FutureWarning on recent pandas.
users.groupby('occupation')['age'].agg([('min_age', 'min'), ('max_age', 'max')]).head()

occupation,min_age,max_age
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70


### Step 11. 不同occupation、gender的平均年龄

In [14]:
# Mean age for every (occupation, gender) pair; only the first five rows are shown.
users.groupby(['occupation', 'gender'])['age'].mean().head()

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
Name: age, dtype: float64