# 3.5Pandas之数据分析

## 1. 数据的导入导出

In [1]:
# 数据的导入
import pandas as pd
df = pd.read_csv('data/employees.csv')
print(type(df))
print(df.tail())
print(df.salary.mean())
# 数据的导出
df = df.tail()
df.to_csv('data/new.csv')

<class 'pandas.core.frame.DataFrame'>
     employee_id first_name last_name     email  phone_number      job_id  \
102          202        Pat       Fay      PFAY  603.123.6666      MK_REP   
103          203      Susan    Mavris   SMAVRIS  515.123.7777      HR_REP   
104          204    Hermann      Baer     HBAER  515.123.8888      PR_REP   
105          205    Shelley   Higgins  SHIGGINS  515.123.8080      AC_MGR   
106          206    William     Gietz    WGIETZ  515.123.8181  AC_ACCOUNT   

      salary  commission_pct  manager_id  department_id  
102   6000.0             NaN       201.0           20.0  
103   6500.0             NaN       101.0           40.0  
104  10000.0             NaN       101.0           70.0  
105  12000.0             NaN       101.0          110.0  
106   8300.0             NaN       205.0          110.0  
6461.682242990654


In [2]:
# json
df = pd.read_json('data/data1.json')
print(type(df))

import json
with open('data/test.json') as f:
    data = json.load(f)
# print(data['users'])
print(type(data))
df = pd.DataFrame(data['users'])
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>
<class 'dict'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,name,age,email,is_active,join_date
0,1,张三,28,zhangsan@example.com,True,2022-03-15
1,2,李四,35,lisi@example.com,False,2021-11-02
2,3,王五,24,wangwu@example.com,True,2023-01-20


## 2. 数据的缺失值处理

In [4]:
# 缺失值的处理
# nan:not a number
import pandas as pd
import numpy as np
s = pd.Series([12,25,np.nan, None, pd.NA])
df = pd.DataFrame([[1,pd.NA,2],[2,3,5],[None,4,6]],columns=['第1列','第2列','第3列'])
print(s)
# 查看是否是缺失值
print(s.isna())
print(s.isnull())
print(df.isna())
print(df.isnull())
print(df.isna().sum(axis=1))
print(s.isna().sum()) #查看缺失值的个数

# 剔除缺失值
print(s.dropna())
print('-'*30)
print(df)
print(df.dropna()) #剔除一整条的记录
print(df.dropna(how='all')) #如果所有的值都是缺失值，删除这一行
print(df.dropna(thresh=1)) #如果至少有n个值不是缺失值，就保留
print(df.dropna(axis=1)) #剔除一整列的记录
print(df.dropna(subset=['第1列'])) #如果某列有缺失值，则删除这一行

# 填充缺失值
df = pd.read_csv('data/weather_withna.csv')
df.tail()
df.isna().sum(axis=0)
df.head()
print(df.fillna({'temp_max':20,'wind':2.5}).tail()) #使用字典来填充
print(df.fillna(df[['temp_max','wind']].mean()).tail()) #使用统计值来填充
print(df.ffill().tail())#用前面的相邻值填充
print(df.bfill().tail())#用后面的相邻值填充

0      12
1      25
2     NaN
3    None
4    <NA>
dtype: object
0    False
1    False
2     True
3     True
4     True
dtype: bool
0    False
1    False
2     True
3     True
4     True
dtype: bool
     第1列    第2列    第3列
0  False   True  False
1  False  False  False
2   True  False  False
     第1列    第2列    第3列
0  False   True  False
1  False  False  False
2   True  False  False
0    1
1    0
2    1
dtype: int64
3
0    12
1    25
dtype: object
------------------------------
   第1列   第2列  第3列
0  1.0  <NA>    2
1  2.0     3    5
2  NaN     4    6
   第1列 第2列  第3列
1  2.0   3    5
   第1列   第2列  第3列
0  1.0  <NA>    2
1  2.0     3    5
2  NaN     4    6
   第1列   第2列  第3列
0  1.0  <NA>    2
1  2.0     3    5
2  NaN     4    6
   第3列
0    2
1    5
2    6
   第1列   第2列  第3列
0  1.0  <NA>    2
1  2.0     3    5
            date  precipitation  temp_max  temp_min  wind weather
1456  2015-12-27            NaN      20.0       NaN   2.5     NaN
1457  2015-12-28            NaN      20.0       NaN   2.5  

## 3.时间数据的处理

In [19]:
# 时间数据的处理
import pandas as pd
d = pd.Timestamp('2015-02-28 10:22')
d1 = pd.Timestamp('2015-02-28 13:22')
print(d)
print(type(d))
print("年：",d.year)
print("月：",d.month)
print("日：",d.day)
print(d.hour, d.minute, d.second)
print("季度：",d.quarter)
print("是否是月底：",d.is_month_end)
# 方法
print("星期几：",d.day_name())
print("转换为天：",d.to_period("D"))
print("转换为季度：",d1.to_period("Q"))
print("转换为年度：",d1.to_period("Y"))
print("转换为月度：",d1.to_period("M"))
print("转换为周维度：",d1.to_period("W"))

2015-02-28 10:22:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
年： 2015
月： 2
日： 28
10 22 0
季度： 1
是否是月底： True
星期几： Saturday
转换为天： 2015-02-28
转换为季度： 2015Q1
转换为年度： 2015
转换为月度： 2015-02
转换为周维度： 2015-02-23/2015-03-01


In [20]:
# 字符串转换为日期类型
a = pd.to_datetime('20150228')
print(a)
print(type(a))
print(a.day_name())

# dataFrame 日期转换
df = pd.DataFrame({
    'sales':[100,200,300],
    'date':['20250601','20250602','20250603']
})
df['datetime'] = pd.to_datetime(df['date'])
df
print(df.info())
print(type(df['datetime']))
df['week']=df['datetime'].dt.day_name()
df['datetime'].dt.year



2015-02-28 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
Saturday
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   sales     3 non-null      int64         
 1   date      3 non-null      object        
 2   datetime  3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 204.0+ bytes
None
<class 'pandas.core.series.Series'>


0    2025
1    2025
2    2025
Name: datetime, dtype: int32

In [22]:
# csv 日期转换
df = pd.read_csv('data/weather.csv',parse_dates=['date'])
df.info()
df['date'].dt.day_name()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1461 non-null   datetime64[ns]
 1   precipitation  1461 non-null   float64       
 2   temp_max       1461 non-null   float64       
 3   temp_min       1461 non-null   float64       
 4   wind           1461 non-null   float64       
 5   weather        1461 non-null   object        
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 68.6+ KB


0          Sunday
1          Monday
2         Tuesday
3       Wednesday
4        Thursday
          ...    
1456       Sunday
1457       Monday
1458      Tuesday
1459    Wednesday
1460     Thursday
Name: date, Length: 1461, dtype: object

In [23]:
# 日期数据作为索引
# df.set_index('date' , inplace=True)#设置原来的df的索引
print(df.loc["2013-01":"2013-02"])

Empty DataFrame
Columns: [date, precipitation, temp_max, temp_min, wind, weather]
Index: []


In [24]:
# 时间间隔
d1 = pd.Timestamp('2013-01-15')
d2 = pd.Timestamp('2023-02-23')
d3 = d2-d1
print(type(d3))
print(d3)

<class 'pandas._libs.tslibs.timedeltas.Timedelta'>
3691 days 00:00:00


In [25]:
df = pd.read_csv('data/weather.csv',parse_dates=['date'])
df.info()
df['delta'] = df['date'] - df['date'][0]
df.set_index('delta',inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1461 non-null   datetime64[ns]
 1   precipitation  1461 non-null   float64       
 2   temp_max       1461 non-null   float64       
 3   temp_min       1461 non-null   float64       
 4   wind           1461 non-null   float64       
 5   weather        1461 non-null   object        
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 68.6+ KB


In [26]:
df
print(df.loc['10 days':'20 days'])

              date  precipitation  temp_max  temp_min  wind weather
delta                                                              
10 days 2012-01-11            0.0       6.1      -1.1   5.1     sun
11 days 2012-01-12            0.0       6.1      -1.7   1.9     sun
12 days 2012-01-13            0.0       5.0      -2.8   1.3     sun
13 days 2012-01-14            4.1       4.4       0.6   5.3    snow
14 days 2012-01-15            5.3       1.1      -3.3   3.2    snow
15 days 2012-01-16            2.5       1.7      -2.8   5.0    snow
16 days 2012-01-17            8.1       3.3       0.0   5.6    snow
17 days 2012-01-18           19.8       0.0      -2.8   5.0    snow
18 days 2012-01-19           15.2      -1.1      -2.8   1.6    snow
19 days 2012-01-20           13.5       7.2      -1.1   2.3    snow
20 days 2012-01-21            3.0       8.3       3.3   8.2    rain


In [27]:
days = pd.date_range("2025-07-03","2026-02-09",freq="W")
days = pd.date_range("2025-07-03",periods=10,freq="QE")
print(days)

DatetimeIndex(['2025-09-30', '2025-12-31', '2026-03-31', '2026-06-30',
               '2026-09-30', '2026-12-31', '2027-03-31', '2027-06-30',
               '2027-09-30', '2027-12-31'],
              dtype='datetime64[ns]', freq='QE-DEC')


In [28]:
df = pd.read_csv('data/weather.csv',parse_dates=['date'])
# 重新采样
df.set_index('date',inplace=True)

In [29]:
df[ ["temp_max","temp_min"]].resample("MS").mean()

Unnamed: 0_level_0,temp_max,temp_min
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01,7.054839,1.541935
2012-02-01,9.275862,3.203448
2012-03-01,9.554839,2.83871
2012-04-01,14.873333,5.993333
2012-05-01,17.66129,8.190323
2012-06-01,18.693333,10.48
2012-07-01,22.906452,12.932258
2012-08-01,25.858065,14.009677
2012-09-01,22.88,11.243333
2012-10-01,15.829032,8.380645


In [30]:
df[ ["temp_max","temp_min"]].resample("YE").mean()

Unnamed: 0_level_0,temp_max,temp_min
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-12-31,15.276776,7.289617
2013-12-31,16.058904,8.153973
2014-12-31,16.99589,8.662466
2015-12-31,17.427945,8.835616


In [31]:
import pandas as pd
data = {
    "name":['alice','alice','bob','alice','jack','bob'],
    "age":[26,25,30,25,35,30],
    'city':['NY','NY','LA','NY','SF','LA']
}
df = pd.DataFrame(data)

In [32]:
df.duplicated() #一整条记录都是一样的，标记为重复，返回True
df.drop_duplicates(subset=['name']) #根据指定列去重
df.drop_duplicates(subset=['name'],keep='last') #保留最后一次出现的行

Unnamed: 0,name,age,city
3,alice,25,NY
4,jack,35,SF
5,bob,30,LA


## 4.数据类型的转换

In [5]:
# 数据类型的转换
df = pd.read_csv('data/sleep.csv')
df.dtypes

person_id                    int64
gender                      object
age                          int64
occupation                  object
sleep_duration             float64
sleep_quality              float64
physical_activity_level      int64
stress_level                 int64
bmi_category                object
blood_pressure              object
heart_rate                   int64
daily_steps                  int64
sleep_disorder              object
dtype: object

In [6]:
df['age'] = df['age'].astype('int16')

In [7]:
df['gender'] = df['gender'].astype('category')

In [8]:
df.gender

0        Male
1      Female
2        Male
3        Male
4        Male
        ...  
395    Female
396    Female
397    Female
398    Female
399      Male
Name: gender, Length: 400, dtype: category
Categories (2, object): ['Female', 'Male']

In [9]:
df['is_male'] = df['gender'].map({'Female':True,'Male':False})

In [10]:
df.is_male

0      False
1       True
2      False
3      False
4      False
       ...  
395     True
396     True
397     True
398     True
399    False
Name: is_male, Length: 400, dtype: category
Categories (2, bool): [True, False]

## 5.数据变形

In [11]:
#数据变形
import pandas as pd
data = {
    'ID': [1, 2],
    'name':['alice','bob'],
    'Math': [90, 85],
    'English': [88, 92],
    'Science': [95, 89]
}
df = pd.DataFrame(data)
print(df)
df.T   #行列转置
# 宽表转换成长表
df2 = pd.melt(df,id_vars=['ID','name'],var_name='科目',value_name='分数')
df2.sort_values('name')
print(df2)
# 长表转宽表
pd.pivot(df2,index=['ID','name'],columns='科目',values='分数')

   ID   name  Math  English  Science
0   1  alice    90       88       95
1   2    bob    85       92       89
   ID   name       科目  分数
0   1  alice     Math  90
1   2    bob     Math  85
2   1  alice  English  88
3   2    bob  English  92
4   1  alice  Science  95
5   2    bob  Science  89


Unnamed: 0_level_0,科目,English,Math,Science
ID,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,alice,88,90,95
2,bob,92,85,89


In [12]:
data = {
    'ID': [1, 2],
    'name':['alice smith','bob smith'],
    'Math': [90, 85],
    'English': [88, 92],
    'Science': [95, 89]
}
df = pd.DataFrame(data)
# 分列
df[['first','last']]  = df['name'].str.split(" ",expand=True)
df = pd.read_csv('data/sleep.csv')
df = df[['person_id','blood_pressure']]
df[['high','low']] = df['blood_pressure'].str.split('/',expand=True)
df['high']=df['high'].astype('int64')
df['low']=df['low'].astype('int64')
df.info()
df.high.mean()
df.low.mean()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   person_id       400 non-null    int64 
 1   blood_pressure  400 non-null    object
 2   high            400 non-null    int64 
 3   low             400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


73.04

## 6.数据分箱

In [13]:
# 数据分箱 pd.cut(x,bins,labels)
import pandas as pd
df = pd.read_csv('data/employees.csv')
df.head(10)

Unnamed: 0,employee_id,first_name,last_name,email,phone_number,job_id,salary,commission_pct,manager_id,department_id
0,100,Steven,King,SKING,515.123.4567,AD_PRES,24000.0,,,90.0
1,101,N_ann,Kochhar,NKOCHHAR,515.123.4568,AD_VP,17000.0,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,AD_VP,17000.0,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,IT_PROG,9000.0,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,IT_PROG,6000.0,,103.0,60.0
5,105,David,Austin,DAUSTIN,590.423.4569,IT_PROG,4800.0,,103.0,60.0
6,106,Valli,Pataballa,VPATABAL,590.423.4560,IT_PROG,4800.0,,103.0,60.0
7,107,Diana,Lorentz,DLORENTZ,590.423.5567,IT_PROG,4200.0,,103.0,60.0
8,108,Nancy,Greenberg,NGREENBE,515.124.4569,FI_MGR,12000.0,,101.0,100.0
9,109,Daniel,Faviet,DFAVIET,515.124.4169,FI_ACCOUNT,9000.0,,108.0,100.0


In [14]:
df1 = df.head(10)[['employee_id','salary']]
df1

Unnamed: 0,employee_id,salary
0,100,24000.0
1,101,17000.0
2,102,17000.0
3,103,9000.0
4,104,6000.0
5,105,4800.0
6,106,4800.0
7,107,4200.0
8,108,12000.0
9,109,9000.0


In [15]:
pd.cut(df1['salary'],bins=3) #bins=n，分成n段区间，起始值、结束值是所有数据的最小值、最大值
#4180~14100~24000
pd.cut(df1['salary'],bins=3).value_counts()
pd.cut(df1['salary'],bins=[0,10000,20000,30000])#bins=list，分成n段区间
pd.cut(df1['salary'],bins=[0,10000,20000,30000]).value_counts()
df1['收入范围'] =pd.cut(df1['salary'],bins=[0,10000,20000,30000],labels=['低','中','高'])#bins=list，分成n段区间
pd.qcut(df1['salary'],3).value_counts()

salary
(12000.0, 24000.0]    4
(4199.999, 6000.0]    3
(6000.0, 12000.0]     3
Name: count, dtype: int64

In [16]:
# 睡眠数据
df = pd.read_csv('data/sleep.csv')
df1 = df.head(10)[['person_id','sleep_quality']]
df1
df['睡眠质量'] = pd.cut(df['sleep_quality'],bins=3,labels=['差','中','优'])
df['睡眠质量'].value_counts()
df.head(10)
df['gender']=df['gender'].astype('category')
df['gender'].value_counts()
# 字符串-->类别-->统计
# 数值-->分箱-->统计
print(df['gender'].dtype)
print(df['睡眠质量'].dtype)

category
category


In [17]:
# df.rename()   df.set_index()  df.reset_index()
df = pd.DataFrame({
    'name':['jack','alice','tom','bob'],
    'age':[20,30,40,50],
    'gender':['female','male','female','male']
})
df.set_index("name",inplace=True)
df.reset_index(inplace=True)
df.rename(columns={"age":"年龄"},index={0:4})

Unnamed: 0,name,年龄,gender
4,jack,20,female
1,alice,30,male
2,tom,40,female
3,bob,50,male


In [18]:
df.index=[1,2,3,4]
df.columns=["姓名",'年龄',"性别"]
df

Unnamed: 0,姓名,年龄,性别
1,jack,20,female
2,alice,30,male
3,tom,40,female
4,bob,50,male


## 7.分组聚合

In [33]:
# 分组聚合
# df.groupby('分组的字段')['聚合的字段'].聚合函数()
import pandas as pd
df = pd.read_csv('data/employees.csv')
df = df.dropna(subset=['department_id'])
df['department_id'] = df['department_id'].astype('int64')
# 计算不同部门的平均薪资
df.groupby('department_id').groups #查看分组
df.groupby('department_id').get_group(20) #查看具体的某个分组数据
df2 = df.groupby('department_id')[['salary']].mean()
df2['salary'] = df2['salary'].round(2)
df2=df2.reset_index()
df2.sort_values('salary',ascending=False)

Unnamed: 0,department_id,salary
8,90,19333.33
10,110,10150.0
6,70,10000.0
1,20,9500.0
7,80,8955.88
9,100,8600.0
3,40,6500.0
5,60,5760.0
0,10,4400.0
2,30,4150.0


In [34]:
# 计算不同部门不同岗位的人的平均薪资
df2=df.groupby(['department_id','job_id'])[['salary']].mean()
df2=df2.reset_index()
df2['salary'] = df2['salary'].round(1)
df2.sort_values('salary',ascending=False)

Unnamed: 0,department_id,job_id,salary
13,90,AD_PRES,24000.0
14,90,AD_VP,17000.0
1,20,MK_MAN,13000.0
11,80,SA_MAN,12200.0
18,110,AC_MGR,12000.0
16,100,FI_MGR,12000.0
4,30,PU_MAN,11000.0
10,70,PR_REP,10000.0
12,80,SA_REP,8396.6
17,110,AC_ACCOUNT,8300.0


# 3.6数据分析案例

## 1. 企鹅数据分析

In [35]:
# 企鹅数据分析
# 1. 导入必要的库
import pandas as pd
import numpy as np
# 2. 导入数据 喙
df = pd.read_csv('data/penguins.csv')
df.head(5)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [36]:
# 3. 数据清洗
# 缺失值的检查
print(df.isna().sum())
df.dropna(inplace=True)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


In [37]:
# 4. 数据特征的构造
df['sex'] = df['sex'].astype('category')
df['bill_ratio'] = df['bill_length_mm']/df['bill_depth_mm']
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill_ratio
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,2.090909
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,2.270115
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,2.238889
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1.901554
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,1.907767


In [38]:
# 5. 数据分析
# 数据分箱-把体重分为三个等级
labels = ['低','中','高']
df['mass_level'] = pd.cut(df['body_mass_g'],bins=3,labels=labels)
print(df['mass_level'].value_counts())
# 按岛屿、性别分组分析
df.groupby(['sex','island']).agg({
    'body_mass_g':['mean','count'],
})

mass_level
低    150
中    128
高     55
Name: count, dtype: int64


  df.groupby(['sex','island']).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,body_mass_g,body_mass_g
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
sex,island,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,Biscoe,4319.375,80
Female,Dream,3446.311475,61
Female,Torgersen,3395.833333,24
Male,Biscoe,5104.518072,83
Male,Dream,3987.096774,62
Male,Torgersen,4034.782609,23


## 2.睡眠质量分析

In [42]:
# 睡眠质量分析
# 1.导入库
import pandas as pd
import numpy as np
# 2.导入数据
df = pd.read_csv('data/sleep.csv')
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   person_id                400 non-null    int64  
 1   gender                   400 non-null    object 
 2   age                      400 non-null    int64  
 3   occupation               400 non-null    object 
 4   sleep_duration           400 non-null    float64
 5   sleep_quality            400 non-null    float64
 6   physical_activity_level  400 non-null    int64  
 7   stress_level             400 non-null    int64  
 8   bmi_category             400 non-null    object 
 9   blood_pressure           400 non-null    object 
 10  heart_rate               400 non-null    int64  
 11  daily_steps              400 non-null    int64  
 12  sleep_disorder           110 non-null    object 
dtypes: float64(2), int64(6), object(5)
memory usage: 40.8+ KB


Unnamed: 0,person_id,age,sleep_duration,sleep_quality,physical_activity_level,stress_level,heart_rate,daily_steps
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,39.95,8.04125,6.12575,64.985,5.4725,75.99,11076.51
std,115.614301,14.038883,2.390787,1.975733,32.297874,2.80873,15.099334,5364.789364
min,1.0,18.0,4.1,1.0,10.0,1.0,50.0,2067.0
25%,100.75,29.0,5.9,4.7,35.0,3.0,63.0,6165.25
50%,200.5,40.0,8.2,6.1,65.5,5.0,77.0,11785.5
75%,300.25,49.0,10.125,7.425,94.0,8.0,90.0,15878.0
max,400.0,90.0,12.0,10.0,120.0,10.0,100.0,19958.0


In [43]:
# 3.数据清洗
df.isna().sum()
df.drop(columns='sleep_disorder',inplace=True)

In [44]:
# 4. 数据特征的构造
df['gender'] = df['gender'].astype('category')
df['occupation'] = df['occupation'].astype('category')
df['bmi_category'] = df['bmi_category'].astype('category')
df[['high','low']]=df['blood_pressure'].str.split('/',expand=True)

# 睡眠质量的分箱
labels = ['差','中','优']
df['quality_level'] = pd.cut(df['sleep_quality'],bins=3,labels=labels)
age_labels=['青少年','中年','老年']
df['age_level'] = pd.cut(df['age'],bins=3,labels=age_labels)
df.head()

Unnamed: 0,person_id,gender,age,occupation,sleep_duration,sleep_quality,physical_activity_level,stress_level,bmi_category,blood_pressure,heart_rate,daily_steps,high,low,quality_level,age_level
0,1,Male,29,Manual Labor,7.4,7.0,41,7,Obese,124/70,91,8539,124,70,中,青少年
1,2,Female,43,Retired,4.2,4.9,41,5,Obese,131/86,81,18754,131,86,中,中年
2,3,Male,44,Retired,6.1,6.0,107,4,Underweight,122/70,81,2857,122,70,中,中年
3,4,Male,29,Office Worker,8.3,10.0,20,10,Obese,124/72,55,6886,124,72,优,青少年
4,5,Male,67,Retired,9.1,9.5,19,4,Overweight,133/78,97,14945,133,78,优,老年


In [45]:
# 5.数据的统计、分析
print(df['bmi_category'].value_counts())

bmi_category
Overweight     109
Underweight    102
Obese           98
Normal          91
Name: count, dtype: int64


In [46]:
# 根据不同的bmi分组，睡眠质量
df.groupby(['age_level','bmi_category']).agg({
    'sleep_duration':'mean',
    'sleep_quality':'mean',
    'stress_level':'mean'
})

  df.groupby(['age_level','bmi_category']).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,sleep_duration,sleep_quality,stress_level
age_level,bmi_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
青少年,Normal,8.1,6.332,4.86
青少年,Obese,8.25,6.253448,5.534483
青少年,Overweight,8.214286,6.171429,5.31746
青少年,Underweight,7.603279,5.883607,5.42623
中年,Normal,7.422222,6.65,4.944444
中年,Obese,7.805556,6.216667,5.888889
中年,Overweight,8.246154,5.95641,5.974359
中年,Underweight,8.4975,5.9075,5.75
老年,Normal,7.42,4.24,4.2
老年,Obese,7.9,5.025,8.0
