# 3.3 Pandas之DataFrame

## 1. DataFrame的创建方式

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 通过series来创建
s1 = pd.Series([1,2,3,4,5])
s2 = pd.Series([6,7,8,9,10])
df = pd.DataFrame({"第1列":s1,"第2列":s2})

df

Unnamed: 0,第1列,第2列
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [3]:
# 通过字典来创建
df = pd.DataFrame(
    {
        "name":["tom",'jack','alice','bob','allen'],
        "age":[15,17,20,26,30],
        "score":[60.5,80,30.6,70,83.5]
    },index=[1,2,3,4,5],columns=["name","score","age"]
)

df

Unnamed: 0,name,score,age
1,tom,60.5,15
2,jack,80.0,17
3,alice,30.6,20
4,bob,70.0,26
5,allen,83.5,30


## 2. DataFrame的属性

In [6]:
print(f'行索引：{df.index}')
print(f'列标签：{df.columns}')
print(f'值：{df.values}')

行索引：Index([1, 2, 3, 4, 5], dtype='int64')
列标签：Index(['name', 'score', 'age'], dtype='object')
值：[['tom' 60.5 15]
 ['jack' 80.0 17]
 ['alice' 30.6 20]
 ['bob' 70.0 26]
 ['allen' 83.5 30]]


In [7]:
print('维度：',df.ndim)
print('形状:', df.shape)
print('元素个数：', df.size)
print('数据类型：')
print(df.dtypes)

维度： 2
形状: (5, 3)
元素个数： 15
数据类型：
name      object
score    float64
age        int64
dtype: object


In [8]:
# 行列转置
print(df.T)

          1     2      3     4      5
name    tom  jack  alice   bob  allen
score  60.5  80.0   30.6  70.0   83.5
age      15    17     20    26     30


## 3. DataFrame获取数据

In [11]:
# 获取元素 loc显  iloc隐  at  iat
# 某行
print(df.loc[4])

print('-' * 20)

print(df.iloc[3])

name      bob
score    70.0
age        26
Name: 4, dtype: object
--------------------
name      bob
score    70.0
age        26
Name: 4, dtype: object


In [12]:
# 某列
print(df.loc[:,'name'])
print(df.iloc[:,0])

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object


In [13]:
# 单个元素
print(df.at[3,'score'])
print(df.iat[2,1])
print(df.loc[3,'score'])
print(df.iloc[2,1])

30.6
30.6
30.6
30.6


In [14]:
# 获取单列数据
print(df['name'])
print(type(df['name']))
print(df.name)
print(type(df.name))
print(df[['name']])
print(type(df[['name']]))
df[['name']]

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
    name
1    tom
2   jack
3  alice
4    bob
5  allen
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,name
1,tom
2,jack
3,alice
4,bob
5,allen


In [15]:
print(df[['name','score']]) # 多列数据的获取

    name  score
1    tom   60.5
2   jack   80.0
3  alice   30.6
4    bob   70.0
5  allen   83.5


In [16]:
# 查看部分数据
print(df.head(2))

   name  score  age
1   tom   60.5   15
2  jack   80.0   17


In [17]:
print(df.tail(3))

    name  score  age
3  alice   30.6   20
4    bob   70.0   26
5  allen   83.5   30


In [18]:
# 使用布尔索引筛选数据
df[df.score>70]
df[ (df['score']>70) & (df.age<20)]

Unnamed: 0,name,score,age
2,jack,80.0,17


In [20]:
# 随机抽样
df.sample(3)

Unnamed: 0,name,score,age
1,tom,60.5,15
3,alice,30.6,20
2,jack,80.0,17


## 4. DataFrame的常用方法

In [21]:
df = pd.DataFrame(
    {
        "name":["tom",'jack','alice','bob','allen'],
        "age":[15,17,20,26,30],
        "score":[60.5,80,30.6,70,83.5]
    },index=[1,2,3,4,5],columns=["name","score","age"]
)

In [23]:
print(df.head()) #查看前n行数据，默认是5行

print('-' * 20)

print(df.tail(1))  #查看后n行数据，默认是5行

    name  score  age
1    tom   60.5   15
2   jack   80.0   17
3  alice   30.6   20
4    bob   70.0   26
5  allen   83.5   30
--------------------
    name  score  age
5  allen   83.5   30


In [24]:
print(df.isin(['jack',20]))  #查看元素是否包含在参数集合中

    name  score    age
1  False  False  False
2   True  False  False
3  False  False   True
4  False  False  False
5  False  False  False


In [25]:
print(df.isna()) # 查看元素是否是缺失值

    name  score    age
1  False  False  False
2  False  False  False
3  False  False  False
4  False  False  False
5  False  False  False


In [28]:
print(df['score'].sum())  #某一列的总和
print(df.score.max())  #最大值
print(df.age.min())  #最小值
print(df.score.mean())  #平均数
print(df.score.median())  #中位数
print(df.age.mode())  #众数

385.1
83.5
15
64.18333333333334
65.25
0    15
Name: age, dtype: int64


In [27]:
df = pd.DataFrame(
    {
        "name":["tom","tom",'jack','alice','bob','allen'],
        "age":[15,15,15,20,26,30],
        "score":[60.5,60.5,80,30.6,70,83.5]
    },index=[1,2,3,4,5,6],columns=["name","score","age"]
)

In [29]:
print(df.score.std())  #标准差
print(df.score.var()) #方差
print(df.score.quantile(0.25))  #分位数

19.037375519400427
362.4216666666667
60.5


In [30]:
print(df.describe())

           score        age
count   6.000000   6.000000
mean   64.183333  20.166667
std    19.037376   6.493587
min    30.600000  15.000000
25%    60.500000  15.000000
50%    65.250000  17.500000
75%    77.500000  24.500000
max    83.500000  30.000000


In [31]:
print(df.count())  #每一列非缺失值的个数

name     6
score    6
age      6
dtype: int64


In [32]:
print(df.value_counts()) #出现的次数

name   score  age
tom    60.5   15     2
alice  30.6   20     1
allen  83.5   30     1
bob    70.0   26     1
jack   80.0   15     1
Name: count, dtype: int64


In [33]:
print(df.drop_duplicates())

    name  score  age
1    tom   60.5   15
3   jack   80.0   15
4  alice   30.6   20
5    bob   70.0   26
6  allen   83.5   30


In [34]:
print(df.duplicated(subset=['age']))  #查看是否重复

1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool


In [35]:
df.sample(2) #随机抽样

Unnamed: 0,name,score,age
6,allen,83.5,30
2,tom,60.5,15


In [36]:
print(df.replace(15,30))

    name  score  age
1    tom   60.5   30
2    tom   60.5   30
3   jack   80.0   30
4  alice   30.6   20
5    bob   70.0   26
6  allen   83.5   30


In [37]:
df.cumsum()
df.cummin(axis=0)

Unnamed: 0,name,score,age
1,tom,60.5,15
2,tom,60.5,15
3,jack,60.5,15
4,alice,30.6,15
5,alice,30.6,15
6,alice,30.6,15


In [38]:
print(df.sort_index(ascending=False))

    name  score  age
6  allen   83.5   30
5    bob   70.0   26
4  alice   30.6   20
3   jack   80.0   15
2    tom   60.5   15
1    tom   60.5   15


In [39]:
print(df.sort_values(by='score'))

    name  score  age
4  alice   30.6   20
1    tom   60.5   15
2    tom   60.5   15
5    bob   70.0   26
3   jack   80.0   15
6  allen   83.5   30


In [40]:
df = pd.DataFrame(
    {
        "name":["tom","tom",'jack','alice','bob','allen'],
        "age":[15,15,15,20,26,30],
        "score":[60.5,60.5,80,30.6,70,80]
    },index=[1,2,3,4,5,6],columns=["name","score","age"]
)

In [41]:
print(df.sort_values(by=['score','age'],ascending=[True,False]))

    name  score  age
4  alice   30.6   20
1    tom   60.5   15
2    tom   60.5   15
5    bob   70.0   26
6  allen   80.0   30
3   jack   80.0   15


In [42]:
df.nlargest(2,columns=['score','age'])
df.nsmallest(2,columns=['score','age'])

Unnamed: 0,name,score,age
4,alice,30.6,20
1,tom,60.5,15


# 3.4 DataFrame案例分析

## 1. 学生成绩分析

In [44]:
'''
案例1：学生成绩分析
场景：某班级的学生成绩数据如下，请完成以下任务：
1. 计算每位学生的总分和平均分。
2. 找出数学成绩高于90分或英语成绩高于85分的学生。
3. 按总分从高到低排序，并输出前3名学生。
'''
import pandas as pd
data = {
    '姓名': ['张三', '李四', '王五', '赵六', '钱七'],
    '数学': [85, 92, 78, 88, 95],
    '英语': [90, 88, 85, 92, 80],
    '物理': [75, 80, 88, 85, 90]
}
scores = pd.DataFrame(data)
scores

Unnamed: 0,姓名,数学,英语,物理
0,张三,85,90,75
1,李四,92,88,80
2,王五,78,85,88
3,赵六,88,92,85
4,钱七,95,80,90


In [46]:
#1. 计算每位学生的总分和平均分。
scores['总分'] = scores[['数学','英语','物理']].sum(axis=1)
scores['平均分'] = scores['总分'] / 3
scores['平均分2'] = scores[['数学','英语','物理']].mean(axis=1)
scores

Unnamed: 0,姓名,数学,英语,物理,总分,平均分,平均分2
0,张三,85,90,75,250,83.333333,83.333333
1,李四,92,88,80,260,86.666667,86.666667
2,王五,78,85,88,251,83.666667,83.666667
3,赵六,88,92,85,265,88.333333,88.333333
4,钱七,95,80,90,265,88.333333,88.333333


In [47]:
#2. 找出数学成绩高于90分或英语成绩高于85分的学生。
scores[ (scores['数学']>90 ) | (scores['英语']>85 )  ]

Unnamed: 0,姓名,数学,英语,物理,总分,平均分,平均分2
0,张三,85,90,75,250,83.333333,83.333333
1,李四,92,88,80,260,86.666667,86.666667
3,赵六,88,92,85,265,88.333333,88.333333
4,钱七,95,80,90,265,88.333333,88.333333


In [48]:
#3. 按总分从高到低排序，并输出前3名学生。
r1 = scores.sort_values('总分',ascending=False).head(3)
r2 =scores.nlargest(3,columns=['总分'])
print(r1)
print(r2)

   姓名  数学  英语  物理   总分        平均分       平均分2
3  赵六  88  92  85  265  88.333333  88.333333
4  钱七  95  80  90  265  88.333333  88.333333
1  李四  92  88  80  260  86.666667  86.666667
   姓名  数学  英语  物理   总分        平均分       平均分2
3  赵六  88  92  85  265  88.333333  88.333333
4  钱七  95  80  90  265  88.333333  88.333333
1  李四  92  88  80  260  86.666667  86.666667


## 2. 销售数据分析

In [50]:
'''
案例2：销售数据分析
场景：某公司销售数据如下，请完成以下任务：
1. 计算每种产品的总销售额（销售额 = 单价 × 销量）。
2. 找出销售额最高的产品。
3. 按销售额从高到低排序，并输出所有产品信息。
'''
import pandas as pd

data = {
    '产品名称': ['A', 'B', 'C', 'D'],
    '单价': [100, 150, 200, 120],
    '销量': [50, 30, 20, 40]
}

df = pd.DataFrame(data)

In [52]:
df['总销售额'] = df['单价']*df['销量']
df

Unnamed: 0,产品名称,单价,销量,总销售额
0,A,100,50,5000
1,B,150,30,4500
2,C,200,20,4000
3,D,120,40,4800


In [53]:
df.nlargest(1,columns=['总销售额'])
df

Unnamed: 0,产品名称,单价,销量,总销售额
0,A,100,50,5000
1,B,150,30,4500
2,C,200,20,4000
3,D,120,40,4800


In [54]:
df.sort_values('总销售额',ascending=False)
df

Unnamed: 0,产品名称,单价,销量,总销售额
0,A,100,50,5000
1,B,150,30,4500
2,C,200,20,4000
3,D,120,40,4800


## 3. 电商用户行为分析

In [63]:
'''案例3：电商用户行为分析
场景：某电商平台的用户行为数据如下，请完成以下任务：
1. 计算每位用户的总消费金额（消费金额 = 商品单价 × 购买数量）
2. 找出消费金额最高的用户，并输出其所有信息
3. 计算所有用户的平均消费金额（保留2位小数）
4. 统计电子产品的总购买数量
'''
import pandas as pd

data = {
    '用户ID': [101, 102, 103, 104, 105],
    '用户名': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    '商品类别': ['电子产品', '服饰', '电子产品', '家居', '服饰'],
    '商品单价': [1200, 300, 800, 150, 200],
    '购买数量': [1, 3, 2, 5, 4]
}

df = pd.DataFrame(data)
df

Unnamed: 0,用户ID,用户名,商品类别,商品单价,购买数量
0,101,Alice,电子产品,1200,1
1,102,Bob,服饰,300,3
2,103,Charlie,电子产品,800,2
3,104,David,家居,150,5
4,105,Eve,服饰,200,4


In [65]:
df['总消费金额'] = df['商品单价']*df['购买数量']
df

Unnamed: 0,用户ID,用户名,商品类别,商品单价,购买数量,总消费金额
0,101,Alice,电子产品,1200,1,1200
1,102,Bob,服饰,300,3,900
2,103,Charlie,电子产品,800,2,1600
3,104,David,家居,150,5,750
4,105,Eve,服饰,200,4,800


In [66]:
df.nlargest(1,columns=['总消费金额'])

Unnamed: 0,用户ID,用户名,商品类别,商品单价,购买数量,总消费金额
2,103,Charlie,电子产品,800,2,1600


In [67]:
df['总消费金额'].mean()

1050.0

In [68]:
df[df['商品类别']=='电子产品']['购买数量'].sum()

3

## 4.

In [29]:
#数据变形
import pandas as pd
data = {
    'ID': [1, 2],
    'name':['张三','李四'],
    'Math': [90, 85],
    'English': [88, 92],
    'Science': [95, 89]
}
df = pd.DataFrame(data)
df
df.T
#宽表转长表
df2= pd.melt(df, id_vars=['ID','name'], var_name='科目', value_name='分数')
df2.sort_values(by=['name','科目'])
#长表转宽表
df3=pd.pivot(df2,index=['ID','name'],columns=['科目'],values='分数')
#分列
data = {
    'ID': [1, 2],
    'name':['alice smith','bob jack'],
    'Math': [90, 85],
    'English': [88, 92],
    'Science': [95, 89]
}
df = pd.DataFrame(data)
df[['first name','last name']] = df['name'].str.split(' ')
# 加载数据
df = pd.read_csv("data/sleep.csv")

df=df[['person_id','blood_pressure']]
df[['high','low']]=df['blood_pressure'].str.split('/',expand=True)
df

## 5. 

In [78]:
# 加载数据
df_employees = pd.read_csv("data/employees.csv")
df_employees


Unnamed: 0,employee_id,first_name,last_name,email,phone_number,job_id,salary,commission_pct,manager_id,department_id
0,100,Steven,King,SKING,515.123.4567,AD_PRES,24000.0,,,90.0
1,101,N_ann,Kochhar,NKOCHHAR,515.123.4568,AD_VP,17000.0,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,AD_VP,17000.0,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,IT_PROG,9000.0,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,IT_PROG,6000.0,,103.0,60.0
...,...,...,...,...,...,...,...,...,...,...
102,202,Pat,Fay,PFAY,603.123.6666,MK_REP,6000.0,,201.0,20.0
103,203,Susan,Mavris,SMAVRIS,515.123.7777,HR_REP,6500.0,,101.0,40.0
104,204,Hermann,Baer,HBAER,515.123.8888,PR_REP,10000.0,,101.0,70.0
105,205,Shelley,Higgins,SHIGGINS,515.123.8080,AC_MGR,12000.0,,101.0,110.0


In [None]:
# 加载数据
df_employees = pd.read_csv("data/employees.csv")
df_employees
# 1. 将first_name首字母大写，其余小写
df_employees['first_name'] = df_employees['first_name'].str.capitalize()

# 2. 提取邮箱域名（@后部分）
df_employees['email_domain'] = df_employees['email'].str.extract(r'@(.+)')

print(df_employees[['first_name', 'email', 'email_domain']].head())