# 3.3 Pandas之DataFrame

## 1. DataFrame的创建方式

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a DataFrame from Series objects: each Series becomes one column and
# the dict keys become the column labels.
s1 = pd.Series(data=[1, 2, 3, 4, 5])
s2 = pd.Series(data=[6, 7, 8, 9, 10])
df = pd.DataFrame(data={"第1列": s1, "第2列": s2})

df

Unnamed: 0,第1列,第2列
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [3]:
# Build a DataFrame from a dict of equal-length lists (one key per column).
# `index` supplies custom row labels 1..5 instead of the default 0..4, and
# `columns` states the column order explicitly.
df = pd.DataFrame(
    data={
        "name": ["tom", "jack", "alice", "bob", "allen"],
        "age": [15, 17, 20, 26, 30],
        "score": [60.5, 80, 63, 70, 85.5],
    },
    index=[1, 2, 3, 4, 5],
    columns=["name", "age", "score"],
)

df

Unnamed: 0,name,age,score
1,tom,15,60.5
2,jack,17,80.0
3,alice,20,63.0
4,bob,26,70.0
5,allen,30,85.5


## 2. DataFrame的属性

In [4]:
# Print the three building blocks of the frame: row labels, column labels,
# and the underlying values array.
for line in (
    f'行索引：{df.index}',
    f'列索引：{df.columns}',
    f'值：{df.values}',
):
    print(line)

行索引：Index([1, 2, 3, 4, 5], dtype='int64')
列索引：Index(['name', 'age', 'score'], dtype='object')
值：[['tom' 15 60.5]
 ['jack' 17 80.0]
 ['alice' 20 63.0]
 ['bob' 26 70.0]
 ['allen' 30 85.5]]


In [5]:
# Print dimensionality, shape, total element count, and per-column dtypes.
# `df.dtypes` is a Series, so its printout continues over several lines.
for label, value in (
    ('维度：', df.ndim),
    ('形状：', df.shape),
    ('元素个数：', df.size),
    ('元素类型：', df.dtypes),
):
    print(label, value)

维度： 2
形状： (5, 3)
元素个数： 15
元素类型： name      object
age        int64
score    float64
dtype: object


In [6]:
# Transpose: swap rows and columns (`.T` is shorthand for `.transpose()`).
print(df.transpose())

          1     2      3     4      5
name    tom  jack  alice   bob  allen
age      15    17     20    26     30
score  60.5  80.0   63.0  70.0   85.5


## 3. DataFrame获取数据

In [9]:
# Re-display the frame as a reference for the lookups in the following cells.
df

Unnamed: 0,name,age,score
1,tom,15,60.5
2,jack,17,80.0
3,alice,20,63.0
4,bob,26,70.0
5,allen,30,85.5


In [8]:
# Element access: .loc is label-based (explicit index), .iloc is position-based
# (implicit index); .at and .iat are their single-value counterparts.
# Selecting one row: with the index [1, 2, 3, 4, 5], label 4 and position 3
# refer to the same row.
row_by_label = df.loc[4]
row_by_position = df.iloc[3]

print(row_by_label)

print('-' * 20)

print(row_by_position)

name      bob
age        26
score    70.0
Name: 4, dtype: object
--------------------
name      bob
age        26
score    70.0
Name: 4, dtype: object


In [10]:
# Selecting one column: by label with .loc, by position with .iloc.
# 'name' is the first column, so both lookups return the same Series.
column_by_label = df.loc[:, 'name']
column_by_position = df.iloc[:, 0]
print(column_by_label)
print(column_by_position)

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object


In [13]:
# Selecting a single cell: .at / .loc take (row label, column label),
# .iat / .iloc take (row position, column position).
for cell in (
    df.at[3, 'score'],
    df.iat[2, 1],
    df.loc[3, 'score'],
    df.iloc[2, 1],
):
    print(cell)

63.0
20
63.0
20


In [19]:
# Three ways to pull out a single column, and the type each one returns.

# 1) Bracket access with a plain label -> Series.
name_series = df['name']
print(name_series)
print(type(name_series))

# 2) Attribute access -> Series. NOTE(review): this only works when the column
#    label is a valid identifier that does not clash with an existing DataFrame
#    attribute or method; bracket access is the safer general form.
name_attr = df.name
print(name_attr)
print(type(name_attr))

# 3) Bracket access with a list of labels -> one-column DataFrame.
name_frame = df[['name']]
print(name_frame)
print(type(name_frame))
name_frame

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
    name
1    tom
2   jack
3  alice
4    bob
5  allen
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,name
1,tom
2,jack
3,alice
4,bob
5,allen


In [20]:
# Selecting several columns at once: pass a list of labels; the result is a DataFrame.
df.loc[:, ['name', 'score']]

Unnamed: 0,name,score
1,tom,60.5
2,jack,80.0
3,alice,63.0
4,bob,70.0
5,allen,85.5


In [21]:
# Peek at part of the data: the first two rows.
df.head(n=2)

Unnamed: 0,name,age,score
1,tom,15,60.5
2,jack,17,80.0


In [22]:
# The last two rows.
df.tail(n=2)

Unnamed: 0,name,age,score
4,bob,26,70.0
5,allen,30,85.5


In [25]:
# Filter rows with boolean indexing.
# Only the last expression of a cell is rendered automatically, so the first
# filter is shown explicitly with display(); otherwise its result is discarded.
display(df[df.score > 70])

# Combine conditions with & (and) / | (or); each condition needs its own
# parentheses because & binds tighter than the comparison operators.
df[(df['score'] > 70) & (df.age < 20)]

Unnamed: 0,name,age,score
2,jack,17,80.0


In [26]:
# Random sampling: draw 3 rows without replacement.
# A fixed random_state makes the draw repeatable across kernel restarts;
# drop the argument to get a different sample on every run.
df.sample(n=3, random_state=42)

Unnamed: 0,name,age,score
5,allen,30,85.5
1,tom,15,60.5
2,jack,17,80.0


## 4. DataFrame的常用方法

In [27]:
# Fresh sample frame for the method demos below (same people as before,
# with different scores).
df = pd.DataFrame(
    data={
        "name": ["tom", "jack", "alice", "bob", "allen"],
        "age": [15, 17, 20, 26, 30],
        "score": [60.5, 80, 30.6, 70, 83.5],
    },
    index=[1, 2, 3, 4, 5],
    columns=["name", "age", "score"],
)

df

Unnamed: 0,name,age,score
1,tom,15,60.5
2,jack,17,80.0
3,alice,20,30.6
4,bob,26,70.0
5,allen,30,83.5


In [29]:
# head() with no argument returns the first 5 rows. Only the last expression
# of a cell is rendered automatically, so show this one with display().
display(df.head())

# tail(1) returns just the final row.
df.tail(1)

Unnamed: 0,name,age,score
5,allen,30,83.5


In [30]:
# Element-wise membership test: True wherever a cell equals one of the targets.
targets = ['jack', 20]
df.isin(values=targets)

Unnamed: 0,name,age,score
1,False,False,False
2,True,False,False
3,False,True,False
4,False,False,False
5,False,False,False


In [31]:
# Element-wise missing-value mask (same result as df.isna()).
pd.isna(df)

Unnamed: 0,name,age,score
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False


In [40]:
# Basic aggregations on single columns.
score_col, age_col = df['score'], df['age']

print(score_col.sum())     # total
print(score_col.max())     # maximum
print(age_col.min())       # minimum
print(score_col.mean())    # arithmetic mean
print(score_col.median())  # median
# mode() returns a Series because several values can tie for most frequent.
print(age_col.mode())

324.6
83.5
15
64.92
70.0
0    15
1    17
2    20
3    26
4    30
Name: age, dtype: int64


In [42]:
# Sample frame with a fully duplicated row ("tom" appears twice) and a repeated
# age value, used by the statistics and de-duplication demos below.
# `columns` reorders the output so that score comes before age.
df = pd.DataFrame(
    data={
        "name": ["tom", "tom", 'jack', 'alice', 'bob', 'allen'],
        "age": [15, 15, 15, 20, 26, 30],
        "score": [60.5, 60.5, 80, 30.6, 70, 83.5],
    },
    index=[1, 2, 3, 4, 5, 6],
    columns=["name", "score", "age"],
)
df

Unnamed: 0,name,score,age
1,tom,60.5,15
2,tom,60.5,15
3,jack,80.0,15
4,alice,30.6,20
5,bob,70.0,26
6,allen,83.5,30


In [44]:
# Measures of spread for the score column.
score_col = df['score']
print(score_col.std())  # standard deviation
print(score_col.var())  # variance
print(score_col.quantile(q=0.25))  # quantile (here the 25th percentile)

19.037375519400427
362.4216666666667
60.5


In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the percentiles listed are describe()'s defaults, spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,score,age
count,6.0,6.0
mean,64.183333,20.166667
std,19.037376,6.493587
min,30.6,15.0
25%,60.5,15.0
50%,65.25,17.5
75%,77.5,24.5
max,83.5,30.0


In [46]:
# Number of non-missing values in each column.
df.count(axis=0)

name     6
score    6
age      6
dtype: int64

In [47]:
# Frequency of each distinct row (all columns combined), most frequent first.
df.value_counts(sort=True, ascending=False)

name   score  age
tom    60.5   15     2
alice  30.6   20     1
allen  83.5   30     1
bob    70.0   26     1
jack   80.0   15     1
Name: count, dtype: int64

In [48]:
# Drop rows that repeat an earlier row in every column, keeping the first
# occurrence (row 2 duplicates row 1 here). Returns a new frame.
df.drop_duplicates(keep='first')

Unnamed: 0,name,score,age
1,tom,60.5,15
3,jack,80.0,15
4,alice,30.6,20
5,bob,70.0,26
6,allen,83.5,30


In [49]:
# Flag duplicates judged on the 'age' column only: the first occurrence of
# each age is False, later repeats are True.
df.duplicated(subset='age', keep='first')

1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [50]:
# Random sampling: draw 2 rows without replacement.
# A fixed random_state makes the draw repeatable across kernel restarts;
# drop the argument to get a different sample on every run.
df.sample(n=2, random_state=42)

Unnamed: 0,name,score,age
5,bob,70.0,26
1,tom,60.5,15


In [51]:
# Replace every cell equal to 15 with 30, in all columns.
# Returns a new frame; df itself is left unchanged.
df.replace(to_replace=15, value=30)

Unnamed: 0,name,score,age
1,tom,60.5,30
2,tom,60.5,30
3,jack,80.0,30
4,alice,30.6,20
5,bob,70.0,26
6,allen,83.5,30


In [53]:
# Cumulative sum down each column. Only the last expression of a cell is
# rendered automatically, so show this one with display().
# NOTE(review): for the string column 'name' this presumably accumulates by
# concatenation - confirm on the pandas version in use.
display(df.cumsum())

# Cumulative minimum down each column (axis=0); string columns are compared
# lexicographically.
df.cummin(axis=0)

Unnamed: 0,name,score,age
1,tom,60.5,15
2,tom,60.5,15
3,jack,60.5,15
4,alice,30.6,15
5,alice,30.6,15
6,alice,30.6,15


In [54]:
# Sort rows by their index labels, largest label first.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,name,score,age
6,allen,83.5,30
5,bob,70.0,26
4,alice,30.6,20
3,jack,80.0,15
2,tom,60.5,15
1,tom,60.5,15


In [55]:
# Sort rows by score, lowest first (ascending is the default).
df.sort_values(by='score', ascending=True)

Unnamed: 0,name,score,age
4,alice,30.6,20
1,tom,60.5,15
2,tom,60.5,15
5,bob,70.0,26
3,jack,80.0,15
6,allen,83.5,30


In [56]:
# Sample frame with tied scores (two rows at 60.5, two at 80), used to show
# multi-key sorting and nlargest / nsmallest tie-breaking.
df = pd.DataFrame(
    data={
        "name": ["tom", "tom", 'jack', 'alice', 'bob', 'allen'],
        "age": [15, 15, 15, 20, 26, 30],
        "score": [60.5, 60.5, 80, 30.6, 70, 80],
    },
    index=[1, 2, 3, 4, 5, 6],
    columns=["name", "score", "age"],
)

df

Unnamed: 0,name,score,age
1,tom,60.5,15
2,tom,60.5,15
3,jack,80.0,15
4,alice,30.6,20
5,bob,70.0,26
6,allen,80.0,30


In [57]:
# Multi-key sort: score ascending first, then age descending to break ties.
sort_keys = ['score', 'age']
sort_ascending = [True, False]
df.sort_values(by=sort_keys, ascending=sort_ascending)

Unnamed: 0,name,score,age
4,alice,30.6,20
1,tom,60.5,15
2,tom,60.5,15
5,bob,70.0,26
6,allen,80.0,30
3,jack,80.0,15


In [59]:
# Top 2 rows by score, with age as the tie-breaker. Only the last expression
# of a cell is rendered automatically, so show this one with display().
display(df.nlargest(2, columns=['score', 'age']))

# Bottom 2 rows by score, with age as the tie-breaker.
df.nsmallest(2, columns=['score', 'age'])

Unnamed: 0,name,score,age
4,alice,30.6,20
1,tom,60.5,15


# 3.4 DataFrame案例分析

## 1. 学生成绩分析

In [60]:
'''
案例1：学生成绩分析
场景：某班级的学生成绩数据如下，请完成以下任务：
1. 计算每位学生的总分和平均分。
2. 找出数学成绩高于90分或英语成绩高于85分的学生。
3. 按总分从高到低排序，并输出前3名学生。
'''

# pandas is already imported at the top of the notebook; repeated here so the
# case study can be run on its own.
import pandas as pd

# One row per student: name (姓名) plus Math (数学), English (英语) and
# Physics (物理) scores.
data = {
    '姓名': ['张三', '李四', '王五', '赵六', '钱七'],
    '数学': [85, 92, 78, 88, 95],
    '英语': [90, 88, 85, 92, 80],
    '物理': [75, 80, 88, 85, 90],
}
scores = pd.DataFrame(data=data)
scores

Unnamed: 0,姓名,数学,英语,物理
0,张三,85,90,75
1,李四,92,88,80
2,王五,78,85,88
3,赵六,88,92,85
4,钱七,95,80,90


In [64]:
# Task 1: total and average score for each student.
subject_cols = ['数学', '英语', '物理']
subject_scores = scores[subject_cols]

# Row-wise sum across the three subject columns.
scores['总分'] = subject_scores.sum(axis=1)

# The average computed two ways: total divided by the number of subjects,
# and a row-wise mean over the subject columns.
scores['平均分'] = scores['总分'] / 3
scores['平均分2'] = subject_scores.mean(axis=1)

scores

Unnamed: 0,姓名,数学,英语,物理,总分,平均分,平均分2
0,张三,85,90,75,250,83.333333,83.333333
1,李四,92,88,80,260,86.666667,86.666667
2,王五,78,85,88,251,83.666667,83.666667
3,赵六,88,92,85,265,88.333333,88.333333
4,钱七,95,80,90,265,88.333333,88.333333


In [65]:
# Task 2: students with a Math score above 90 OR an English score above 85.
high_math = scores['数学'] > 90
high_english = scores['英语'] > 85
scores[high_math | high_english]

Unnamed: 0,姓名,数学,英语,物理,总分,平均分,平均分2
0,张三,85,90,75,250,83.333333,83.333333
1,李四,92,88,80,260,86.666667,86.666667
3,赵六,88,92,85,265,88.333333,88.333333
4,钱七,95,80,90,265,88.333333,88.333333


In [66]:
# Task 3: rank by total score, highest first, and keep the top 3 students.
ranked = scores.sort_values(by='总分', ascending=False)
ranked.head(n=3)

Unnamed: 0,姓名,数学,英语,物理,总分,平均分,平均分2
3,赵六,88,92,85,265,88.333333,88.333333
4,钱七,95,80,90,265,88.333333,88.333333
1,李四,92,88,80,260,86.666667,86.666667


## 2. 销售数据分析

In [67]:
'''
案例2：销售数据分析
场景：某公司销售数据如下，请完成以下任务：
1. 计算每种产品的总销售额（销售额 = 单价 × 销量）。
2. 找出销售额最高的产品。
3. 按销售额从高到低排序，并输出所有产品信息。
'''
# pandas is already imported at the top of the notebook; repeated here so the
# case study can be run on its own.
import pandas as pd

# One row per product: product name (产品名称), unit price (单价) and
# units sold (销量).
data = {
    '产品名称': ['A', 'B', 'C', 'D'],
    '单价': [100, 150, 200, 120],
    '销量': [50, 30, 20, 40],
}

df = pd.DataFrame(data=data)

In [68]:
# Task 1: revenue per product = unit price x units sold (element-wise).
df['总销售额'] = df['单价'].mul(df['销量'])
df

Unnamed: 0,产品名称,单价,销量,总销售额
0,A,100,50,5000
1,B,150,30,4500
2,C,200,20,4000
3,D,120,40,4800


In [69]:
# Task 2: the single product with the highest revenue.
df.nlargest(n=1, columns='总销售额')

Unnamed: 0,产品名称,单价,销量,总销售额
0,A,100,50,5000


In [70]:
# Task 3: all products ordered by revenue, highest first.
df.sort_values(by='总销售额', ascending=False)

Unnamed: 0,产品名称,单价,销量,总销售额
0,A,100,50,5000
3,D,120,40,4800
1,B,150,30,4500
2,C,200,20,4000


## 3. 电商用户行为分析

In [1]:
'''
案例3：电商用户行为分析
场景：某电商平台的用户行为数据如下，请完成以下任务：
1. 计算每位用户的总消费金额（消费金额 = 商品单价 × 购买数量）
2. 找出消费金额最高的用户，并输出其所有信息
3. 计算所有用户的平均消费金额（保留2位小数）
4. 统计电子产品的总购买数量
'''
# Repeated import so the case study can be run on its own.
import pandas as pd

# One row per user: user id (用户ID), user name (用户名), product category
# (商品类别), unit price (商品单价) and quantity purchased (购买数量).
data = {
    '用户ID': [101, 102, 103, 104, 105],
    '用户名': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    '商品类别': ['电子产品', '服饰', '电子产品', '家居', '服饰'],
    '商品单价': [1200, 300, 800, 150, 200],
    '购买数量': [1, 3, 2, 5, 4],
}

df = pd.DataFrame(data=data)
df

Unnamed: 0,用户ID,用户名,商品类别,商品单价,购买数量
0,101,Alice,电子产品,1200,1
1,102,Bob,服饰,300,3
2,103,Charlie,电子产品,800,2
3,104,David,家居,150,5
4,105,Eve,服饰,200,4


In [3]:
# Task 1: total spend per user = unit price x quantity (element-wise).
df['总消费金额'] = df['商品单价'].mul(df['购买数量'])
df

Unnamed: 0,用户ID,用户名,商品类别,商品单价,购买数量,总消费金额
0,101,Alice,电子产品,1200,1,1200
1,102,Bob,服饰,300,3,900
2,103,Charlie,电子产品,800,2,1600
3,104,David,家居,150,5,750
4,105,Eve,服饰,200,4,800


In [4]:
# Task 2: the full record of the user with the highest total spend.
df.nlargest(n=1, columns='总消费金额')

Unnamed: 0,用户ID,用户名,商品类别,商品单价,购买数量,总消费金额
2,103,Charlie,电子产品,800,2,1600


In [5]:
# Task 3: average spend across all users, rounded to 2 decimal places as the
# task statement requires.
round(df['总消费金额'].mean(), 2)

1050.0

In [6]:
# Task 4: total quantity purchased in the electronics (电子产品) category.
# A single .loc call selects the matching rows and the column in one step.
is_electronics = df['商品类别'] == '电子产品'
df.loc[is_electronics, '购买数量'].sum()

3