In [1]:
import pandas as pd
import numpy as np

## 1 数据转换：函数应用

In [2]:
# Demo frame: the integers 0..11 laid out as 3 rows (s1-s3) by 4 columns (c1-c4).
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['s1', 's2', 's3'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
df

Unnamed: 0,c1,c2,c3,c4
s1,0,1,2,3
s2,4,5,6,7
s3,8,9,10,11


### 1.1 apply 方法

In [None]:
# Column-wise apply: sum every column (axis=0 is the default).
print(df.apply(np.sum))
# Column-wise lambda: range (max - min) of every column.
print(df.apply(lambda col: col.max() - col.min()))
# Row-wise apply (axis=1): c1 + c2 for every row.
print(df.apply(lambda row: row['c1'] + row['c2'], axis=1))
# Series.apply works element by element: square every value of column c1.
print(df['c1'].apply(lambda x: x**2))
# Element-wise over the whole DataFrame: add 1 to every cell.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so prefer map and only fall back to applymap on older pandas versions.
# The check is made on the class so a column named "map" cannot confuse it.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
print(elementwise(lambda x: x + 1))
# Custom function: Z-score standardisation of one column.
def normalize(s):
    """Return ``s`` centred on its mean and scaled by its standard deviation.

    :param s: a pandas Series (or any object offering ``mean`` and ``std``
        and supporting subtraction / division)
    :return: ``(s - s.mean()) / s.std()``
    """
    centered = s - s.mean()
    spread = s.std()
    return centered / spread
# Standardise every column by handing the named function to apply.
print(df.apply(normalize))
# Row-wise apply returning several metrics: each row yields a Series, and
# apply assembles those Series into a new DataFrame (columns: sum, mean).
print(df.apply(
    lambda row: pd.Series([row.sum(), row.mean()], index=['sum', 'mean']),
    axis=1,
))


c1    12
c2    15
c3    18
c4    21
dtype: int64
c1    8
c2    8
c3    8
c4    8
dtype: int64
s1     1
s2     9
s3    17
dtype: int64
s1     0
s2    16
s3    64
Name: c1, dtype: int64
    c1  c2  c3  c4
s1   1   2   3   4
s2   5   6   7   8
s3   9  10  11  12
     c1   c2   c3   c4
s1 -1.0 -1.0 -1.0 -1.0
s2  0.0  0.0  0.0  0.0
s3  1.0  1.0  1.0  1.0
     sum  mean
s1   6.0   1.5
s2  22.0   5.5
s3  38.0   9.5


  print(df.applymap(lambda x: x + 1))


### 1.2 map 方法

In [None]:
import pandas as pd

# Example: the map method.
# Series.map transforms a single column element by element, using either a
# function or a dict. For element-wise work on a whole DataFrame, pandas >= 2.1
# offers DataFrame.map; DataFrame.applymap is its older, now-deprecated name.

# 1. Build a small example DataFrame.
df = pd.DataFrame(dict(
    name=['Alice', 'Bob', 'Cathy', 'David'],
    score=[95, 82, 78, 90],
    gender=['F', 'M', 'F', 'M'],
))

# 2. map on the name column: upper-case every name.
df['name_upper'] = df['name'].map(str.upper)

# 3. map on the score column: turn each score into a letter grade.
def score_to_level(s):
    """Translate a numeric score into a letter grade.

    :param s: the score to grade
    :return: 'A' when s >= 90, 'B' when s >= 80, otherwise 'C'
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    for threshold, grade in ((90, 'A'), (80, 'B')):
        if s >= threshold:
            return grade
    return 'C'

# Grade every score by passing the function itself to Series.map.
df['level'] = df['score'].map(score_to_level)

# 4. Dictionary mapping: expand the gender abbreviation to the full word.
gender_map = dict(M='Male', F='Female')
df['gender_full'] = df['gender'].map(gender_map)

# 5. Show the resulting frame with the three derived columns.
print(df)

    name  score gender name_upper level gender_full
0  Alice     95      F      ALICE     A      Female
1    Bob     82      M        BOB     B        Male
2  Cathy     78      F      CATHY     C      Female
3  David     90      M      DAVID     A        Male


In [10]:
# 单列统计：统计某一列的出现次数
from collections import Counter

def single_column_stats(data, column):
    """
    Count how often each value occurs in one column of a dataset.

    :param data: a pandas DataFrame (detected by the presence of ``iterrows``),
        or an iterable whose items are rows (dict / list / tuple) or plain values
    :param column: column label for a DataFrame or dict rows, positional index
        for list / tuple rows; ignored for items that are plain values
    :return: collections.Counter mapping each column value to its occurrence count
    """
    if hasattr(data, 'iterrows'):  # DataFrame-like input
        # Read the column directly instead of walking iterrows(): iterrows
        # builds one Series per row (slow) and does not preserve dtypes, so
        # integers in a mixed int/float frame would be counted as floats.
        values = data[column].tolist()
    else:  # list or other iterable
        # Row-like items are indexed by `column`; anything else is counted as-is.
        values = [
            item[column] if isinstance(item, (dict, list, tuple)) else item
            for item in data
        ]
    return Counter(values)

# Usage example: count the values of the score column, print the
# (value, count) pairs, then print one line per distinct score.
stats = single_column_stats(df, 'score')
print(stats.most_common())
print("分数:")
for score in stats:
    count = stats[score]
    print(f"{score}: {count}人")


[(95, 1), (82, 1), (78, 1), (90, 1)]
分数:
95: 1人
82: 1人
78: 1人
90: 1人


In [13]:
# Aggregation: raw records as a dict of equally long column lists.
# NOTE(review): the aggregation cells that follow operate on `df`, not on this
# dict — confirm whether `df = pd.DataFrame(data)` was intended here.
data = dict(
    name=["Alice", "Bob", "Charlie", "David", "Eve"],
    age=[25, 30, 35, 40, 45],
    score=[100, 90, 80, 70, 60],
    gender=["F", "M", "M", "M", "F"],
)
data

{'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
 'age': [25, 30, 35, 40, 45],
 'score': [100, 90, 80, 70, 60],
 'gender': ['F', 'M', 'M', 'M', 'F']}

In [None]:
# Aggregation
# 1. Group the rows of df by the gender column, then print the descriptive
#    statistics that describe() computes for each group.
group_by_gender = df.groupby(by="gender")
print("分组后统计")
print(group_by_gender.describe())


分组后统计
       score                                                 
       count  mean        std   min    25%   50%    75%   max
gender                                                       
F        2.0  86.5  12.020815  78.0  82.25  86.5  90.75  95.0
M        2.0  86.0   5.656854  82.0  84.00  86.0  88.00  90.0


In [15]:
# 2. Mean score per gender group; add_prefix prepends "mean_" to each
#    group label in the resulting Series index.
print("分组后求平均值")
mean_score = (
    df.groupby("gender")["score"]
    .mean()
    .add_prefix("mean_")
)
print(mean_score)


分组后求平均值
gender
mean_F    86.5
mean_M    86.0
Name: score, dtype: float64


In [19]:
# 3. Group-wise transform: transform() returns one value per original row,
#    so each row receives the max / mean / min score of its own gender group.
for column, func in (("max_score", "max"), ("avg_score", "mean"), ("min_score", "min")):
    df[column] = df.groupby("gender")["score"].transform(func)
print("分组后进行转换")
print(df)

分组后进行转换
    name  score gender name_upper level gender_full  max_score  avg_score  \
0  Alice     95      F      ALICE     A      Female         95       86.5   
1    Bob     82      M        BOB     B        Male         90       86.0   
2  Cathy     78      F      CATHY     C      Female         95       86.5   
3  David     90      M      DAVID     A        Male         90       86.0   

   min_score  
0         78  
1         82  
2         78  
3         82  


In [25]:
# Show only the columns needed to compare each score with its group average.
print(df.loc[:, ["name", "gender", "score", "avg_score"]])

    name gender  score  avg_score
0  Alice      F     95       86.5
1    Bob      M     82       86.0
2  Cathy      F     78       86.5
3  David      M     90       86.0
