3.1 数据结构

3.1.1Series

Series(data = None, index = None, dtype = None, name = None, copy = False, fastpath = False)

In [330]:
import pandas as pd

In [331]:
ser_obj = pd.Series(data = ['Python', 'C', 'Java', 'JavaScript', 'PHP', 'R'])
ser_obj

0        Python
1             C
2          Java
3    JavaScript
4           PHP
5             R
dtype: object

In [332]:
# 显式给数据指定标签索引
ser_obj = pd.Series(data = ['Python', 'C', 'Java', 'JavaScript', 'PHP', 'R'], index = ['a', 'b', 'c', 'd', 'e', 'f'])
ser_obj

a        Python
b             C
c          Java
d    JavaScript
e           PHP
f             R
dtype: object

In [333]:
# 除通过列表创建外，还可以通过字典创建
year_dict = {'a':2022, 'b':2023, 'c':2024, 'd':2025, 'e':2026, 'f':2027}
ser_obj = pd.Series(data = year_dict)
ser_obj

a    2022
b    2023
c    2024
d    2025
e    2026
f    2027
dtype: int64

In [334]:
ser_obj.index # 获取索引

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [335]:
ser_obj.values # 获取数据

array([2022, 2023, 2024, 2025, 2026, 2027], dtype=int64)

3.1.2DataFrame

In [336]:
import numpy as np
import pandas as pd

In [337]:
arr_2d = np.arange(1, 31).reshape((6, 5)) # 创建二维数组
df_obj = pd.DataFrame(data = arr_2d)
df_obj

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10
2,11,12,13,14,15
3,16,17,18,19,20
4,21,22,23,24,25
5,26,27,28,29,30


In [338]:
# 创建并指定列索引
df_obj = pd.DataFrame(data = arr_2d, columns = ['No1', 'No2', 'No3', 'No4', 'No5'])
df_obj

Unnamed: 0,No1,No2,No3,No4,No5
0,1,2,3,4,5
1,6,7,8,9,10
2,11,12,13,14,15
3,16,17,18,19,20
4,21,22,23,24,25
5,26,27,28,29,30


In [339]:
result = df_obj.No2
result

0     2
1     7
2    12
3    17
4    22
5    27
Name: No2, dtype: int32

In [340]:
type(result) # 查看类型

pandas.core.series.Series

In [341]:
df_obj.info() # 查看df_obj对象的摘要信息

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   No1     6 non-null      int32
 1   No2     6 non-null      int32
 2   No3     6 non-null      int32
 3   No4     6 non-null      int32
 4   No5     6 non-null      int32
dtypes: int32(5)
memory usage: 252.0 bytes


3.2索引和切片操作

3.2.1索引对象

1.不可变性

In [342]:
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
ser_index = ser_obj.index
ser_index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [343]:
ser_index[2]

'c'

In [344]:
# ser_index['2'] = 'cc'

2.可重复性

In [345]:
ser_obj = pd.Series(range(5), index = ['a', 'a', 'c', 'd', 'e'])
ser_index = ser_obj.index
ser_index

Index(['a', 'a', 'c', 'd', 'e'], dtype='object')

In [346]:
ser_index.is_unique # 判断索引的值是否是唯一的

False

3.2.2重置索引

reindex(label = None, index = None, column = None, axis = None, method = None, copy = None, level = None, fill_value = nan, limit = None, tolerance = None)

In [347]:
df_obj = pd.DataFrame({'no1':[1.0, 2.0, 3.0], 'no2':[4.0, 5.0, 6.0]}, index = ['a', 'b', 'c'])
df_obj

Unnamed: 0,no1,no2
a,1.0,4.0
b,2.0,5.0
c,3.0,6.0


In [348]:
new_df = df_obj.reindex(index = ['a', 'c', 'e']) # 重置索引
new_df

Unnamed: 0,no1,no2
a,1.0,4.0
c,3.0,6.0
e,,


In [349]:
new_df = df_obj.reindex(index = ['a', 'c', 'e'], fill_value = 9) # 重置索引，并指定填充的值
new_df

Unnamed: 0,no1,no2
a,1.0,4.0
c,3.0,6.0
e,9.0,9.0


In [350]:
new_df = df_obj.reindex(index = ['a', 'c', 'e'], method = 'ffill') # 重置索引，并指定填充的值
new_df

Unnamed: 0,no1,no2
a,1.0,4.0
c,3.0,6.0
e,3.0,6.0


3.2.3 通过索引和切片获取数据

1、Series的索引和切片操作

In [351]:
ser_obj = pd.Series([10, 20, 30, 40, 50], index = ['one', 'two', 'three', 'four', 'five'])
ser_obj[2]

30

In [352]:
ser_obj['three']

30

In [353]:
ser_obj[[0, 2, 3]]

one      10
three    30
four     40
dtype: int64

In [354]:
ser_obj[['one', 'three', 'four']]

one      10
three    30
four     40
dtype: int64

In [355]:
ser_obj > 20

one      False
two      False
three     True
four      True
five      True
dtype: bool

In [356]:
ser_obj[ser_obj > 20]

three    30
four     40
five     50
dtype: int64

In [357]:
ser_obj[1 : 3]

two      20
three    30
dtype: int64

In [358]:
ser_obj['two' : 'four']

two      20
three    30
four     40
dtype: int64

2、DataFrame的索引和切片操作

In [359]:
arr = np.arange(12).reshape(3, 4)
df_obj = pd.DataFrame(arr, index = ['row_1', 'row_2', 'row_3'], columns = ['col_1', 'col_2', 'col_3', 'col_4'])
df_obj

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_2,4,5,6,7
row_3,8,9,10,11


In [360]:
df_obj['col_2']

row_1    1
row_2    5
row_3    9
Name: col_2, dtype: int32

In [361]:
df_obj[['col_1', 'col_3']]

Unnamed: 0,col_1,col_3
row_1,0,2
row_2,4,6
row_3,8,10


In [362]:
df_obj[1 : 3]

Unnamed: 0,col_1,col_2,col_3,col_4
row_2,4,5,6,7
row_3,8,9,10,11


In [363]:
df_obj[1 : 3][['col_1', 'col_3']]

Unnamed: 0,col_1,col_3
row_2,4,6
row_3,8,10


3.2.4 通过loc和iloc属性获取数据

1、loc属性

In [364]:
# 使用格式
# Series.loc[参数]
# DataFrame.loc[参数1, 参数2]

In [365]:
ser_obj = pd.Series([10, 20, 30, 40, 50], index = ['row1', 'row2', 'row3', 'row4', 'row5'])
ser_obj.loc['row2']

20

In [366]:
ser_obj.loc[['row2', 'row5']]

row2    20
row5    50
dtype: int64

In [367]:
ser_obj.loc['row3': 'row5']

row3    30
row4    40
row5    50
dtype: int64

In [368]:
ser_bool = ser_obj < 30
ser_obj.loc[ser_bool]

row1    10
row2    20
dtype: int64

In [369]:
arr = np.arange(12).reshape(3, 4)
df_obj = pd.DataFrame(arr, index = ['row_1', 'row_2', 'row_3'], columns = ['col_1', 'col_2', 'col_3', 'col_4'])
df_obj

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_2,4,5,6,7
row_3,8,9,10,11


In [370]:
df_obj.loc['row_1']

col_1    0
col_2    1
col_3    2
col_4    3
Name: row_1, dtype: int32

In [371]:
df_obj.loc[['row_1', 'row_3']]

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_3,8,9,10,11


In [372]:
df_obj.loc['row_1':'row_2']

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_2,4,5,6,7


In [373]:
df_obj.loc[[True, False, True]]

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_3,8,9,10,11


In [374]:
df_obj.loc['row_1', 'col_3']

2

In [375]:
df_obj.loc['row_1':'row_3',['col_1', 'col_3']]

Unnamed: 0,col_1,col_3
row_1,0,2
row_2,4,6
row_3,8,10


2、iloc属性

In [376]:
# 使用格式
# Series.iloc[参数]
# DataFrame.iloc[参数1, 参数2]

In [377]:
df_obj.iloc[0]

col_1    0
col_2    1
col_3    2
col_4    3
Name: row_1, dtype: int32

In [378]:
df_obj.iloc[[0, 2]]

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_3,8,9,10,11


In [379]:
df_obj.iloc[0: 2]

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_2,4,5,6,7


In [380]:
df_obj.iloc[[True, False, True]]

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_3,8,9,10,11


In [381]:
df_obj.iloc[2, 2]

10

In [382]:
df_obj.iloc[0:3, [0, 2]]

Unnamed: 0,col_1,col_3
row_1,0,2
row_2,4,6
row_3,8,10


3.3 读写数据

3.3.1 读写CSV和TXT文件的数据

1、通过to_csv()方法向文本文件写入数据

In [383]:
df_obj = pd.DataFrame({'编号':['CNN001','CNN002','CNN003','CNN004',
                             'CNN005','CNN006','CNN007','CNN008','CNN009','CNN010'],
                      '姓名':['小明','小红','小蓝','小黑','小白','小方',
                            '小梅','小刚','小丽','小花'],
                      '性别':['男','女','女','男','男','女','女','男','女','女'],
                      '部门':['行政','人力资源','销售','研发','财务','技术','',
                            '市场','研发','技术'],
                      '职务':['员工','主管','员工','主管','员工','员工',
                            '员工','员工','主管','员工']})
df_obj

Unnamed: 0,编号,姓名,性别,部门,职务
0,CNN001,小明,男,行政,员工
1,CNN002,小红,女,人力资源,主管
2,CNN003,小蓝,女,销售,员工
3,CNN004,小黑,男,研发,主管
4,CNN005,小白,男,财务,员工
5,CNN006,小方,女,技术,员工
6,CNN007,小梅,女,,员工
7,CNN008,小刚,男,市场,员工
8,CNN009,小丽,女,研发,主管
9,CNN010,小花,女,技术,员工


In [384]:
df_obj.to_csv(r'employee_info.csv', index = False, encoding = 'gbk')
print('写入完毕')

写入完毕


2、通过read_csv()方法向文本文件读取数据

In [385]:
df_obj = pd.read_csv(r'employee_info.csv', encoding = 'gbk')
df_obj

Unnamed: 0,编号,姓名,性别,部门,职务
0,CNN001,小明,男,行政,员工
1,CNN002,小红,女,人力资源,主管
2,CNN003,小蓝,女,销售,员工
3,CNN004,小黑,男,研发,主管
4,CNN005,小白,男,财务,员工
5,CNN006,小方,女,技术,员工
6,CNN007,小梅,女,,员工
7,CNN008,小刚,男,市场,员工
8,CNN009,小丽,女,研发,主管
9,CNN010,小花,女,技术,员工


In [386]:
df_obj = pd.read_table(r'employee_info.csv', encoding = 'gbk')
df_obj

Unnamed: 0,"编号,姓名,性别,部门,职务"
0,"CNN001,小明,男,行政,员工"
1,"CNN002,小红,女,人力资源,主管"
2,"CNN003,小蓝,女,销售,员工"
3,"CNN004,小黑,男,研发,主管"
4,"CNN005,小白,男,财务,员工"
5,"CNN006,小方,女,技术,员工"
6,"CNN007,小梅,女,,员工"
7,"CNN008,小刚,男,市场,员工"
8,"CNN009,小丽,女,研发,主管"
9,"CNN010,小花,女,技术,员工"


In [387]:
df_obj.head()

Unnamed: 0,"编号,姓名,性别,部门,职务"
0,"CNN001,小明,男,行政,员工"
1,"CNN002,小红,女,人力资源,主管"
2,"CNN003,小蓝,女,销售,员工"
3,"CNN004,小黑,男,研发,主管"
4,"CNN005,小白,男,财务,员工"


In [388]:
df_obj.head(3)

Unnamed: 0,"编号,姓名,性别,部门,职务"
0,"CNN001,小明,男,行政,员工"
1,"CNN002,小红,女,人力资源,主管"
2,"CNN003,小蓝,女,销售,员工"


In [389]:
df_obj.tail()

Unnamed: 0,"编号,姓名,性别,部门,职务"
5,"CNN006,小方,女,技术,员工"
6,"CNN007,小梅,女,,员工"
7,"CNN008,小刚,男,市场,员工"
8,"CNN009,小丽,女,研发,主管"
9,"CNN010,小花,女,技术,员工"


In [390]:
df_obj.tail(3)

Unnamed: 0,"编号,姓名,性别,部门,职务"
7,"CNN008,小刚,男,市场,员工"
8,"CNN009,小丽,女,研发,主管"
9,"CNN010,小花,女,技术,员工"


3.3.2 读写EXCEL文件的数据

1、通过to_excel()方法向Excel文件写入数据

In [391]:
df_obj = pd.DataFrame({
    '手机名称':['华为mate50 Pro','华为畅享 50 Pro','华为 P50','华为智选 优畅享50','华为P50 Pocket'],
    '机身内存':['256GB','256GB','128GB','128GB','256GB'],
    '运行内存':['8GB','8GB','8GB','8GB','8GB'],
    '颜色':['耀金黑','幻夜黑','可可茶金','月光银','云锦白'],
    '价格':['6799','1799','3758','999','8188']})
df_obj

Unnamed: 0,手机名称,机身内存,运行内存,颜色,价格
0,华为mate50 Pro,256GB,8GB,耀金黑,6799
1,华为畅享 50 Pro,256GB,8GB,幻夜黑,1799
2,华为 P50,128GB,8GB,可可茶金,3758
3,华为智选 优畅享50,128GB,8GB,月光银,999
4,华为P50 Pocket,256GB,8GB,云锦白,8188


In [392]:
df_obj.to_excel(r'phones.xlsx')
print('写入完毕')

写入完毕


2、通过read_excel()方法向Excel文件读取数据

In [393]:
df_obj = pd.read_excel(r'phones.xlsx')
df_obj

Unnamed: 0.1,Unnamed: 0,手机名称,机身内存,运行内存,颜色,价格
0,0,华为mate50 Pro,256GB,8GB,耀金黑,6799
1,1,华为畅享 50 Pro,256GB,8GB,幻夜黑,1799
2,2,华为 P50,128GB,8GB,可可茶金,3758
3,3,华为智选 优畅享50,128GB,8GB,月光银,999
4,4,华为P50 Pocket,256GB,8GB,云锦白,8188


In [394]:
df_obj = pd.read_excel(r'phones.xlsx', usecols = [1,2,3,4,5])
df_obj

Unnamed: 0,手机名称,机身内存,运行内存,颜色,价格
0,华为mate50 Pro,256GB,8GB,耀金黑,6799
1,华为畅享 50 Pro,256GB,8GB,幻夜黑,1799
2,华为 P50,128GB,8GB,可可茶金,3758
3,华为智选 优畅享50,128GB,8GB,月光银,999
4,华为P50 Pocket,256GB,8GB,云锦白,8188


3.3.3 读取网页表格的数据

In [395]:
tables = pd.read_html(io = 'https://www.tiobe.com/tiobe-index/')
tables[3]

Unnamed: 0,Year,Winner
0,2024,Python
1,2023,C#
2,2022,C++
3,2021,Python
4,2020,Python
5,2019,C
6,2018,Python
7,2017,C
8,2016,Go
9,2015,Java


3.3.4 读写数据库

1、通过to_sql()方法向数据库写入数据

In [398]:
from sqlalchemy import create_engine
from sqlalchemy.types import *
df = DataFrame({"班级":["一年级","二年级","三年级","四年级"],
               "男生人数":[25,23,27,30]，
               "女生人数":[19,17,20,20]})

SyntaxError: invalid character '，' (U+FF0C) (1365138720.py, line 4)

In [None]:
from sqlalchemy import create_engine
# engine = create_engine('mysql+pymysql://root:123456'
#                        '@127.0.0.1/students_info')
# df.to_sql('students', engine)

2、通过read_sql()方法向数据库读取数据

In [None]:
# engine = create_engine('mysql+pymysql://root:123456'
#                        '@127.0.0.1/students_info')
# df.read_sql('person_info', engine)

In [None]:
# engine = create_engine('mysql+pymysql://root:123456'
#                        '@127.0.0.1/students_info')
# sql = 'select * from person_info where id > 3;'
# df.read_sql(sql, engine)

3.4 数据排序

3.4.1 按索引排序

In [None]:
df_obj = pd.DataFrame(np.arange(9).reshape(3, 3), index = [4, 3, 5])
df_obj

In [None]:
df_obj.sort_index()

In [None]:
df_obj.sort_index(ascending = False)

3.4.2 按值排序

In [None]:
ser_obj = pd.Series([4, np.NaN, 6, np.nan, -3, 2])
ser_obj

In [None]:
df_obj = pd.DataFrame([[0.4, -0.1, -0.3, 0.0],
                       [0.2, 0.6, -0.1, -0.7],
                       [0.8, 0.6, -0.5, 0.1]])
df_obj

In [None]:
df_obj.sort_values(by = 2)

3.5 算术运算与数据对齐

In [None]:
obj_one = pd.Series(range(10,13), index = range(3))
obj_one

In [None]:
obj_two = pd.Series(range(20,25), index = range(5))
obj_two

In [None]:
obj_one + obj_two

In [None]:
obj_one.add(obj_two ,fill_value = 0)

3.6 统计计算与描述

3.6.1 统计计算

In [None]:
df_obj = pd.DataFrame(np.arange(12).reshape(3, 4), columns = ['a', 'b', 'c', 'd'])
df_obj

In [None]:
df_obj.sum()

In [None]:
df_obj.max()

In [None]:
df_obj.min(axis = 1)

3.6.2 统计描述

In [None]:
df_obj = pd.DataFrame([[12, 6, -11, 19],
                       [-1, 7, 50, 36],
                       [5, 9, 23, 28]])
df_obj

In [None]:
df_obj.describe()

3.7 分层索引操作

3.7.1 创建分层索引

1.通过from_tuples()方法创建MultiIndex类的对象

In [None]:
from pandas import MultiIndex
list_tuples = [('A', 'A1'), ('A', 'A2'), ('B', 'B1'), ('B', 'B2'), ('B', 'B3')]
multi_index = MultiIndex.from_tuples(tuples = list_tuples, names = ['外层索引', '内层索引'])
multi_index

2.通过from_arrays()方法创建MultiIndex类的对象

In [None]:
multi_array = MultiIndex.from_arrays(arrays = [['A', 'B', 'A', 'B', 'B'], ['A1', 'A2', 'B1', 'B2', 'B3']] , names = ['外层索引', '内层索引'])
multi_array

3.通过from_product()方法创建MultiIndex类的对象

In [None]:
numbers = [0, 1, 2]
colors = ['green', 'purple']
multi_product = pd.MultiIndex.from_product(iterables = [numbers, colors], names = ['number', 'color'])
multi_product

3.7.2 创建有分层索引的对象

In [None]:
mulitindex_series = pd.Series([14530, 13829, 12047, 7813, 7568, 6239, 15236, 8291], index = [['河北省','河北省','河北省','河北省','河南省','河南省','河南省','河南省'],['石家庄市','唐山市','邯郸市','秦皇岛市','郑州市','开封市','洛阳市','新乡市']])
mulitindex_series

In [None]:
from pandas import DataFrame, Series
mulitindex_df = DataFrame({'占地面积':[14530, 13829, 12047, 7813, 7568, 6239, 15236, 8291]}, index = [['河北省','河北省','河北省','河北省','河南省','河南省','河南省','河南省'],['石家庄市','唐山市','邯郸市','秦皇岛市','郑州市','开封市','洛阳市','新乡市']])
mulitindex_df

In [None]:
values = np.array([[7,5], [6,6], [3,1], [5,5], [4,5], [5,3]])
df_product = pd.DataFrame(data = values, index = multi_product)
df_product

3.7.3 使用分层索引获取数据

In [None]:
from pandas import Series
ser_obj = Series([50, 60, 40, 94, 63, 101, 200, 56, 45], 
                index = [['小说','小说','小说','散文随笔','散文随笔','散文随笔','传记','传记','传记'] ,
                        ['平凡的世界', '骆驼祥子', '狂人日记', '皮囊', '浮生六记', '自在独行', '曾国藩', '老舍自传', '知行合一王阳明']])
ser_obj

In [None]:
ser_obj['小说']

In [None]:
ser_obj['小说', '平凡的世界']

In [None]:
ser_obj[:, '自在独行']

3.7.4 交换索引层级的顺序

In [None]:
ser_obj.swaplevel()

3.7.5 分层索引排序

In [None]:
df_obj = DataFrame({'word':['a','b','d','e','f','k','d','s','l'],
                   'num':[1,2,4,5,3,2,6,2,3]},
                  index = [['A','A','A','C','C','C','B','B','B'],
                          [1,3,2,3,1,2,4,5,8]])
df_obj

In [None]:
df_obj.sort_index()