### pandas的数据结构

In [1]:
import numpy as np
import pandas as pd # Pandas基于Numpy，可以看做是Numpy的升级

ModuleNotFoundError: No module named 'pandas'

#### Series

In [None]:
# A Series is a one-dimensional array that, unlike a plain NumPy array, also carries an index.

l = np.array([1,2,3,6,9])

# No index is passed, so pandas assigns the default integer index 0..n-1.
s = pd.Series(l)
display(l,s)

In [None]:
# Same values as above, but labeled with an explicit index A..E.
si = pd.Series(data = l,index=list("ABCDE"))
display(si)

In [None]:
# Built from a dict: the keys become the index labels, the values become the data.
s1 = pd.Series(data = {'A':1,'B':2,'C':3,'D':4,'E':5})
display(s1)

#### DataFrame

In [None]:
# A Series is one-dimensional and offers comparatively few features.
# A DataFrame is two-dimensional: several Series sharing one index form a DataFrame.
# Here: a 5x5 frame of random integers in [0, 150] with row labels A..E and column labels a..e.
df1 = pd.DataFrame(data = np.random.randint(0,151,size = (5,5)),index=list("ABCDE"),columns=list("abcde"))
display(df1)

In [None]:
# When a dict is passed, its keys become the column labels and the row index starts at 0.
df2 = pd.DataFrame(data = {"A":[1,2,3],"B":[4,5,6],"C":[7,8,9]})
display(df2)

### 数据查看

In [None]:
# 100 rows of random scores in [0, 150] for three subjects; used by the inspection cells below.
df = pd.DataFrame(data = np.random.randint(0,151,size = (100,3)),columns=["Math","English","Python"])
display(df)

In [None]:
# Shape of the frame as a (rows, columns) tuple
df.shape

In [None]:
# Show the first n rows; n defaults to 5
df.head()

In [None]:
# Show the last n rows; n defaults to 5
df.tail()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Print a detailed summary of the frame (index, columns, non-null counts, dtypes, memory usage)
df.info()

In [None]:
# Descriptive statistics per column: count, mean, standard deviation, minimum,
# first quartile (25%), median (50%), third quartile (75%), maximum
df.describe()

In [None]:
# The underlying values, returned as a NumPy array
df.values

In [None]:
# Column index (the column labels)
df.columns

In [None]:
# Row index (the row labels)
df.index

### 数据输入与输出

#### CSV 

In [None]:
# Frame used by the CSV / Excel / HDF5 round-trip cells:
# 100 rows of random integer scores in [0, 150] for three subjects.
df = pd.DataFrame(
    data=np.random.randint(0, 151, size=(100, 3)),
    columns=["Python", "Math", "En"],
)
df

In [None]:
# Write the frame to a CSV file.
# NOTE(review): the target is an absolute, machine-specific path; it only works where this directory exists.
df.to_csv("C:/Users/dreed/Desktop/pandas_data.csv",
          sep = ",", # field separator written between values
          index = True, # write the row index as the first column
          header = True # write the column labels as the first line
         )

In [None]:
# Read the CSV file back into a DataFrame.
pd.read_csv("C:/Users/dreed/Desktop/pandas_data.csv",
            index_col=0, # use the first column as the row index
            header=0 # use the first line as the column labels
         
)

#### Excel

In [None]:
# Export to an Excel file. Writing .xlsx needs the openpyxl package.
# The original note also mentioned xlwt for .xls; pandas 2.0 removed the xlwt writer,
# so legacy .xls output is no longer available there.
df.to_excel("C:/Users/dreed/Desktop/pandas_data.xlsx")

In [None]:
# Read the Excel file back into a DataFrame.
pd.read_excel("C:/Users/dreed/Desktop/pandas_data.xlsx",
    index_col = 0, # use the first column as the row index
    header = 0 # use the first row as the column labels
             )

#### HDF5

In [None]:
# Export to HDF5; this needs the optional PyTables package (imported as `tables`).
# `key` names the group inside the HDF5 file under which the frame is stored.
# to_hdf requires it, and calling without it raises TypeError.
df.to_hdf("C:/Users/dreed/Desktop/pandas_data.df5", key="df")

#### SQL

### 数据选取

#### 获取数据 

In [None]:
# Frame used by the selection demos: 26 rows labeled A..Z, three score columns,
# random integers in [0, 150].
df = pd.DataFrame(
    data=np.random.randint(0, 151, size=(26, 3)),
    index=list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
    columns=["Python", "Java", "Groovy"],
)
df

In [None]:
# Attribute-style access to a single column; the result is a Series.
df.Python

In [None]:
# Key-style access to the same column; equivalent to df.Python.
df["Python"]

In [None]:
# A slice inside [] selects rows: every second row, starting at label 'A'.
df['A'::2]

In [None]:
# A list of labels inside [] selects several columns; the result is a DataFrame.
df[['Python',"Java"]]

#### 标签获取

In [None]:
# Label-based selection: every second row from 'A' on, every second column from 'Python' on.
df.loc['A'::2,'Python'::2]

In [None]:
# Label-based selection with explicit lists: rows A, B, C and columns Python, Java.
df.loc[list("ABC"),['Python','Java']]

#### 位置获取

In [None]:
# Position-based selection: every fifth row from position 0, all columns from position 1 on.
df.iloc[0::5,1:]

#### boolean索引

In [None]:
# Boolean mask: rows whose Python score is above 80.
cond = df.Python > 80
df[cond]

In [None]:
# Boolean mask: rows whose mean across the columns (axis=1) is above 75.
cond = df.mean(axis = 1) > 75
df[cond]

In [None]:
# Two conditions combined element-wise with &; each comparison needs its own parentheses.
cond = (df.Python > 80) & (df.Java > 100)
df[cond]

In [None]:
# Boolean mask built from the index: rows whose label is one of X, Y, Z, M, N, P, Q.
cond = df.index.isin(list("XYZMNPQ"))
df[cond]

#### 赋值操作

In [None]:
# Overwrite a single cell: row "A", column "Python".
# One .loc call writes into df itself. The chained form df["Python"]["A"] = ... assigns
# through an intermediate Series, which raises SettingWithCopyWarning and leaves df
# unchanged when pandas Copy-on-Write is active.
df.loc["A","Python"] = 150
df

In [None]:
# Add a new column: assigning to a label that does not exist yet creates it.
# The array has 26 values, one per row of df.
df['Scala'] = np.random.randint(0,151,size = 26)
df

In [None]:
# Set the Groovy value of rows A, B and C to 100; the scalar is broadcast to all selected cells.
df.loc[list("ABC"),"Groovy"] = 100
df

In [None]:
# Frame-wide boolean mask: every cell below 60 is raised to 60.
cond = df < 60
df[cond] = 60
df

### 数据集成

#### concat数据串联

In [None]:
# np.concatenate NumPy数据串联
columns = ["Python","Java","Groovy"]
# Three small frames of random integers in [0, 150] for the concat demos.
# df1 and df2 share the same columns but have different row labels.
df1 = pd.DataFrame(
    np.random.randint(0, 151, size=(5, 3)),
    index=list("ABCDE"),
    columns=columns,
)
df2 = pd.DataFrame(
    np.random.randint(0, 151, size=(5, 3)),
    index=list("MNXYZ"),
    columns=columns,
)
# df3 differs from df1 in both its row labels and its columns.
df3 = pd.DataFrame(
    np.random.randint(0, 151, size=(5, 2)),
    index=list("PQIJK"),
    columns=["Shell", "Scala"],
)
display(df1, df2, df3)

In [None]:
# axis=0 stacks the frames vertically: the rows of df2 follow the rows of df1.
pd.concat([df1,df2],axis=0)

In [None]:
# axis=1 places the frames side by side, aligned on the row index;
# row labels present in only one frame get NaN in the other frame's columns.
pd.concat([df1,df3],axis=1)

#### 数据插入

In [None]:
# Insert a column at position 1. insert() modifies df1 in place, so re-running this cell
# raises ValueError because the "C++" column already exists.
df1.insert(loc = 1,column = "C++",value = np.random.randint(0,151,size = 5))
df1

In [None]:
# Append the rows of df2 below those of df1.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement. Like append,
# it returns a new frame, and columns missing from one input are filled with NaN.
pd.concat([df1,df2])

#### JOIN SQL风格合并

In [None]:
# Three small tables for the merge demos.
# df1 and df2 share the key column "name"; df3 holds the same kind of key under "名字".
df1 = pd.DataFrame(
    data={
        "name": ["张三", "李四", "王五", "赵六"],
        "height": [175, 172, 169, 183],
    }
)
df2 = pd.DataFrame(
    data={
        "name": ["张三", "李四", "王五", "安康"],
        "weight": [150, 132, 90, 120],
    }
)
df3 = pd.DataFrame(
    data={
        "名字": ["张三", "李四", "王五", "安康"],
        "salary": [12000, 18000, 8000, 30000],
    }
)
display(df1, df2, df3)

In [None]:
# Merge on the attribute the two frames have in common.
# df1 and df2 share the column: name
# This corresponds to a database join on a common key.
pd.merge(df1,df2) 

In [None]:
# `how` selects the join type: left, right, outer, inner or cross; the default is inner.
pd.merge(df1,df2,how="outer") 

In [None]:
# Name the join key explicitly when the key columns are called differently on each side.
pd.merge(df2,df3,left_on='name',right_on='名字')

In [None]:
# Eight rows labeled A..H with random integer scores in [0, 150] for three subjects.
df4 = pd.DataFrame(
    data=np.random.randint(0, 151, size=(8, 3)),
    index=list("ABCDEFGH"),
    columns=["Python", "Math", "Shell"],
)
df4

In [None]:
# Row-wise mean of the columns, rounded to two decimals.
score_mean = df4.mean(axis=1).round(2)

# loc = df4.shape[1] is the current column count, so the new column is inserted last.
# insert() modifies df4 in place; re-running the cell raises ValueError (column already exists).
df4.insert(loc = df4.shape[1],column="avg_score",value=score_mean)
df4

### 数据清洗

In [None]:
# Sample data for the cleaning demos: it contains a repeated row and two missing
# colors (one None, one NaN).
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df = pd.DataFrame(
    data={
        "color": ["red","green","blue","blue","red",None,np.nan,"green","blue"],
        "price": [18,20,22,22,30,15,15,43,57],
    }
)
df

In [None]:
# Drop duplicate rows; NaN and None are treated as the same missing value.
# Returns a new frame, df itself is not modified.
df.drop_duplicates()

In [None]:
# Drop rows that contain missing values; returns a new frame.
df.dropna()

In [None]:
# Drop rows by index label; returns a new frame.
df.drop(index=[0,2,4,6,8])

In [None]:
# Drop a column by label; returns a new frame.
df.drop(columns='price')

In [None]:
# filter() selects by label, not by value: keep only the listed column labels (axis=1).
df.filter(items=['color'],axis=1)

In [None]:
# Add a "size" column; the scalar is broadcast to every row.
df['size'] = 1024 # broadcasting
df

In [None]:
# Substring match on labels: keep the columns whose label contains "i".
df.filter(like='i')

In [None]:
# Regular-expression match on labels: keep the columns whose label ends in "r".
df.filter(regex="r$")

In [None]:
# Outlier filtering on a plain NumPy array of 200 random integers in [0, 999].
a = np.random.randint(0,1000,size = 200)
display(a)
# Keep only the values inside the accepted range [100, 800].
cond = (a <= 800) & (a >= 100)
a[cond]

In [None]:
# 10000 samples from the standard normal distribution (mean 0, standard deviation 1).
b = np.random.randn(10000)
b

In [None]:
# Select the outliers: values more than 3 standard deviations (3 * 1) away from the mean 0.
cond = np.abs(b) > 3 * 1
b[cond]

### 数据转换

#### 轴和元素转换

In [None]:
# Frame used by the transformation demos: 10 rows labeled A..J, three columns,
# random integers in [1, 9].
df = pd.DataFrame(
    data=np.random.randint(1, 10, size=(10, 3)),
    columns=["Python", "Java", "Groovy"],
    index=list("ABCDEFGHIJ"),
)
df

In [None]:
# Rename row and column labels through old -> new mappings; labels not listed stay as they are.
# Returns a new frame.
df.rename(index = {'A':'X','B':'Y','C':'Z'},columns={'Python':'人工智能','Java':'大数据'})

In [None]:
# Replace the value 9 with 10 wherever it occurs in the frame; returns a new frame.
df.replace({9:10})

In [None]:
# Column-scoped replacement: only in column Groovy, the value 3 becomes -3.
df.replace({'Groovy':3},-3)

#### map映射元素

In [None]:
# map works on a single column, i.e. a Series. Values without an entry in the mapping become NaN.
df['Python'].map({9:90,3:30})

In [2]:
# map also accepts a function, applied to each element: square every Java value.
df['Java'].map(lambda x:x ** 2)

NameError: name 'df' is not defined

In [None]:
def convert(x:int) -> int:
    """Return ``x`` raised to the third power."""
    return pow(x, 3)
# map with a named function: convert is called once per element of the Groovy column.
df.Groovy.map(convert)

#### apply映射元素转变

In [None]:
# apply works on both Series and DataFrame.
# Here it is applied to a single column.
df.Python.apply(lambda x : x * 2)

In [None]:
# apply on a whole DataFrame: with the default axis the function receives each column as a Series.
df.apply(lambda x : x * 2)

#### transform元素转变

In [None]:
# transform is used much like apply.
df.transform(lambda x : x * 3)

#### 重排、随机抽样、哑变量

In [None]:
# A random permutation of the integers 0..9.
index = np.random.permutation(10)
index

In [None]:
# Reorder: take() picks rows by position, so the permuted positions shuffle the rows.
df.take(index)

In [None]:
# Random sampling: shuffle the row positions, then keep three of the shuffled rows.
# The permutation stored in `index` is the one actually used; previously it was computed
# and then ignored in favour of a second, separate permutation.
index = np.random.permutation(10)

df.take(index).tail(3)

In [None]:
# 哑变量数据准备
# One categorical column "key" with the values a, b and c.
df = pd.DataFrame({'key': list('ababcbc')})
df

In [None]:
# Dummy variables (one-hot encoding). With an empty prefix and separator the
# indicator columns are named after the category values alone.
pd.get_dummies(df,prefix='',prefix_sep='')

In [None]:
# Two categorical columns: A holds a/b/c, B holds x/y/z.
df = pd.DataFrame(
    data={
        "A": list('ababcbc'),
        "B": list('xyxyzyz'),
    }
)
df

In [None]:
# With the default prefix each indicator column is named <column>_<value>, e.g. A_a, B_x.
pd.get_dummies(df)

### 数据重塑

In [None]:
# Show the current frame for comparison with its transpose.
df

In [3]:
# Transpose: rows become columns and columns become rows.
df.T

NameError: name 'df' is not defined

In [None]:
# 30 rows under a two-level row index: the outer level has ten letters (A..K without F),
# the inner level has X, Y, Z. Values are random integers in [0, 9].
df2 = pd.DataFrame(
    data=np.random.randint(0, 10, size=(30, 3)),
    columns=['Python', 'Java', 'Groovy'],
    index=pd.MultiIndex.from_product([list("ABCDEGHIJK"), list("XYZ")]),
)
df2

In [None]:
# unstack moves a row-index level into the columns. `level` picks which one; the default -1
# means the last (innermost) level, as levels are counted from the outside in.
df2.unstack(level = 1)

In [None]:
# stack moves the column labels into the row index.
df2.stack()

In [None]:
# Unstack one row level into the columns, then stack column level 0 back into the rows;
# shown for both row levels so the two results can be compared.
display(df2.unstack(level=1).stack(level = 0),df2.unstack(level=0).stack(level = 0))

In [None]:
# Mean of each column per outer (level 0) row label.
# The `level` argument of DataFrame.mean was removed in pandas 2.0; grouping on the index
# level is the documented replacement.
# NOTE(review): the original call also passed axis=1. Confirm that a per-label column mean
# is what is wanted here rather than a row-wise mean (df2.mean(axis=1)).
df2.groupby(level=0).mean()