In [1]:
import numpy as np
import pandas as pd

## 建立Series
1. Series支持 index + data
2. 两种方式：
    1. python dic
    2. python list [data] + python list [index]

In [2]:
s1 = pd.Series(data=np.random.normal(0,1,10))
s1

0   -0.056821
1   -0.051837
2   -1.133507
3   -0.868753
4    0.874161
5    0.106148
6    0.816449
7   -1.171192
8    0.603350
9   -0.072381
dtype: float64

1. Series.array -> ExtensionArray
2. Series.to_numpy() -> ndarray
3. Series.to_list() -> python list

In [3]:
print(type(s1.array))
print(type(s1.to_numpy()))
print(type(s1.to_list()))

<class 'pandas.core.arrays.numpy_.PandasArray'>
<class 'numpy.ndarray'>
<class 'list'>


In [4]:
data = np.zeros((2,),dtype=[('A','i4'),('B','f4'),('C','a10')])
data[:] = [(1,2.0,"Hello"),(2,3.0,"World")]
pd.DataFrame(data=data,index=['first','second'],columns=['C','B','A'])

Unnamed: 0,C,B,A
first,b'Hello',2.0,1
second,b'World',3.0,2


## 建立DataFrame
1. from_dict
2. from_records

In [5]:
pd.DataFrame.from_dict(
    dict([('A',[1,2,3]),('B',[4,5,6])]),
    orient='index',
    columns=['one','two','three']
)

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [6]:
df2 = pd.DataFrame.from_dict(
    {'A':{1,2,3},'B':{2,3,4},'C':{3,4,5}},
    orient='index',
    columns=['one','two','three']
)

## DataFrame操作
1. 添加 insert()
2. 删除 pop()
3. 构造 assign() - assign函数会返回一个df的复制，不会值就修改df本身
4. 筛选 query()
5. 复制 copy()

In [7]:
df2['one']

A    1
B    2
C    3
Name: one, dtype: int64

In [8]:
df2['mul'] = df2['one'] * df2['two'] * df2['three']
df2

Unnamed: 0,one,two,three,mul
A,1,2,3,6
B,2,3,4,24
C,3,4,5,60


In [9]:
df2['flag'] = df2['mul'] > 20
df2

Unnamed: 0,one,two,three,mul,flag
A,1,2,3,6,False
B,2,3,4,24,True
C,3,4,5,60,True


In [10]:
df2.pop('mul')
df2

Unnamed: 0,one,two,three,flag
A,1,2,3,False
B,2,3,4,True
C,3,4,5,True


In [11]:
df2.insert(3,'new_mul',df2['one'][:2]*df2['two'][:2]*df2['three'][:2])
df2

Unnamed: 0,one,two,three,new_mul,flag
A,1,2,3,6.0,False
B,2,3,4,24.0,True
C,3,4,5,,True


## indexing
1. Series -col df[col]
2. Series -col df.loc[label]
3. Series -row df.iloc[loc]
4. Dataframe df[start:end]
5. Dataframe df[boolean_vector]

1. pandas 所有的行和列都可以是Dataframe的属性

## slicing
1. 默认为row
2. df.loc[label,column]

## 遍历并转换为json形式

In [12]:
data = [{'customer_id': 1, 'first_name':'John', 'last_name':'Doe', 'amount':100, 'sub_amount':50,'total': 150,'product':'tool box'},
        {'customer_id': 2, 'first_name':'John', 'last_name':'Doe', 'amount':50, 'sub_amount':50,'total': 100,'product':'light'},        
        {'customer_id': 3, 'first_name':'Jane', 'last_name':'Doe', 'amount':200, 'sub_amount':50,'total': 250,'product':'iron box'},
        {'customer_id': 4, 'first_name':'Jane', 'last_name':'Doe', 'amount':50, 'sub_amount':50,'total': 100,'product':'led'}    
        ]
df_data = pd.DataFrame(data)
df_data

Unnamed: 0,customer_id,first_name,last_name,amount,sub_amount,total,product
0,1,John,Doe,100,50,150,tool box
1,2,John,Doe,50,50,100,light
2,3,Jane,Doe,200,50,250,iron box
3,4,Jane,Doe,50,50,100,led


In [13]:
clients = {
           client['customer_id']:{**client} 
           for _,client in df_data.iterrows()
          }
clients

{1: {'customer_id': 1,
  'first_name': 'John',
  'last_name': 'Doe',
  'amount': 100,
  'sub_amount': 50,
  'total': 150,
  'product': 'tool box'},
 2: {'customer_id': 2,
  'first_name': 'John',
  'last_name': 'Doe',
  'amount': 50,
  'sub_amount': 50,
  'total': 100,
  'product': 'light'},
 3: {'customer_id': 3,
  'first_name': 'Jane',
  'last_name': 'Doe',
  'amount': 200,
  'sub_amount': 50,
  'total': 250,
  'product': 'iron box'},
 4: {'customer_id': 4,
  'first_name': 'Jane',
  'last_name': 'Doe',
  'amount': 50,
  'sub_amount': 50,
  'total': 100,
  'product': 'led'}}

In [20]:
from sklearn import datasets

def funs(data,pos):
    res_list = []
    for item in data:
        if item == pos:
            res_list.append(1)
        else:
            res_list.append(0)
    return res_list

# iris是一个dict结构，并且数据本身是numpy矩阵
iris = datasets.load_iris()
print(iris.keys())
print(iris['data'].shape)
print(iris['target'].shape)
iris_pd = pd.DataFrame(iris['data'], columns=iris['feature_names'])
print(iris['target_names'])
for index,name in enumerate(iris['target_names']):
    iris_pd = iris_pd.assign(name = funs(data=iris['target'],pos=index))
iris_pd

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
(150, 4)
(150,)
['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),name
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1
