# 5.1 了解元数据

![了解元数据](元数据.jpg)

In [1]:
import numpy as np

import pandas as pd
# display.[max_categories, max_columns, max_colwidth, max_info_columns,
# max_info_rows, max_rows, max_seq_items, memory_usage, min_rows, multi_sparse,
# notebook_repr_html, pprint_nest_depth, precision, show_dimensions]
pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns',10)


In [2]:
# Root directory that holds the data files used by this notebook.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it before running elsewhere.
data_source = r"Y:\BaiduNetdiskWorkspace\data_analysis\Python数据分析\data"

In [3]:
# Load the retail CSV from the data directory (the Windows path separator is hard-coded in the literal).
retail_data = pd.read_csv(data_source+"\\Online_Retail_Fake.csv")

In [4]:
# Fill the missing UnitPrice values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that selection can be a
# temporary object, in which case the in-place fill never reaches retail_data
# (pandas warns about this chained-assignment pattern).
retail_data['UnitPrice'] = retail_data['UnitPrice'].fillna(retail_data['UnitPrice'].mean())

In [5]:
# Fill the missing Quantity values with the most frequent value (the mode).
# Series.mode() returns a Series (several values can tie), and fillna() with a
# Series aligns on index labels, so passing it directly would only fill rows
# whose label happens to match a label of the mode result. Take the first mode
# as a scalar so every missing entry receives it.
retail_data['Quantity'] = retail_data['Quantity'].fillna(retail_data['Quantity'].mode().iloc[0])

In [6]:
retail_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010/12/1 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010/12/1 8:26,4.611117,17850.0,United Kingdom
2,536365,84406B,,8,2010/12/1 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010/12/1 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010/12/1 8:26,3.39,17850.0,United Kingdom


In [7]:
# Line total for every row: quantity multiplied element-wise by the unit price.
retail_data['Total_price'] = retail_data['Quantity'].mul(retail_data['UnitPrice'])

In [8]:
retail_data.shape

(541910, 9)

In [9]:
retail_data.size == retail_data.shape[0]*retail_data.shape[1]     # size is the total number of cells (rows * columns)

True

In [10]:
# Number of dimensions of the data
retail_data.ndim

2

In [11]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541910 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540454 non-null  object 
 3   Quantity     541910 non-null  int64  
 4   InvoiceDate  541910 non-null  object 
 5   UnitPrice    541910 non-null  float64
 6   CustomerID   406828 non-null  float64
 7   Country      541909 non-null  object 
 8   Total_price  541910 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 37.2+ MB


# 5.2 数据类型转换

![数据类型转换](数据类型转换.jpg)

对于一些只有几万行和十几万行的小型数据集，数据所占用的内存可能不是特别重要，但是当数据极大时，正确的数据类型就显得非常重要。
要完成这一工作，可以用以下代码：

In [12]:
retail_data.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
Total_price    float64
dtype: object

In [13]:
# Measure the memory used by each column precisely.
mem = retail_data.memory_usage(deep=True)
# deep=True also counts the memory of the objects the columns refer to (e.g. the strings in object columns)

In [14]:
mem

Index               128
InvoiceNo      34149621
StockCode      33645452
Description    45252183
Quantity        4335280
InvoiceDate    39006854
UnitPrice       4335280
CustomerID      4335280
Country        38137522
Total_price     4335280
dtype: int64

In [15]:
# The total can also be shown in mebibytes (bytes / 1024**2), rounded to an integer:
round(mem.sum()/(1024**2))

198

In [16]:
# Knowing how much memory each column takes, adjust the dtypes where it pays off - for example the Country column.
# Check how many distinct values Country has.
print('Unique country: ',retail_data['Country'].nunique())
# With only a few dozen distinct countries, storing the column as a pandas category reduces its memory use.
retail_data["Country"] = retail_data['Country'].astype('category')
# Re-measure so the per-column usage can be compared with `mem` from before the conversion.
new_mem = retail_data.memory_usage(deep=True)
new_mem

Unique country:  38


Index               128
InvoiceNo      34149621
StockCode      33645452
Description    45252183
Quantity        4335280
InvoiceDate    39006854
UnitPrice       4335280
CustomerID      4335280
Country          545458
Total_price     4335280
dtype: int64

In [17]:
round(new_mem['Country']/mem['Country'],3)  # the converted Country column takes only about 1.4% of its original memory

0.014

In [18]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   InvoiceNo    541910 non-null  object  
 1   StockCode    541909 non-null  object  
 2   Description  540454 non-null  object  
 3   Quantity     541910 non-null  int64   
 4   InvoiceDate  541910 non-null  object  
 5   UnitPrice    541910 non-null  float64 
 6   CustomerID   406828 non-null  float64 
 7   Country      541909 non-null  category
 8   Total_price  541910 non-null  float64 
dtypes: category(1), float64(3), int64(1), object(4)
memory usage: 33.6+ MB


In [19]:
print("Unique CustomerID : " , retail_data['CustomerID'].nunique())
# CustomerID is an identifier and should be an integer, not a float.
# Missing IDs are replaced by the placeholder 0 first, because a plain integer
# column cannot hold NaN. The filled column is assigned back instead of using
# fillna(inplace=True) on the column selection, which may act on a temporary
# object and leave retail_data unchanged (the following astype would then fail
# on the remaining NaN values).
retail_data['CustomerID'] = retail_data['CustomerID'].fillna(0).astype('int')
print(retail_data.memory_usage(deep = True))
retail_data.info()
# Memory of the still-textual InvoiceDate column, kept to compare after converting it to datetime.
Invo_mem = retail_data['InvoiceDate'].memory_usage(deep=True)

Unique CustomerID :  4372
Index               128
InvoiceNo      34149621
StockCode      33645452
Description    45252183
Quantity        4335280
InvoiceDate    39006854
UnitPrice       4335280
CustomerID      2167640
Country          545458
Total_price     4335280
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   InvoiceNo    541910 non-null  object  
 1   StockCode    541909 non-null  object  
 2   Description  540454 non-null  object  
 3   Quantity     541910 non-null  int64   
 4   InvoiceDate  541910 non-null  object  
 5   UnitPrice    541910 non-null  float64 
 6   CustomerID   541910 non-null  int32   
 7   Country      541909 non-null  category
 8   Total_price  541910 non-null  float64 
dtypes: category(1), float64(2), int32(1), int64(1), object(4)
memory usage: 31.5+ MB


In [20]:
# InvoiceDate holds timestamps, so convert it from text to a datetime dtype with pd.to_datetime().
retail_data['InvoiceDate'] = pd.to_datetime(retail_data['InvoiceDate'])
retail_data.info()
print(retail_data.memory_usage(deep=True))
# Memory of the converted column, to be compared with Invo_mem measured before the conversion.
new_Invo_mem = retail_data['InvoiceDate'].memory_usage(deep=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541910 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540454 non-null  object        
 3   Quantity     541910 non-null  int64         
 4   InvoiceDate  541910 non-null  datetime64[ns]
 5   UnitPrice    541910 non-null  float64       
 6   CustomerID   541910 non-null  int32         
 7   Country      541909 non-null  category      
 8   Total_price  541910 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(2), int32(1), int64(1), object(3)
memory usage: 31.5+ MB
Index               128
InvoiceNo      34149621
StockCode      33645452
Description    45252183
Quantity        4335280
InvoiceDate     4335280
UnitPrice       4335280
CustomerID      2167640
Country          545458
Total_price     433

In [21]:
print(round(new_Invo_mem/Invo_mem,3)*100,'%')     # after conversion InvoiceDate uses about 11.1% of its original memory

11.1 %


# 5.3 缺失数据与异常数据处理
## 5.3.1 缺失值与重复值

In [22]:
retail_data.isnull().sum()

InvoiceNo         0
StockCode         1
Description    1456
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           1
Total_price       0
dtype: int64

In [23]:
retail_data.isnull()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Total_price
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
541905,False,False,False,False,False,False,False,False,False
541906,False,False,False,False,False,False,False,False,False
541907,False,False,False,False,False,False,False,False,False
541908,False,False,False,False,False,False,False,False,False


In [24]:
# Missing values are not always encoded as NaN - they may appear as 0 or as a
# marker string - so the markers to treat as missing can be declared when the
# file is read.
# Note: this re-reads the raw file, so retail_data no longer carries the dtype
# conversions or the Total_price column created in the earlier cells.
# The path uses the same single-separator form as the first read_csv call
# (the literal previously contained a doubled separator).
retail_data = pd.read_csv(data_source+"\\Online_Retail_Fake.csv",na_values=[0,'Wrong booking'])
retail_data.isnull().sum()

InvoiceNo           0
StockCode           1
Description      1457
Quantity            0
InvoiceDate         0
UnitPrice        2518
CustomerID     135082
Country             1
dtype: int64

In [25]:
retail_data.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
541905    False
541906    False
541907    False
541908    False
541909    False
Length: 541910, dtype: bool

In [26]:
# Show the rows that duplicated() flags as repeats of an earlier row
retail_data[retail_data.duplicated()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
517,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,2010/12/1 11:45,1.25,17908.0,United Kingdom
527,536409,22866,HAND WARMER SCOTTY DOG DESIGN,1,2010/12/1 11:45,2.10,17908.0,United Kingdom
537,536409,22900,SET 2 TEA TOWELS I LOVE LONDON,1,2010/12/1 11:45,2.95,17908.0,United Kingdom
539,536409,22111,SCOTTIE DOG HOT WATER BOTTLE,1,2010/12/1 11:45,4.95,17908.0,United Kingdom
555,536412,22327,ROUND SNACK BOXES SET OF 4 SKULLS,1,2010/12/1 11:49,2.95,17920.0,United Kingdom
...,...,...,...,...,...,...,...,...
541675,581538,22068,BLACK PIRATE TREASURE CHEST,1,2011/12/9 11:34,0.39,14446.0,United Kingdom
541689,581538,23318,BOX OF 6 MINI VINTAGE CRACKERS,1,2011/12/9 11:34,2.49,14446.0,United Kingdom
541692,581538,22992,REVOLVER WOODEN RULER,1,2011/12/9 11:34,1.95,14446.0,United Kingdom
541699,581538,22694,WICKER STAR,1,2011/12/9 11:34,2.10,14446.0,United Kingdom


In [27]:
retail_data[
    (retail_data['InvoiceNo']=='536409') & (retail_data['StockCode']=='21866')
]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
494,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,2010/12/1 11:45,1.25,17908.0,United Kingdom
517,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,2010/12/1 11:45,1.25,17908.0,United Kingdom


## 5.3.2 处理缺失数据
    首先利用pd.DataFrame()函数来构造一个包含缺失数据的DataFrame

In [28]:
# Build a small demo frame that contains missing values:
#   - rows a..e: random integers in [0, 100) for columns c1..c3
#   - column c4: all NaN at first
#   - row f: the values 10..13 across c1..c4
#   - row g: entirely NaN
#   - column c5: entirely NaN
df = pd.DataFrame(np.random.randint(0,100,15).reshape(5,3),
                 index=['a','b','c','d','e'],
                 columns= ['c1','c2','c3'])
df['c4'] = np.nan
df.loc['f'] = np.arange(10,14)
df.loc['g'] = np.nan
df['c5'] = np.nan
# Set the single cell with one .loc[row, column] assignment. The chained form
# df['c4']['a'] = 18 assigns into an intermediate Series, which is not
# guaranteed to write through to df.
df.loc['a','c4'] = 18
df

Unnamed: 0,c1,c2,c3,c4,c5
a,72.0,88.0,57.0,18.0,
b,78.0,73.0,97.0,,
c,71.0,68.0,28.0,,
d,86.0,63.0,77.0,,
e,62.0,53.0,21.0,,
f,10.0,11.0,12.0,13.0,
g,,,,,


In [29]:
df.isnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [30]:
df.notnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,True,True,True,True,False
b,True,True,True,False,False
c,True,True,True,False,False
d,True,True,True,False,False
e,True,True,True,False,False
f,True,True,True,True,False
g,False,False,False,False,False


In [31]:
df.isnull().sum()

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [32]:
# Drop the missing values of column 'c4' and show what remains.
# dropna() returns a new, shorter Series and df itself is left unchanged.
# The earlier dropna(inplace=True) call acted on a temporary Series, so it
# neither changed df nor displayed anything.
df['c4'].dropna()

In [33]:
# Drop the rows in which every value is missing (returns a new frame; df is not modified)
df.dropna(how='all')

Unnamed: 0,c1,c2,c3,c4,c5
a,72.0,88.0,57.0,18.0,
b,78.0,73.0,97.0,,
c,71.0,68.0,28.0,,
d,86.0,63.0,77.0,,
e,62.0,53.0,21.0,,
f,10.0,11.0,12.0,13.0,


In [34]:
# Drop the columns in which every value is missing (axis=1 selects columns; returns a new frame)
df.dropna(axis=1,how='all')

Unnamed: 0,c1,c2,c3,c4
a,72.0,88.0,57.0,18.0
b,78.0,73.0,97.0,
c,71.0,68.0,28.0,
d,86.0,63.0,77.0,
e,62.0,53.0,21.0,
f,10.0,11.0,12.0,13.0
g,,,,


## 5.3.3 Numpy 和Pandas对缺失数据的不同处理方式

In [35]:
# NumPy side of the comparison: a float array that contains two NaN entries
a = np.array([np.nan,1,2,3,np.nan,4])
a

array([nan,  1.,  2.,  3., nan,  4.])

In [36]:
s = pd.Series(a)
s

0    NaN
1    1.0
2    2.0
3    3.0
4    NaN
5    4.0
dtype: float64

In [37]:
a.mean(),s.mean()

(nan, 2.5)

可见，numpy对于要处理的数据，只要有一个nan，就全部当做nan,而pandas则是略过nan，当做不存在

In [38]:
df2 = df.copy()

In [39]:
# Set c1 and c3 of row 'g' to 0 with a single .loc[row, columns] assignment.
# Assigning through df2.loc['g'].c1 writes to the row object returned by .loc,
# which may be a copy, so the change is not guaranteed to reach df2.
df2.loc['g', ['c1', 'c3']] = 0

In [40]:
df2['c4']+1

a    19.0
b     NaN
c     NaN
d     NaN
e     NaN
f    14.0
g     NaN
Name: c4, dtype: float64

In [41]:
df2['c4']

a    18.0
b     NaN
c     NaN
d     NaN
e     NaN
f    13.0
g     NaN
Name: c4, dtype: float64

In [42]:
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,72.0,88.0,57.0,18.0,
b,78.0,73.0,97.0,,
c,71.0,68.0,28.0,,
d,86.0,63.0,77.0,,
e,62.0,53.0,21.0,,
f,10.0,11.0,12.0,13.0,
g,0.0,,0.0,,


从上述代码可以看出，pandas对数据进行运算时只会对非NaN的值进行操作，NaN参与运算的结果仍然是NaN。

## 5.3.4 填充缺失值
    在缺失值较少又不影响数据分析时，一般选择把它们丢弃；有时则考虑填充空值，比如使用fillna()函数。

In [43]:
# Fill values keyed by row label: 'a' -> 1 and 'b' -> 2.
fv = pd.Series({'a': 1, 'b': 2})
fv

a    1
b    2
dtype: int64

In [44]:
print(df['c5'].fillna(fv))             # fill the gaps from fv; values are matched by index label, so only 'a' and 'b' are filled

a    1.0
b    2.0
c    NaN
d    NaN
e    NaN
f    NaN
g    NaN
Name: c5, dtype: float64


除了以上几点，还可以用interpolate()插值函数进行填充。例如：


In [45]:
df['c4'].interpolate()        # default interpolation: the index values are not taken into account

a    18.0
b    17.0
c    16.0
d    15.0
e    14.0
f    13.0
g    13.0
Name: c4, dtype: float64

In [46]:
# Demo series with gaps; the index is the explicit list of positions 0..10.
s = pd.Series([1,2,3,4,5,np.nan,20,30,np.nan,50,np.nan])
s.index = s.index.tolist()

In [47]:
s_new = s.interpolate()
s_new.values

array([ 1. ,  2. ,  3. ,  4. ,  5. , 12.5, 20. , 30. , 40. , 50. , 50. ])

In [48]:
# Interpolate using the index values as the x-coordinates instead of treating the points as evenly spaced
s_new2 = s.interpolate(method='index')
s_new2.values

array([ 1. ,  2. ,  3. ,  4. ,  5. , 12.5, 20. , 30. , 40. , 50. , 50. ])

 如果index是datetime的话，还可以考虑用method='time'进行插值补全。

# 5.4 处理重复数据

In [49]:
# Sample frame with repeated (a, b) pairs, used to demonstrate duplicate handling.
data = pd.DataFrame({
    'a': list('xxxyyyy'),
    'b': [1, 2, 1, 3, 3, 4, 4],
})
data

Unnamed: 0,a,b
0,x,1
1,x,2
2,x,1
3,y,3
4,y,3
5,y,4
6,y,4


In [50]:
data.duplicated(subset='a'),data.duplicated(subset='b')  # duplicate flags judged on column 'a' alone and on column 'b' alone

(0    False
 1     True
 2     True
 3    False
 4     True
 5     True
 6     True
 dtype: bool,
 0    False
 1    False
 2     True
 3    False
 4     True
 5    False
 6     True
 dtype: bool)

In [51]:
data.duplicated()                     # duplicate flags judged on columns 'a' and 'b' together

0    False
1    False
2     True
3    False
4     True
5    False
6     True
dtype: bool

In [52]:
data.drop_duplicates()

Unnamed: 0,a,b
0,x,1
1,x,2
3,y,3
5,y,4


In [53]:
# drop_duplicates() can also be told to compare only some of the columns.
# Add a column 'c' with a distinct value per row so that whole rows are no longer duplicates.
data['c'] = np.arange(len(data['b']))

In [54]:
data.drop_duplicates(['a','b'])   # compares only 'a' and 'b', so the same rows are kept as in the earlier drop_duplicates() call

Unnamed: 0,a,b,c
0,x,1,0
1,x,2,1
3,y,3,3
5,y,4,5


# 5.5 异常值

除了重复值和缺失值以外，我们还有可能遇到异常值。异常值的检测一方面来自常识，一方面来自数据字典给出的取值范围。此外，落在均值两个标准差以外的数据也需要特别观察，代码如下。

In [55]:
# 200 draws from a standard normal distribution, used as sample data for outlier detection.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame({
    'Data': np.random.normal(size=200)
})
df.head()

Unnamed: 0,Data
0,0.706636
1,0.271642
2,0.559154
3,-1.261083
4,-0.641297


In [56]:
# Keep the rows whose distance from the mean is at most two standard deviations
df_condition = (np.abs(df['Data']-df['Data'].mean()))<=2*df['Data'].std()
df[df_condition]

Unnamed: 0,Data
0,0.706636
1,0.271642
2,0.559154
3,-1.261083
4,-0.641297
...,...
195,0.127060
196,1.123162
197,1.085555
198,0.353972


In [57]:
# The outliers are the rows that lie more than two standard deviations from the mean.
# The comparison is strict (>) so that this mask is the exact complement of the
# "<= 2 std" in-range mask; with >= a value exactly on the boundary would be
# counted both as normal and as an outlier.
df_exception = (np.abs(df['Data']-df['Data'].mean()))>2*df['Data'].std()
df[df_exception]

Unnamed: 0,Data
75,2.116474
111,-2.290828
113,-2.095221
122,-2.530811
124,3.207717
161,2.400229
180,2.186523
193,-2.425486


In [58]:
# Replace the flagged outliers with the column mean (computed before the replacement).
# .loc[mask, 'Data'] targets only the Data column of the flagged rows; assigning
# to df[mask] overwrites every column of those rows, which is only harmless
# while the frame has a single column.
df.loc[df_exception, 'Data'] = df['Data'].mean()
df[df_exception]

Unnamed: 0,Data
75,-0.068613
111,-0.068613
113,-0.068613
122,-0.068613
124,-0.068613
161,-0.068613
180,-0.068613
193,-0.068613


# 5.6 描述性统计
    假设所有的缺失数据、重复数据和异常值都已经处理完毕，接下来的工作就是对数据进行描述性统计。通过描述性统计可以对数据分布有更深入的了解。

In [59]:
# describe() reports, by default, the count, mean, standard deviation, minimum, maximum and the three quartiles;
# .T transposes the result so that each column of the data becomes one row.
retail_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,541910.0,9.552237,218.080957,-80995.0,1.0,3.0,10.0,80995.0
UnitPrice,539392.0,4.632617,96.984836,-11062.06,1.25,2.08,4.13,38970.0
CustomerID,406828.0,15287.695176,1713.600085,12346.0,13953.0,15152.0,16791.0,18287.0


In [60]:
retail_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010/12/1 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010/12/1 8:26,,17850.0,United Kingdom
2,536365,84406B,,8,2010/12/1 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010/12/1 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010/12/1 8:26,3.39,17850.0,United Kingdom


In [61]:
# Insert TotalPrice (Quantity * UnitPrice) immediately to the right of UnitPrice.
total_loc = retail_data.columns.get_loc('UnitPrice')+1
# DataFrame.insert raises ValueError when the column already exists, so guard
# the call to keep this cell safe to run more than once.
if 'TotalPrice' not in retail_data.columns:
    retail_data.insert(total_loc,'TotalPrice',retail_data['Quantity']*retail_data['UnitPrice'])

In [62]:
retail_data.isnull().sum()

InvoiceNo           0
StockCode           1
Description      1457
Quantity            0
InvoiceDate         0
UnitPrice        2518
TotalPrice       2518
CustomerID     135082
Country             1
dtype: int64

In [63]:
# Mark missing stock codes with the placeholder -1.
# Assign the result back: fillna(inplace=True) on a column selection may act on
# a temporary object and leave retail_data unchanged.
# NOTE(review): -1 is an integer placed in a text column; confirm a mixed-type column is acceptable downstream.
retail_data['StockCode'] = retail_data['StockCode'].fillna(-1)

In [73]:
# Mark missing descriptions with the placeholder -1.
# The redundant method=None argument is dropped and the result is assigned
# back instead of relying on fillna(inplace=True) on a column selection, which
# may act on a temporary object and leave retail_data unchanged.
retail_data['Description'] = retail_data['Description'].fillna(-1)

In [74]:
# Fill the missing unit prices with the column mean; assign the result back so
# the fill is guaranteed to reach retail_data (an in-place fill on a column
# selection may act on a temporary object).
retail_data['UnitPrice'] = retail_data['UnitPrice'].fillna(retail_data['UnitPrice'].mean())

In [75]:
# Fill the missing totals with UnitPrice * Quantity of the same row (fillna
# aligns the replacement Series on the row index). The result is assigned back
# so the fill is guaranteed to reach retail_data.
retail_data['TotalPrice'] = retail_data['TotalPrice'].fillna(retail_data['UnitPrice']*retail_data['Quantity'])

In [76]:
# Mark missing customer IDs with the placeholder -1; assign the result back so
# the fill is guaranteed to reach retail_data (an in-place fill on a column
# selection may act on a temporary object).
retail_data['CustomerID'] = retail_data['CustomerID'].fillna(-1)

In [77]:
retail_data.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
TotalPrice     0
CustomerID     0
Country        0
dtype: int64

In [78]:
# Label missing countries with a placeholder string; assign the result back so
# the fill is guaranteed to reach retail_data (an in-place fill on a column
# selection may act on a temporary object).
# NOTE(review): 'unknow' looks like a typo for 'unknown'; the value is left unchanged here
# in case other code matches on it.
retail_data['Country'] = retail_data['Country'].fillna('unknow')

In [79]:
retail_data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,TotalPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010/12/1 8:26,2.550000,15.3000,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010/12/1 8:26,4.632617,27.7957,17850.0,United Kingdom
2,536365,84406B,-1,8,2010/12/1 8:26,2.750000,22.0000,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010/12/1 8:26,3.390000,20.3400,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010/12/1 8:26,3.390000,20.3400,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...,...
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011/12/9 12:50,2.100000,12.6000,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011/12/9 12:50,4.150000,16.6000,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011/12/9 12:50,4.150000,16.6000,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011/12/9 12:50,4.950000,14.8500,12680.0,France


In [80]:
retail_data.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
TotalPrice     0
CustomerID     0
Country        0
dtype: int64

In [88]:
des = retail_data.describe().T
des.index

Index(['Quantity', 'UnitPrice', 'TotalPrice', 'CustomerID'], dtype='object')

In [91]:
# CustomerID is an identifier, so its summary statistics are not meaningful - remove that row.
# errors='ignore' keeps the cell safe to re-run once the row is already gone
# (a plain drop raises KeyError then); axis=0 is implied by index=.
des = des.drop(index=['CustomerID'], errors='ignore')

In [92]:
des

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,541910.0,9.552237,218.080957,-80995.0,1.0,3.0,10.0,80995.0
UnitPrice,541910.0,4.632617,96.759251,-11062.06,1.25,2.1,4.13,38970.0
TotalPrice,541910.0,16.839256,415.783603,-168469.6,3.48,9.78,17.4,168469.6


In [94]:
# Sometimes we also want to inspect the extreme rows, e.g. the 20 largest and
# the 20 smallest values of TotalPrice.
# A notebook cell only renders its last expression automatically, so the first
# table is shown explicitly with display() (provided by the notebook session);
# otherwise the nlargest result would be computed and silently discarded.
display(retail_data.nlargest(20,'TotalPrice'))
retail_data.nsmallest(20,'TotalPrice')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,TotalPrice,CustomerID,Country
540422,C581484,23843,"PAPER CRAFT , LITTLE BIRDIE",-80995,2011/12/9 9:27,2.080000,-168469.600000,16446.0,United Kingdom
61624,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215,2011/1/18 10:17,1.040000,-77183.600000,12346.0,United Kingdom
225529,556690,23005,printing smudges/thrown away,-9600,2011/6/14 10:37,4.632617,-44473.119376,-1.0,United Kingdom
225530,556691,23005,printing smudges/thrown away,-9600,2011/6/14 10:37,4.632617,-44473.119376,-1.0,United Kingdom
225528,556687,23003,Printing smudges/thrown away,-9058,2011/6/14 10:36,4.632617,-41962.241178,-1.0,United Kingdom
...,...,...,...,...,...,...,...,...,...
15016,C537630,AMAZONFEE,AMAZON FEE,-1,2010/12/7 15:04,13541.330000,-13541.330000,-1.0,United Kingdom
16356,C537651,AMAZONFEE,AMAZON FEE,-1,2010/12/7 15:49,13541.330000,-13541.330000,-1.0,United Kingdom
16232,C537644,AMAZONFEE,AMAZON FEE,-1,2010/12/7 15:34,13474.790000,-13474.790000,-1.0,United Kingdom
375429,569466,23270,incorrect stock entry.,-2880,2011/10/4 11:42,4.632617,-13341.935813,-1.0,United Kingdom


In [100]:
# Alternative to nlargest: sort the rows by TotalPrice in descending order and keep the first five.
retail_data.sort_values('TotalPrice',ascending=False).head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,TotalPrice,CustomerID,Country
540421,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",80995,2011/12/9 9:15,2.08,168469.6,16446.0,United Kingdom
61619,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011/1/18 10:01,1.04,77183.6,12346.0,United Kingdom
502122,578841,84826,ASSTD DESIGN 3D PAPER STICKERS,12540,2011/11/25 15:57,4.632617,58093.012185,13256.0,United Kingdom
222680,556444,22502,PICNIC BASKET WICKER 60 PIECES,60,2011/6/10 15:28,649.5,38970.0,15098.0,United Kingdom
74614,542504,37413,-1,5568,2011/1/28 12:03,4.632617,25794.409238,-1.0,United Kingdom
