# Pandas

In [2]:
from pandas import Series, DataFrame
import pandas as pd  # import the pandas library under the conventional alias pd

## pandas.Series
>`Series`是一种类似于一维数组的对象，是由一组数据(各种*`NumPy`*数据类型)以及一组与之相关的数据标签(即索引)组成。仅由一组数据也可产生简单的`Series`对象。

*Pandas* 系列可以使用以下构造函数创建 -

    pandas.Series( data, index, dtype, copy)

Python构造函数的参数如下 -

| 编号 | 参数  | 描述                                                         |
| :---: | :--------- | :---------------------------------------------------------- |
| 1    | `data`  | 数据采取各种形式，如：`ndarray`，`list`，`constants`               |
| 2    | `index`| 索引值必须是可散列的（hashable），且与数据的长度相同；允许出现重复值。如果没有传递索引，默认为`np.arange(n)`。 |
| 3    | `dtype` | `dtype`用于数据类型。如果没有，将推断数据类型                  |
| 4    | `copy`  | 是否复制输入数据，默认为`False`。|

可以使用各种输入创建一个系列，如 
+ 列表等可迭代对象
+ ndarray数组对象
+ 字典对象
+ 标量值或常数 

### 从ndarray创建一个系列
如果数据是`ndarray`，则传递的索引必须具有相同的长度。 

如果没有传递索引值，那么默认的索引将是`range(n)`，其中`n`是数组长度，即`[0, 1, 2, …, len(array) - 1]`。

In [3]:
obj = Series([1, -2, 3, -4])  # 这里没有传递任何索引，因此默认情况下，它分配了从0到len(data)-1的索引，即：0到3。
print(obj)

0    1
1   -2
2    3
3   -4
dtype: int64


In [4]:
obj2 = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])  # 在这里传递了索引值。现在可以在输出中看到自定义的索引值。
print(obj2)

a    1
b   -2
c    3
d   -4
dtype: int64


`Series`对象本质上由两个数组构成，一个构成索引`index`，一个构成对象的值`values`

In [5]:
obj2.values  # 查看值

array([ 1, -2,  3, -4], dtype=int64)

In [6]:
obj2.index  # 查看索引

Index(['a', 'b', 'c', 'd'], dtype='object')

### 使用标签检索数据(索引)
一个系列就像一个固定大小的字典，可以通过索引标签获取和设置值。

In [7]:
obj2['b']  # 使用索引标签值检索单个元素。

-2

In [8]:
obj2['c'] = 23  # 将 23 赋值给 obj2 中索引标签为 ‘c’ 的元素
obj2[['c', 'd']]  # 使用索引标签值列表检索多个元素。

c    23
d    -4
dtype: int64

In [9]:
obj2

a     1
b    -2
c    23
d    -4
dtype: int64

`Series`可以使用布尔数组作为索引：`s[布尔数组]`会保留`True`对应的元素，去掉其余元素
>组合多个条件时，与、或、非的规则跟numpy一致，分别使用`&`、`|`、`~`

In [10]:
obj2[obj2 < 0]  # keep only the elements of obj2 that are less than 0

b   -2
d   -4
dtype: int64

`Series`类型也支持矢量化运算与广播操作。计算规则与*`Numpy`*数组的规则相同。

四则运算时会根据`index`的值对齐相应的数据再进行运算；结果的数据类型取决于运算本身（例如下面整数乘以整数仍是`int64`），只有除法或索引未对齐产生`NaN`时结果才是浮点数

In [11]:
obj2 * 2  # 元素值 x2

a     2
b    -4
c    46
d    -8
dtype: int64

*`Numpy`*的一些函数，也适用于`Series`类型，例如，`np.mean`，`np.sum`等。

In [12]:
import numpy as np

In [13]:
np.abs(obj2)  # 使用 numpy.abs 获取 obj2 各元素值的绝对值

a     1
b     2
c    23
d     4
dtype: int64

### 从字典创建一个系列
字典(`dict`)可以作为输入传递，如果没有指定索引，则按字典键的插入顺序构造索引（旧版本的pandas或Python 3.6之前会按排序顺序取键）。 如果传递了索引，则按索引中的标签从字典中取出对应的值，字典中不存在的标签对应`NaN`。
> 注意 - 字典键用于构建索引。

In [14]:
data = {'张三': 92, '李四': 78, '王五': 68, '小明': 82}

In [15]:
obj3 = Series(data)  # no index is passed, so the dict keys become the index, in the dict's insertion order
obj3

张三    92
李四    78
王五    68
小明    82
dtype: int64

In [16]:
names = ['张三', '李四', '王五', '小明']
obj4 = Series(data, index=names)  # 这里传递了索引值，索引中与标签对应的数据中的值将被拉出。
obj4

张三    92
李四    78
王五    68
小明    82
dtype: int64

In [17]:
obj4.name = 'math'
obj4.index.name = 'students'

In [18]:
obj4

students
张三    92
李四    78
王五    68
小明    82
Name: math, dtype: int64

## pandas.DataFrame

In [19]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [20]:
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002],
    'city': ['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

Unnamed: 0,name,sex,year,city
0,张三,female,2001,北京
1,李四,female,2001,上海
2,王五,male,2003,广州
3,小明,male,2002,北京


In [21]:
df = DataFrame(data, columns=['name', 'sex', 'year', 'city'])
df

Unnamed: 0,name,sex,year,city
0,张三,female,2001,北京
1,李四,female,2001,上海
2,王五,male,2003,广州
3,小明,male,2002,北京


In [22]:
df = DataFrame(data,
               columns=['name', 'sex', 'year', 'city'],
               index=['a', 'b', 'c', 'd'])
df

Unnamed: 0,name,sex,year,city
a,张三,female,2001,北京
b,李四,female,2001,上海
c,王五,male,2003,广州
d,小明,male,2002,北京


In [23]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [24]:
df.columns

Index(['name', 'sex', 'year', 'city'], dtype='object')

In [25]:
data2 = {
    'sex': {
        '张三': 'female',
        '李四': 'female',
        '王五': 'male'
    },
    'city': {
        '张三': '北京',
        '李四': '上海',
        '王五': '广州'
    }
}
df2 = DataFrame(data2)
df2

Unnamed: 0,sex,city
张三,female,北京
李四,female,上海
王五,male,广州


In [26]:
df.index.name = 'id'
df.columns.name = 'std_info'

In [27]:
df

std_info,name,sex,year,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,张三,female,2001,北京
b,李四,female,2001,上海
c,王五,male,2003,广州
d,小明,male,2002,北京


In [28]:
obj = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj

a    1
b   -2
c    3
d   -4
dtype: int64

In [29]:
obj.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [30]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object', name='id')

In [31]:
df.columns

Index(['name', 'sex', 'year', 'city'], dtype='object', name='std_info')

In [32]:
# pandas Index objects are immutable, so item assignment raises TypeError.
# The error is caught and printed so that this deliberate demonstration
# does not abort a "Restart & Run All" of the notebook.
index = obj.index
try:
    index[1] = 'f'
except TypeError as exc:
    print('TypeError:', exc)

TypeError: Index does not support mutable operations

In [None]:
df

In [None]:
'sex' in df.columns

In [None]:
'f' in df.index

In [None]:
obj = Series([1, -2, 3, -4], index=['b', 'a', 'c', 'd'])
obj

In [None]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

In [None]:
obj = Series([1, -2, 3, -4], index=[0, 2, 3, 5])
obj

In [None]:
obj2 = obj.reindex(range(6), method='ffill')
obj2

In [None]:
df = DataFrame(np.arange(9).reshape(3, 3),
               index=['a', 'c', 'd'],
               columns=['name', 'id', 'sex'])
df

In [None]:
df2 = df.reindex(['a', 'b', 'c', 'd'])
df2

In [None]:
df3 = df.reindex(columns=['name', 'year', 'id'], fill_value=0)
df3

In [None]:
data = {'name': ['张三', '李四', '王五', '小明'], 'grade': [68, 78, 63, 92]}
df = DataFrame(data)
df

In [None]:
df2 = df.sort_values(by='grade')
df2

In [None]:
df3 = df2.reset_index()
df3

In [None]:
df4 = df2.reset_index(drop=True)
df4

In [None]:
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002],
    'city': ['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

In [None]:
df2 = df.set_index('name')
df2

In [None]:
df3 = df2.reset_index()
df3

## 索引和选取

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [None]:
obj = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj

In [None]:
# Positional access must go through .iloc: an integer key on a Series with a
# string index relies on a deprecated fallback in obj[...] and is treated as
# a label in newer pandas versions.
obj.iloc[1]

In [None]:
obj['b']

In [None]:
obj[['a', 'c']]

In [None]:
obj[0:2]

In [None]:
obj['a':'c']

In [None]:
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002],
    'city': ['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

In [None]:
df['city']

In [None]:
df.name

In [None]:
df[['city', 'sex']]

In [None]:
df2 = df.set_index('name')
df2

In [None]:
df2[0:2]

In [None]:
df2['李四':'王五']

In [None]:
df2

In [None]:
df2.loc['张三']

In [None]:
df2.loc[['张三', '王五']]

In [None]:
df2.iloc[1]

In [None]:
df2.iloc[[1, 3]]

In [None]:
# .ix was removed in pandas 1.0. Rows are selected by label and the first two
# columns by position, so the positional slice is turned into column labels
# and everything goes through .loc.
df2.loc[['张三', '王五'], df2.columns[0:2]]

In [None]:
pd.set_option('mode.chained_assignment', None)

In [None]:
df2.loc[:, ['sex', 'year']]  # select columns by label (.ix was removed in pandas 1.0)

In [None]:
df2.iloc[[1, 3], :]  # select rows by integer position (.ix was removed in pandas 1.0)

In [None]:
df2['sex'] == 'female'

In [None]:
df2[df2['sex'] == 'female']

In [None]:
df2[(df2['sex'] == 'female') & (df2['city'] == '北京')]

## 行和列的操作

In [None]:
df

In [None]:
new_data = {'city': '武汉', 'name': '小李', 'sex': 'male', 'year': 2002}

In [None]:
# DataFrame.append was removed in pandas 2.0: wrap the new record in a
# one-row frame and concatenate it. ignore_index=True renumbers the rows
# from 0 instead of keeping the original index values.
df = pd.concat([df, DataFrame([new_data])], ignore_index=True)
df

In [None]:
df['class'] = 2018
df

In [None]:
df['math'] = [92, 78, 58, 69, 82]
df

In [None]:
new_df = df.drop(2)  # 删除行
new_df

In [None]:
new_df = new_df.drop('class', axis=1)  # 删除列
new_df

In [None]:
# Relabel rows 3 and 4 as 2 and 3, and rename the 'math' column to 'Math'.
new_df.rename(index={
    3: 2,
    4: 3
}, columns={'math': 'Math'}, inplace=True)  # inplace=True modifies new_df itself instead of returning a new frame
new_df

In [None]:
obj1 = Series([3.2, 5.3, -4.4, -3.7], index=['a', 'c', 'g', 'f'])
obj1

In [None]:
obj2 = Series([5.0, -2, 4.4, 3.4], index=['a', 'b', 'c', 'd'])
obj2

In [None]:
obj1 + obj2

In [None]:
df1 = DataFrame(np.arange(9).reshape(3, 3),
                columns=['a', 'b', 'c'],
                index=['apple', 'tea', 'banana'])
df1

In [None]:
df2 = DataFrame(np.arange(9).reshape(3, 3),
                columns=['a', 'b', 'd'],
                index=['apple', 'tea', 'coco'])
df2

In [None]:
df1 + df2

In [None]:
df1

In [None]:
# Select the 'apple' row by label as a Series (.ix was removed in pandas 1.0).
s = df1.loc['apple']
s

In [None]:
df1 - s

In [None]:
data = {
    'fruit': ['apple', 'orange', 'grape', 'banana'],
    'price': ['25元', '42元', '35元', '14元']
}
df1 = DataFrame(data)
df1

In [None]:
def f(x):
    """Return the part of the string ``x`` that precedes the first '元'.

    If ``x`` contains no '元', the whole string is returned unchanged.
    """
    amount, _, _ = x.partition('元')
    return amount


df1['price'] = df1['price'].map(f)
df1

In [None]:
df2 = DataFrame(np.random.randn(3, 3),
                columns=['a', 'b', 'c'],
                index=['app', 'win', 'mac'])
df2

In [None]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest value."""
    largest = x.max()
    smallest = x.min()
    return largest - smallest


df2.apply(f)

In [None]:
df2

In [None]:
# Format every cell as a string with two decimals. DataFrame.applymap is
# deprecated since pandas 2.1, so each column is mapped element-wise with
# Series.map, which behaves the same on old and new pandas versions.
df2.apply(lambda col: col.map(lambda x: '%.2f' % x))

In [None]:
obj1 = Series([-2, 3, 2, 1], index=['b', 'a', 'd', 'c'])
obj1

In [None]:
obj1.sort_index()  # 升序

In [None]:
obj1.sort_index(ascending=False)  # 降序

In [None]:
obj1.sort_values()

In [None]:
df2

In [None]:
df2.sort_values(by='b')

In [None]:
df = DataFrame(np.random.randn(9).reshape(3, 3), columns=['a', 'b', 'c'])
df

In [None]:
df.sum()

In [None]:
df.sum(axis=1)

In [None]:
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'math': [78, 79, 83, 92],
    'city': ['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

In [None]:
df.describe()

In [None]:
obj = Series(['a', 'b', 'a', 'c', 'b'])
obj

In [None]:
obj.unique()

In [None]:
obj.value_counts()

In [None]:
obj = Series(np.random.randn(9),
             index=[[
                 'one', 'one', 'one', 'two', 'two', 'two', 'three', 'three',
                 'three'
             ], ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c']])
obj

In [None]:
obj.index

In [None]:
obj['two']

In [None]:
obj[:, 'a']  # 内层选取

In [None]:
df = DataFrame(np.arange(16).reshape(4, 4),
               index=[['one', 'one', 'two', 'two'], ['a', 'b', 'a', 'b']],
               columns=[['apple', 'apple', 'orange', 'orange'],
                        ['red', 'green', 'red', 'green']])
df

In [None]:
df['apple']

In [None]:
df.swaplevel(0, 1)

In [None]:
# Sum the rows that share the same outer index label. The `level` keyword of
# DataFrame.sum was removed in pandas 2.0; group on the level instead.
# sort=False keeps the groups in their order of first appearance.
df.groupby(level=0, sort=False).sum()

In [None]:
# Sum the columns that share the same inner column label. Both
# sum(level=..., axis=1) and groupby(axis=1) are gone or deprecated in recent
# pandas, so transpose, group the (now row) level, and transpose back.
df.T.groupby(level=1, sort=False).sum().T

## pandas数据可视化

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt  # 导入matplotlib库
%matplotlib inline  # 魔法函数

In [None]:
s = Series(np.random.normal(size=10))
s

In [None]:
s.plot()

In [None]:
df = DataFrame({
    'normal': np.random.normal(size=100),
    'gamma': np.random.gamma(1, size=100),
    'poisson': np.random.poisson(size=100)
})
df.cumsum()

In [None]:
df.cumsum().plot()

In [None]:
data = {
    'name': ['张三', '李四', '王五', '小明', 'Peter'],
    'sex': ['female', 'female', 'male', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002, 2002],
    'city': ['北京', '上海', '广州', '北京', '北京']
}
df = DataFrame(data)
df

In [None]:
df['sex'].value_counts()

In [None]:
df['sex'].value_counts().plot(kind='bar')

In [None]:
df2 = DataFrame(np.random.randint(0, 100, size=(3, 3)),
                index=('one', 'two', 'three'),
                columns=['A', 'B', 'C'])
df2

In [None]:
df2.plot(kind='barh')

In [None]:
df2.plot(kind='barh', stacked=True, alpha=0.5)

In [None]:
s = Series(np.random.normal(size=100))
s.hist(bins=20, grid=False)

In [None]:
s.plot(kind='kde')

In [None]:
# Ten points on the straight line Y = 2X + 5, built in a single expression.
df3 = DataFrame({'X': np.arange(10)}).assign(Y=lambda frame: 2 * frame['X'] + 5)
df3

In [None]:
df3.plot(kind='scatter', x='X', y='Y')

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import seaborn as sns  # 导入seaborn库

In [None]:
tips = sns.load_dataset('tips')
tips.head()

In [None]:
tips.shape

In [None]:
tips.describe()

In [None]:
tips.info()

In [None]:
tips.plot(kind='scatter', x='total_bill', y='tip')

In [None]:
male_tip = tips[tips['sex'] == 'Male']['tip'].mean()
male_tip

In [None]:
female_tip = tips[tips['sex'] == 'Female']['tip'].mean()
female_tip

In [None]:
s = Series([male_tip, female_tip], index=['male', 'female'])
s

In [None]:
s.plot(kind='bar')

In [None]:
tips['day'].unique()

In [None]:
# Mean tip for each day: one .loc lookup (row mask, column label) per day.
sun_tip = tips.loc[tips['day'] == 'Sun', 'tip'].mean()
sat_tip = tips.loc[tips['day'] == 'Sat', 'tip'].mean()
thur_tip = tips.loc[tips['day'] == 'Thur', 'tip'].mean()
fri_tip = tips.loc[tips['day'] == 'Fri', 'tip'].mean()

In [None]:
s = Series([thur_tip, fri_tip, sat_tip, sun_tip],
           index=['Thur', 'Fri', 'Sat', 'Sun'])
s

In [None]:
s.plot(kind='bar')

In [None]:
tips['percent_tip'] = tips['tip'] / (tips['total_bill'] + tips['tip'])
tips.head(10)

In [None]:
tips['percent_tip'].hist(bins=50)