In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Объект Series

In [2]:
# Series built from a plain list: the index defaults to the positions 0..3.
obj = Series([4,7,-5,3])
# Same values, but with explicit string labels supplied through index=.
# NOTE(review): the labels appear to mix a Latin 'd' with Cyrillic 'Ь', 'а', 'с';
# if so, a lookup typed with Latin 'a'/'c' would raise KeyError — confirm the intended alphabet.
obj2 = Series([4, 7, -5, 3], index=['d', 'Ь', 'а', 'с'])

In [3]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj2

d    4
Ь    7
а   -5
с    3
dtype: int64

**Для выборки одного или нескольких элементов из объекта Series можно использовать значения индекса**

In [5]:
# Select several elements by their index *labels*.
# Passing integers (obj2[[1, 2, 3]]) to a Series whose index holds strings relies on
# the deprecated positional fallback and emits a FutureWarning; when positions are
# really meant, use obj2.iloc[[1, 2, 3]] instead.
obj2[['Ь', 'а', 'с']]


Ь    7
а   -5
с    3
dtype: int64

**С объектами Series можно выполнять фильтрацию с помощью булева массива, скалярное умножение и применение математических функций.**


In [6]:
obj2[obj2 > 0]

d    4
Ь    7
с    3
dtype: int64

In [7]:
obj2 * 2

d     8
Ь    14
а   -10
с     6
dtype: int64

In [8]:
np.exp(obj)

0      54.598150
1    1096.633158
2       0.006738
3      20.085537
dtype: float64

**Объект Series можно также представлять себе как упорядоченный словарь фиксированной длины, поскольку он отображает индекс на данные. Его можно передавать многим функциям, ожидающим получить словарь.**

In [9]:
# A dict converts directly into a Series: keys become the index, values the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = Series(data=sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

### Полезные методы Series

```
obj_example = Series(data)
obj_example.isnull() 
obj_example.notnull()
```

# Объект DataFrame

In [10]:
# Column-oriented input: every dict key becomes a column, every list its values.
data = {
    "city": ["Барнаул", "Новокузнецк", "Онталия", "Благовещенск", "Якутск"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}
data_frame = pd.DataFrame(data=data)
data_frame

Unnamed: 0,city,year,pop
0,Барнаул,2000,1.5
1,Новокузнецк,2001,1.7
2,Онталия,2002,3.6
3,Благовещенск,2001,2.4
4,Якутск,2002,2.9


In [11]:
data_frame["city"]

0         Барнаул
1     Новокузнецк
2         Онталия
3    Благовещенск
4          Якутск
Name: city, dtype: object

**Если передать конструктору DataFrame словарь словарей, то ключи внешнего словаря будут интерпретированы как столбцы, а ключи внутреннего словаря — как индексы строк.**

In [12]:
# Nested dict: outer keys become columns, inner keys become row labels.
# A (row, column) pair that is absent from the input shows up as a missing value.
data = {
    "nevada": {2001: 2.4, 2002: 2.9},
    "ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
data_frame = DataFrame(data=data)
data_frame

Unnamed: 0,nevada,ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


**Результат можно транспонировать**

In [13]:
data_frame.T

Unnamed: 0,2001,2002,2000
nevada,2.4,2.9,
ohio,1.7,3.6,1.5


## Что можно передать в DataFrame


| Тип                                    | Примечания                                                                                                            |
| -------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
| Двумерный ndarray                      | Матрица данных, дополнительно можно передать метки строк и столбцов                                                   |
| Словарь массивов, списков или кортежей | Каждая последовательность становится столбцом объекта DataFrame. Все последовательности должны быть одинаковой длины  |
| Словарь объектов Series                | Каждое значение становится столбцом. Ключи объединяются и образуют индексы строк результата                           |
| Словарь словарей                       | Каждый внутренний словарь становится столбцом. Ключи - индексы строк                                                  |
| Список словарей или список Series      | Каждый элемент списка становится строкой объекта DataFrame.                                                           |


# Базовая функциональность

## Переиндексация DataFrame

In [14]:
# 3x3 frame with explicit row labels and column names.
data_frame = DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=['a', 'c', 'd'],
    columns=["Абхазия", "Испания", "Юар"],
)
data_frame

Unnamed: 0,Абхазия,Испания,Юар
a,1,2,3
c,4,5,6
d,7,8,9


In [15]:
# Conform the rows to a new label list; 'b' has no source row, so it appears as an
# all-missing row in the result.
data_frame_2 = data_frame.reindex(index=['a', 'b', 'c', 'd'])
data_frame_2

Unnamed: 0,Абхазия,Испания,Юар
a,1.0,2.0,3.0
b,,,
c,4.0,5.0,6.0
d,7.0,8.0,9.0


In [16]:
# Reindex along the columns. "Ispania" is not one of the existing column labels, so it
# is created as an all-missing column, while the existing "Испания" column is not part
# of the result because it is not listed.
data_frame_3 = data_frame.reindex(columns=["Абхазия", "Ispania","Юар"])
data_frame_3

Unnamed: 0,Абхазия,Ispania,Юар
a,1,,3
c,4,,6
d,7,,9


## Удаление элементов из DataFrame

In [17]:
# Rebuild the city frame, then show it without the rows labelled 0 and 3.
# drop returns a new frame; data_frame itself is left untouched.
data = {
    "city": ["Барнаул", "Новокузнецк", "Онталия", "Благовещенск", "Якутск"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}
data_frame = pd.DataFrame(data=data)
data_frame.drop(index=[0, 3])

Unnamed: 0,city,year,pop
1,Новокузнецк,2001,1.7
2,Онталия,2002,3.6
4,Якутск,2002,2.9


In [18]:
# Rebuild the city frame, then show it without the "year" column.
data = {
    "city": ["Барнаул", "Новокузнецк", "Онталия", "Благовещенск", "Якутск"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}
data_frame = pd.DataFrame(data=data)
data_frame.drop(columns=["year"])

Unnamed: 0,city,pop
0,Барнаул,1.5
1,Новокузнецк,1.7
2,Онталия,3.6
3,Благовещенск,2.4
4,Якутск,2.9


## Доступ по индексу

In [19]:
# Rebuild the city frame and pull out a single column, which comes back as a Series.
data = {
    "city": ["Барнаул", "Новокузнецк", "Онталия", "Благовещенск", "Якутск"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}
data_frame = pd.DataFrame(data=data)
data_frame["city"]

0         Барнаул
1     Новокузнецк
2         Онталия
3    Благовещенск
4          Якутск
Name: city, dtype: object

In [20]:
data_frame[:2]

Unnamed: 0,city,year,pop
0,Барнаул,2000,1.5
1,Новокузнецк,2001,1.7


In [21]:
data_frame[data_frame["year"]>2001]

Unnamed: 0,city,year,pop
2,Онталия,2002,3.6
4,Якутск,2002,2.9


## Булев DataFrame

In [22]:
# 4x4 frame holding 0..15 row by row, with default integer row and column labels.
data_frame = DataFrame(data=np.arange(16).reshape((4, 4)))
data_frame

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [23]:
data_frame < 5

Unnamed: 0,0,1,2,3
0,True,True,True,True
1,True,False,False,False
2,False,False,False,False
3,False,False,False,False


In [24]:
# Boolean-mask assignment: every cell where the mask (value < 5) is True is overwritten
# with 0. This modifies data_frame in place, so the frame displayed by earlier cells
# no longer matches its current contents.
data_frame[data_frame < 5] = 0
data_frame

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,5,6,7
2,8,9,10,11
3,12,13,14,15


## Арифметические операции

In [25]:
# Two float frames whose row labels and column sets only partly overlap
# (df_1 has columns b, c, d; df_2 has b, d, e and an extra row C4).
df_1 = DataFrame(
    np.arange(9.).reshape((3, 3)),
    index=['A1', 'B2', 'C3'],
    columns=list('bcd'),
)
df_2 = DataFrame(
    np.arange(12.).reshape((4, 3)),
    index=['A1', 'B2', 'C3', 'C4'],
    columns=list('bde'),
)

In [26]:
df_1

Unnamed: 0,b,c,d
A1,0.0,1.0,2.0
B2,3.0,4.0,5.0
C3,6.0,7.0,8.0


In [27]:
df_2

Unnamed: 0,b,d,e
A1,0.0,1.0,2.0
B2,3.0,4.0,5.0
C3,6.0,7.0,8.0
C4,9.0,10.0,11.0


In [28]:
df_1.add(df_2)

Unnamed: 0,b,c,d,e
A1,0.0,,3.0,
B2,6.0,,9.0,
C3,12.0,,15.0,
C4,,,,


In [29]:
# fill_value=0 stands in for a value that is missing in only one of the two frames;
# a cell absent from both (row C4, column c) is still missing in the result.
df_1.add(df_2,fill_value=0)

Unnamed: 0,b,c,d,e
A1,0.0,1.0,3.0,2.0
B2,6.0,4.0,9.0,5.0
C3,12.0,7.0,15.0,8.0
C4,9.0,,10.0,11.0


## Применение функций

In [30]:
# 4x3 frame of standard-normal samples.
# NOTE(review): no random seed is set in the visible cells, so the numbers shown in the
# outputs below will differ after a kernel restart and re-run.
df_1 = DataFrame(np.random.randn(4,3), columns=list('bcd'),index = ['A1', 'B2', 'C3','C4'])
df_1

Unnamed: 0,b,c,d
A1,-0.247462,-0.317234,-0.585552
B2,-0.038557,0.644532,0.056086
C3,-0.757075,-0.165046,0.833005
C4,0.276548,1.288583,0.032086


In [31]:
df_1.abs()

Unnamed: 0,b,c,d
A1,0.247462,0.317234,0.585552
B2,0.038557,0.644532,0.056086
C3,0.757075,0.165046,0.833005
C4,0.276548,1.288583,0.032086


In [32]:
def func_1(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    largest = x.max()
    smallest = x.min()
    return largest - smallest
df_1.apply(func_1, axis=1)

A1    0.338091
B2    0.683089
C3    1.590080
C4    1.256496
dtype: float64

In [33]:
df_1.apply(func_1)

b    1.033623
c    1.605816
d    1.418557
dtype: float64

In [34]:
def func_2(x):
    """Return ``x`` shifted upward by two."""
    shifted = x + 2
    return shifted
df_1

Unnamed: 0,b,c,d
A1,-0.247462,-0.317234,-0.585552
B2,-0.038557,0.644532,0.056086
C3,-0.757075,-0.165046,0.833005
C4,0.276548,1.288583,0.032086


In [35]:
df_1.map(func_2)

Unnamed: 0,b,c,d
A1,1.752538,1.682766,1.414448
B2,1.961443,2.644532,2.056086
C3,1.242925,1.834954,2.833005
C4,2.276548,3.288583,2.032086


## Сортировка

In [36]:
# Fresh 4x3 frame of standard-normal samples used by the sorting examples.
# NOTE(review): unseeded, so the displayed values are not reproducible across runs.
df_1 = DataFrame(np.random.randn(4,3), columns=list('bcd'),index = ['A1', 'B2', 'C3','C4'])
df_1

Unnamed: 0,b,c,d
A1,0.368069,0.574571,1.551999
B2,-0.359148,0.881226,-1.288258
C3,0.355592,-1.976377,-0.530534
C4,0.420589,0.641768,1.557816


In [37]:
df_1.sort_index()

Unnamed: 0,b,c,d
A1,0.368069,0.574571,1.551999
B2,-0.359148,0.881226,-1.288258
C3,0.355592,-1.976377,-0.530534
C4,0.420589,0.641768,1.557816


In [38]:
df_1.sort_index(axis=1)

Unnamed: 0,b,c,d
A1,0.368069,0.574571,1.551999
B2,-0.359148,0.881226,-1.288258
C3,0.355592,-1.976377,-0.530534
C4,0.420589,0.641768,1.557816


In [39]:
df_1.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b
A1,1.551999,0.574571,0.368069
B2,-1.288258,0.881226,-0.359148
C3,-0.530534,-1.976377,0.355592
C4,1.557816,0.641768,0.420589


In [40]:
# Small two-column frame; column 'a' contains repeated values, so 'b' can break ties.
df = DataFrame(
    {
        'b': [4, 7, -3, 2],
        'a': [0, 1, 0, 1],
    }
)
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [41]:
# Order by 'a', breaking ties by 'b', with both keys descending.
df.sort_values(by=['a', 'b'], ascending=[False, False])

Unnamed: 0,b,a
1,7,1
3,2,1
0,4,0
2,-3,0


## Уникальные значения, счетчики значений и членство

In [42]:
df = DataFrame({'b': [4,7,-3,2] , 'a': [0,1,0,1]})
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [43]:
# unique is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the distinct values of the column.
df["a"].unique()

array([0, 1])

In [44]:
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [45]:
df["a"].value_counts()

a
0    2
1    2
Name: count, dtype: int64

In [46]:
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [47]:
df["a"].isin([0,2])

0     True
1    False
2     True
3    False
Name: a, dtype: bool

In [48]:
df[df["a"].isin([0,2])]

Unnamed: 0,b,a
0,4,0
2,-3,0


In [49]:
df[~df["a"].isin([0,2])]

Unnamed: 0,b,a
1,7,1
3,2,1


# Обработка и фильтрация отсутствующих данных

## Фильтрация

In [50]:
# Frame with gaps: None in a numeric column and np.nan in a string column.
df = DataFrame(
    {
        "a": [1, 2, None, None],
        "b": [0, 0, 0, 0],
        "c": [np.nan, "Yes", "No", "Yes"],
    }
)
df

Unnamed: 0,a,b,c
0,1.0,0,
1,2.0,0,Yes
2,,0,No
3,,0,Yes


In [51]:
df["c"].isnull()

0     True
1    False
2    False
3    False
Name: c, dtype: bool

In [52]:
df["c"].notnull()

0    False
1     True
2     True
3     True
Name: c, dtype: bool

In [53]:
df

Unnamed: 0,a,b,c
0,1.0,0,
1,2.0,0,Yes
2,,0,No
3,,0,Yes


In [54]:
df["a"].dropna()

0    1.0
1    2.0
Name: a, dtype: float64

In [55]:
df.dropna()

Unnamed: 0,a,b,c
1,2.0,0,Yes


In [63]:
# 7x3 frame of standard-normal samples with gaps punched into columns 1 and 2.
# NOTE(review): unseeded random data — the displayed values change on every run.
df = DataFrame(np.random.randn( 7, 3))
# .loc slices by label and includes the end label: rows 0..4 of column 1 and
# rows 0..2 of column 2 become missing (blank in the output below).
df.loc[:4, 1] = None
df.loc[:2, 2] = None
df

Unnamed: 0,0,1,2
0,0.798647,,
1,0.014109,,
2,1.202656,,
3,0.956931,,2.179283
4,0.7375,,-0.6636
5,0.414271,1.456179,-1.207857
6,0.063788,0.252981,0.94138


In [66]:
# thresh=3 keeps only rows with at least 3 non-missing values; with three columns
# that means fully populated rows, which here are rows 5 and 6.
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,0.414271,1.456179,-1.207857
6,0.063788,0.252981,0.94138


## Заполнение пропусков

In [70]:
# New 7x3 random frame for the fill examples, with the same gap pattern as before:
# rows 0..4 of column 1 and rows 0..2 of column 2 are set to missing.
# NOTE(review): unseeded random data — the displayed values change on every run.
df = DataFrame(np.random.randn( 7, 3))
df.loc[:4, 1] = None
df.loc[:2, 2] = None
df

Unnamed: 0,0,1,2
0,1.267031,,
1,-0.119904,,
2,0.975804,,
3,1.02951,,1.010862
4,-0.023218,,0.334179
5,-0.771362,0.365484,0.442316
6,0.305522,0.845112,0.328449


In [68]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.267031,0.0,0.0
1,-0.119904,0.0,0.0
2,0.975804,0.0,0.0
3,1.02951,0.0,1.010862
4,-0.023218,0.0,0.334179
5,-0.771362,0.365484,0.442316
6,0.305522,0.845112,0.328449


In [71]:
df.fillna({1: -1, 2:1})

Unnamed: 0,0,1,2
0,1.267031,-1.0,1.0
1,-0.119904,-1.0,1.0
2,0.975804,-1.0,1.0
3,1.02951,-1.0,1.010862
4,-0.023218,-1.0,0.334179
5,-0.771362,0.365484,0.442316
6,0.305522,0.845112,0.328449


# Объединение данных merge и concat

In [72]:
# Left-hand frame for the combining examples: keys A-D.
df1 = pd.DataFrame(
    {
        "key": ["A", "B", "C", "D"],
        "value": [1, 2, 3, 4],
    }
)
df1

Unnamed: 0,key,value
0,A,1
1,B,2
2,C,3
3,D,4


In [73]:
# Right-hand frame for the combining examples: keys B, D, E, F (B and D overlap with df1).
df2 = pd.DataFrame(
    {
        "key": ["B", "D", "E", "F"],
        "value": [5, 6, 7, 8],
    }
)
df2

Unnamed: 0,key,value
0,B,5
1,D,6
2,E,7
3,F,8


In [74]:
# Inner merge on "key": only keys present in both frames survive, and the clashing
# "value" columns receive the default _x/_y suffixes.
pd.merge(left=df1, right=df2, how="inner", on="key")

Unnamed: 0,key,value_x,value_y
0,B,2,5
1,D,4,6


In [75]:
# Stack the two frames vertically and renumber the rows 0..n-1.
frames = [df1, df2]
pd.concat(frames, axis=0, ignore_index=True)

Unnamed: 0,key,value
0,A,1
1,B,2
2,C,3
3,D,4
4,B,5
5,D,6
6,E,7
7,F,8


In [76]:
# join aligns the two frames on their row index (not on the "key" column), so row 0
# pairs key A with key B; the suffixes disambiguate the overlapping column names.
df1.join(df2, how="inner", lsuffix="_1", rsuffix="_2")

Unnamed: 0,key_1,value_1,key_2,value_2
0,A,1,B,5
1,B,2,D,6
2,C,3,E,7
3,D,4,F,8


# Агрегация данных

## Groupby

In [77]:
# Two observations per animal, used to demonstrate grouping.
df = pd.DataFrame(
    {
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Max Speed': [380., 370., 24., 26.],
    }
)
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [78]:
# Average every remaining column within each Animal group.
df.groupby(by=['Animal']).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


## Pivot

In [79]:
# Long-format sample data: three categorical columns (A, B, C) and two numeric ones (D, E).
df = pd.DataFrame(
    {
        "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": ["small", "large", "large", "small", "small", "large", "small", "small", "large"],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
    }
)
df

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [82]:
# Sum column D for every (A, B) row group and every C value; combinations with no
# source rows are shown as 0 instead of a missing value.
table = pd.pivot_table(
    data=df,
    index=['A', 'B'],
    columns=['C'],
    values='D',
    aggfunc="sum",
    fill_value=0,
)
table

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4,5
bar,two,7,6
foo,one,4,1
foo,two,0,6


## Crosstab

In [88]:
# Five restaurant bills with the categorical attributes used for the cross-tabulation.
df = DataFrame(
    {
        "total_bill": [16, 10, 21, 23, 24],
        "tip": [1, 1, 3, 3, 3],
        "sex": ["F", "M", "M", "M", "F"],
        "smoker": ["Yes", "Yes", "No", "Yes", "Yes"],
        "day": ["Sun", "Sun", "Sun", "Sun", "Sun"],
        "time": ["Dinner", "Dinner", "Dinner", "Dinner", "Dinner"],
        "size": [2, 3, 3, 2, 4],
    }
)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16,1,F,Yes,Sun,Dinner,2
1,10,1,M,Yes,Sun,Dinner,3
2,21,3,M,No,Sun,Dinner,3
3,23,3,M,Yes,Sun,Dinner,2
4,24,3,F,Yes,Sun,Dinner,4


In [89]:
# Count how many rows fall into each (sex, smoker) combination.
pd.crosstab(index=df["sex"], columns=df["smoker"])

smoker,No,Yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,0,2
M,1,2
