In [1]:
import pandas as pd

# Основные структуры данных

In [7]:
# Sample values of mixed types: one string followed by four integers.
data = [
    'Text',
    20,
    30,
    40,
    50,
]

In [8]:
# Echo the list as the cell's output.
data

['Text', 20, 30, 40, 50]

In [9]:
# Series: one-dimensional data.
# Built from a mixed str/int list, so the resulting dtype is `object`.

my_series = pd.Series(data)

In [10]:
# Show the Series: index labels on the left, values on the right, dtype at the bottom.
my_series

0    Text
1      20
2      30
3      40
4      50
dtype: object

In [49]:
# Column-oriented input for a DataFrame: each key is a column name and each
# list holds that column's values, one entry per row.
# Note that 'Describe' for the first row is an empty string, not a missing value.
data_df = dict(
    Name=['Alice', 'Rimma'],
    Age=[21, 24],
    Describe=['', 'Yep'],
)

In [50]:
# DataFrame: two-dimensional, tabular data.
# Keys of data_df become the columns; `index` supplies the row labels 'a' and 'b'.
# NOTE(review): the outputs recorded under the next two cells show a 0..1
# RangeIndex, which does not match index=['a','b'] -- they look stale; re-run to confirm.

my_df = pd.DataFrame(data_df, index = ['a','b'])

In [51]:
# Display the DataFrame as the cell's output.
my_df

Unnamed: 0,Name,Age,Describe
0,Alice,21,
1,Rimma,24,Yep


In [52]:
# Print a summary of the frame: index, column names, non-null counts, dtypes and memory usage.
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      2 non-null      object
 1   Age       2 non-null      int64 
 2   Describe  2 non-null      object
dtypes: int64(1), object(2)
memory usage: 180.0+ bytes


## DataFrame в другие источники

In [53]:
# CSV

In [54]:
# to_csv: write the frame to '1.csv' in the working directory.
# index=False leaves the row labels out of the file.
my_df.to_csv('1.csv', index = False)

In [55]:
# Excel

In [56]:
# to_excel: write the frame to '1.xlsx' in the working directory.
# index=False leaves the row labels out of the sheet.
my_df.to_excel('1.xlsx', index = False)

## DataFrame из других источников

In [57]:
# CSV

In [58]:
# read_csv: load '1.csv' (written above) into a new DataFrame.
# This rebinds my_df, so the frame built from the dict is no longer referenced.
my_df = pd.read_csv('1.csv')

In [59]:
# Show the frame as read back from the CSV file.
my_df

Unnamed: 0,Name,Age,Describe
0,Alice,21,
1,Rimma,24,Yep


In [60]:
# Excel

In [91]:
# read_excel: load '1.xlsx' (written above) into a new DataFrame, rebinding my_df.
my_df = pd.read_excel('1.xlsx')

In [92]:
# Show the frame as read back from the Excel file.
my_df

Unnamed: 0,Name,Age,Describe
0,Alice,21,
1,Rimma,24,Yep


# Основные методы DF

In [63]:
# First row only (head(n) returns the first n rows).
my_df.head(1)

Unnamed: 0,Name,Age,Describe
0,Alice,21,


In [64]:
# Last row only (tail(n) returns the last n rows).
my_df.tail(1)

Unnamed: 0,Name,Age,Describe
1,Rimma,24,Yep


In [65]:
# Dimensions of the frame as a (rows, columns) tuple.
my_df.shape

(2, 3)

In [66]:
# Summary of the frame read from file; compare the Non-Null Count of 'Describe'
# with the summary of the dict-built frame earlier in the notebook.
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      2 non-null      object
 1   Age       2 non-null      int64 
 2   Describe  1 non-null      object
dtypes: int64(1), object(2)
memory usage: 180.0+ bytes


In [67]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# Only the numeric column 'Age' appears in the result.
my_df.describe()

Unnamed: 0,Age
count,2.0
mean,22.5
std,2.12132
min,21.0
25%,21.75
50%,22.5
75%,23.25
max,24.0


# Индексация

In [78]:
# Same column-oriented sample data as at the top of the notebook, re-declared
# for the indexing section: one list of values per column name.
data_df = dict(
    Name=['Alice', 'Rimma'],
    Age=[21, 24],
    Describe=['', 'Yep'],
)

In [79]:
# Rebuild the frame from the dict with explicit row labels 'a' and 'b'
# (the label-based .loc lookup later in this section uses them).
my_df = pd.DataFrame(data_df, index = ['a','b'])

## По столбцам

In [81]:
# Show the frame; its row labels are the 'a'/'b' index passed to the constructor.
my_df

Unnamed: 0,Name,Age,Describe
a,Alice,21,
b,Rimma,24,Yep


In [82]:
# Select a single column by name; the result is a Series that keeps the row labels.
my_df['Name']

a    Alice
b    Rimma
Name: Name, dtype: object

In [83]:
# Select the numeric 'Age' column; the resulting Series has dtype int64.
my_df['Age']

a    21
b    24
Name: Age, dtype: int64

In [84]:
# Boolean-mask row selection: keep only the rows whose Age is greater than 22.
my_df.loc[my_df['Age'] > 22]

Unnamed: 0,Name,Age,Describe
b,Rimma,24,Yep


## По строкам

In [85]:
# Show the whole frame again for reference before selecting single rows.
my_df

Unnamed: 0,Name,Age,Describe
a,Alice,21,
b,Rimma,24,Yep


In [87]:
# Label-based row lookup: the row whose index label is 'b', returned as a Series.
my_df.loc['b']

Name        Rimma
Age            24
Describe      Yep
Name: b, dtype: object

In [88]:
# Position-based row lookup: the second row (zero-based position 1).
# Here it is the same row as the one labelled 'b'.
my_df.iloc[1]

Name        Rimma
Age            24
Describe      Yep
Name: b, dtype: object

# Работа с пропусками

## Поиск пропусков

In [93]:
# Reload the frame saved to '1.xlsx' so this section does not depend on kernel
# state left over from out-of-order execution. The frame built for the indexing
# examples has row labels 'a'/'b' and an empty string ('') in 'Describe'; pandas
# does not treat '' as missing, so isna() would find no gaps in it.
# After the file round trip the empty cell is read back as NaN and the index is
# the default 0..1 range, which is what the examples in this section operate on.
my_df = pd.read_excel('1.xlsx')
my_df

Unnamed: 0,Name,Age,Describe
0,Alice,21,
1,Rimma,24,Yep


In [94]:
# Boolean Series: True where 'Describe' is missing.
my_df['Describe'].isna()

0     True
1    False
Name: Describe, dtype: bool

In [96]:
# Keep only the rows where 'Describe' is missing.
my_df.loc[my_df['Describe'].isna()]

Unnamed: 0,Name,Age,Describe
0,Alice,21,


In [95]:
# Boolean Series: True where 'Describe' has a value (the inverse of isna()).
my_df['Describe'].notna()

0    False
1     True
Name: Describe, dtype: bool

In [97]:
# Keep only the rows where 'Describe' is present.
my_df.loc[my_df['Describe'].notna()]

Unnamed: 0,Name,Age,Describe
1,Rimma,24,Yep


## Удаление и заполнение пропусков

In [102]:
# Replace every missing value in the frame with the string 'Nope'.
# The result is a new frame; my_df itself is not modified.
my_df_fillna = my_df.fillna('Nope')

In [103]:
# Show the filled copy.
my_df_fillna

Unnamed: 0,Name,Age,Describe
0,Alice,21,Nope
1,Rimma,24,Yep


In [108]:
# Drop every row that contains at least one missing value.
# The result is a new frame; my_df itself is not modified.
my_df_dropna = my_df.dropna()

In [109]:
# Show the copy with incomplete rows removed.
my_df_dropna

Unnamed: 0,Name,Age,Describe
1,Rimma,24,Yep


# Группировки

In [114]:
# Within each 'Name' group, count how many times each 'Age' value occurs.
# The result is a Series indexed by (Name, Age) pairs.
my_df.groupby('Name')['Age'].value_counts()

Name   Age
Alice  21     1
Rimma  24     1
Name: count, dtype: int64

# Агрегация

In [117]:
# Aggregate a column to a single scalar: the median of 'Age'.
my_df['Age'].median()

np.float64(22.5)

# Сортировка данных

In [121]:
# Sort by 'Age' first and by 'Name' for ties; ascending=False applies to both keys,
# so the rows come out in descending order. Returns a new frame.
my_df.sort_values(by=['Age', 'Name'], ascending=False)

Unnamed: 0,Name,Age,Describe
1,Rimma,24,Yep
0,Alice,21,


# Объединение

In [123]:
# Show the left-hand frame that will be combined with a second one.
my_df

Unnamed: 0,Name,Age,Describe
0,Alice,21,
1,Rimma,24,Yep


In [129]:
# Second sample table: the same people keyed by 'FIO', with a 'Salary' column.
data_df_1 = dict(
    FIO=['Alice', 'Rimma'],
    Salary=[0, 2_400_000],
)

In [130]:
# Build the second frame from the dict; no index is given, so rows are labelled 0..1.
my_df_1 = pd.DataFrame(data_df_1)

In [132]:
# Join the two frames on matching names: my_df's 'Name' against my_df_1's 'FIO'.
# how='inner' is the default join type, spelled out here for readability.
# Both key columns are kept in the result because their names differ.
merged = pd.merge(
    left=my_df,
    right=my_df_1,
    how='inner',
    left_on='Name',
    right_on='FIO',
)

In [133]:
# Show the joined frame: columns of both inputs side by side.
merged

Unnamed: 0,Name,Age,Describe,FIO,Salary
0,Alice,21,,Alice,0
1,Rimma,24,Yep,Rimma,2400000


In [134]:
# Stack the two frames vertically. The columns are unioned: cells for columns a
# frame does not have are filled with NaN, and each frame keeps its own row
# labels, so labels 0 and 1 each appear twice in the result.
concated = pd.concat([my_df, my_df_1])

In [135]:
# Show the stacked frame, including the NaN-filled cells and repeated row labels.
concated

Unnamed: 0,Name,Age,Describe,FIO,Salary
0,Alice,21.0,,,
1,Rimma,24.0,Yep,,
0,,,,Alice,0.0
1,,,,Rimma,2400000.0
