### Использование Pandas для работы с данными на примере датасета Titanic

In [1]:
import numpy as np
import pandas as pd

# Load the Titanic passenger table, using PassengerId as the row index.
# A forward slash is used as the path separator: it works on Windows, Linux
# and macOS, whereas a backslash is only treated as a separator on Windows.
df = pd.read_csv('data/titanic_data.csv', index_col='PassengerId')
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Series holding the number of occurrences of each distinct value
# in the Embarked column
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [3]:
# Display summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### 1) Number of men and women on the Titanic

In [4]:
# Count passengers of each sex by summing boolean masks over the Sex column
num_men = (df['Sex'] == 'male').sum()
num_women = (df['Sex'] == 'female').sum()
print('Number of men: %g\nNumber of women: %g' % (num_men, num_women))

Number of men: 577
Number of women: 314


### 2) Percentage of passengers who survived (Survived: 0 -- no, 1 -- yes)

In [5]:
# Share of passengers whose Survived flag equals 1, expressed in percent
people = df['Survived']
frac_surv = (people == 1).sum() / len(people) * 100
print('Percent of survived people: %g' % frac_surv)

Percent of survived people: 38.3838


### 3) Percentage of passengers travelling in 1st class

In [6]:
# Share of passengers whose Pclass equals 1, expressed in percent
class_data = df.Pclass
frac_frst_class = class_data.eq(1).sum() / len(class_data) * 100
print('Percent of people from the 1-st class: %g' % frac_frst_class)

Percent of people from the 1-st class: 24.2424


### 4) Mean and median passenger age

In [7]:
# Compute both statistics of the Age column in a single aggregation
mean_age, median_age = df.Age.agg(['mean', 'median'])
print('Mean age: %g\nMedian age: %g' % (mean_age, median_age))


Mean age: 29.6991
Median age: 28


### 5) Pearson correlation between SibSp and Parch columns

In [8]:
# Pearson is the default method of Series.corr; it is spelled out for clarity
correlation = df.SibSp.corr(df.Parch, method='pearson')
print('Pearson correlation: %g' % correlation)

Pearson correlation: 0.414838


### 6) The most popular female name

In [9]:
def find_name(arr):
    """
    Extract a first name from a tokenized full passenger name.

    The tokens are scanned left to right and the first matching rule wins:

    * the token ``'Miss.'`` -- the token that follows it is returned as is;
    * a token starting with ``'('`` -- the token is returned without the
      opening parenthesis and without a closing one, if present.

    Parameters
    ----------
    arr : sequence of str
        The full name split on whitespace, e.g. ``full_name.split()``.

    Returns
    -------
    str or list
        The extracted first name, or an empty list when no rule matches.
        The empty-list placeholder is kept for backward compatibility with
        existing callers.
    """
    for pos, token in enumerate(arr):
        if token == 'Miss.':
            # Guard against 'Miss.' being the last token: there is no
            # following token to return, so report "not found" instead of
            # raising IndexError.
            if pos + 1 < len(arr):
                return arr[pos + 1]
            break
        # startswith() is safe for empty tokens, unlike indexing token[0].
        if token.startswith('('):
            first = token[1:]
            # Drop the closing parenthesis only when it is really there,
            # wherever the token sits in the name ("(Anna)" -> "Anna").
            if first.endswith(')'):
                first = first[:-1]
            return first
    return []
    
# Full names of all female passengers
fem_name_set = df.Name.loc[df.Sex == 'female']
# Collect the first name extracted from every full name.
# find_name returns a non-string placeholder (an empty list) when it cannot
# determine a name; such results are skipped so that the placeholder is not
# stringified and counted as if it were a real name.
name = []
for pass_name in fem_name_set:
    first_name = find_name(pass_name.split())
    if isinstance(first_name, str):
        name.append(first_name)
# Create a Series of first names
names_data = pd.Series(name)
# value_counts() sorts by frequency in descending order, so the first index
# label is the most frequent name
most_common_name = names_data.value_counts().index[0]
print(most_common_name)

Anna


In [10]:
# Data can be grouped by a feature and aggregated per group.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on the string columns of this frame.
df.groupby(['Pclass']).mean(numeric_only=True)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.62963,38.233441,0.416667,0.356481,84.154687
2,0.472826,29.87763,0.402174,0.380435,20.662183
3,0.242363,25.14062,0.615071,0.393075,13.67555
