# Exemplos adicionais de funções 

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = np.random.randn(100, 4)

In [3]:
df_data = pd.DataFrame(data, columns = ['A', 'B', 'C', 'D'])

In [4]:
df_data.head()

Unnamed: 0,A,B,C,D
0,1.254718,0.612672,0.1282,-0.349972
1,-1.273292,1.463142,0.619992,-2.095436
2,0.097849,0.934313,2.085413,-0.954994
3,0.982489,0.600781,-0.786963,0.959175
4,0.941875,-0.885992,-1.094204,1.485734


Salvando arquivo como csv

In [5]:
df_data.to_csv("data.csv", sep=(";"))

Lendo/importando arquivo csv

In [6]:
df_data_novo = pd.read_csv("data.csv", sep=";")

In [7]:
df_data_novo.head()

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,1.254718,0.612672,0.1282,-0.349972
1,1,-1.273292,1.463142,0.619992,-2.095436
2,2,0.097849,0.934313,2.085413,-0.954994
3,3,0.982489,0.600781,-0.786963,0.959175
4,4,0.941875,-0.885992,-1.094204,1.485734


=======================================================================================================================

Criando uma base - digitando cada um dos valores

In [8]:
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}


In [9]:
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25
1,Molly,Jacobson,52,24,94
2,Tina,Ali,36,31,57
3,Jake,Milner,24,2,62
4,Amy,Cooze,73,3,70


========================================================================================================================

Buscando uma base na internet

In [10]:
gapminder_url='https://bit.ly/2cLzoxH'
gapminder = pd.read_csv(gapminder_url)
gapminder.head()

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
0,Afghanistan,1952,8425333.0,Asia,28.801,779.445314
1,Afghanistan,1957,9240934.0,Asia,30.332,820.85303
2,Afghanistan,1962,10267083.0,Asia,31.997,853.10071
3,Afghanistan,1967,11537966.0,Asia,34.02,836.197138
4,Afghanistan,1972,13079460.0,Asia,36.088,739.981106


========================================================================================================================

Selecionando somente um continente/categoria

In [11]:
gapminder_ocean = gapminder[(gapminder.year >2000) &
                            (gapminder.continent== 'Oceania')]
gapminder_ocean.shape
gapminder_ocean

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
70,Australia,2002,19546792.0,Oceania,80.37,30687.75473
71,Australia,2007,20434176.0,Oceania,81.235,34435.36744
1102,New Zealand,2002,3908037.0,Oceania,79.11,23189.80135
1103,New Zealand,2007,4115771.0,Oceania,80.204,25185.00911


========================================================================================================================

Excluindo uma variável/coluna

In [12]:
gapminder_ocean.drop(['continent'], axis=1)

Unnamed: 0,country,year,pop,lifeExp,gdpPercap
70,Australia,2002,19546792.0,80.37,30687.75473
71,Australia,2007,20434176.0,81.235,34435.36744
1102,New Zealand,2002,3908037.0,79.11,23189.80135
1103,New Zealand,2007,4115771.0,80.204,25185.00911


Excluindo (drop) várias colunas em um data frame

In [13]:
gapminder_ocean.drop(['pop', 'gdpPercap', 'continent'], axis=1)

Unnamed: 0,country,year,lifeExp
70,Australia,2002,80.37
71,Australia,2007,81.235
1102,New Zealand,2002,79.11
1103,New Zealand,2007,80.204


Excluindo linhas específica

In [14]:
gapminder_ocean.drop([70,71],axis=0)

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
1102,New Zealand,2002,3908037.0,Oceania,79.11,23189.80135
1103,New Zealand,2007,4115771.0,Oceania,80.204,25185.00911


Recarregando a base

In [15]:
gapminder_url='https://bit.ly/2cLzoxH'
gapminder = pd.read_csv(gapminder_url)
gapminder.head()

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
0,Afghanistan,1952,8425333.0,Asia,28.801,779.445314
1,Afghanistan,1957,9240934.0,Asia,30.332,820.85303
2,Afghanistan,1962,10267083.0,Asia,31.997,853.10071
3,Afghanistan,1967,11537966.0,Asia,34.02,836.197138
4,Afghanistan,1972,13079460.0,Asia,36.088,739.981106


========================================================================================================================

Extraindo a frequencia de observações

In [16]:
gapminder['continent'].value_counts()

Africa      624
Asia        396
Europe      360
Americas    300
Oceania      24
Name: continent, dtype: int64

Salvando o arquivo gapminder no diretorio do notebook ipython

In [17]:
gapminder.to_csv("gapminder.csv")

Descobrindo os formatos das variáveis

In [18]:
print(gapminder.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
year         1704 non-null int64
pop          1704 non-null float64
continent    1704 non-null object
lifeExp      1704 non-null float64
gdpPercap    1704 non-null float64
dtypes: float64(3), int64(1), object(2)
memory usage: 80.0+ KB
None


In [19]:
gapminder.dtypes

country       object
year           int64
pop          float64
continent     object
lifeExp      float64
gdpPercap    float64
dtype: object

========================================================================================================================

Extraindo os valores únicos de uma variável/coluna

In [20]:
gapminder['continent'].unique()

array(['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'], dtype=object)

Transformando em uma lista 

In [21]:
gapminder['continent'].unique().tolist()

['Asia', 'Europe', 'Africa', 'Americas', 'Oceania']

Extraindo novamente os valores unicos

In [22]:
gapminder['country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium',
       'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo Dem. Rep.', 'Congo Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Czech Republic',
       'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Ethiopia',
       'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti',
       'Honduras', 'Hong Kong China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Korea Dem. Rep.',
       'Korea Rep.', 'Kuwait', 'Lebanon',

Contando os valores únicos

In [23]:
len(gapminder['country'].unique().tolist())

142

In [24]:
len(gapminder['continent'].unique().tolist())

5

# Realizando agregação de dados

Carregando as bibliotecas novamente

In [25]:
import pandas as pd
from IPython.display import display
from IPython.display import Image

Criando a primeira base (dataframe)

In [26]:
raw_data = {
        'subject_id': ['1', '2', '3', '4', '5'],
        'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], 
        'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


Criando a segunda base

In [27]:
raw_data = {
        'subject_id': ['4', '5', '6', '7', '8'],
        'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'], 
        'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


Criando a terceira base

In [28]:
raw_data = {
        'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
        'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
df_n

Unnamed: 0,subject_id,test_id
0,1,51
1,2,15
2,3,15
3,4,61
4,5,16
5,7,14
6,8,15
7,9,1
8,10,61
9,11,16


Agregando bases de dados por concatenação de linhas

In [29]:
df_new = pd.concat([df_a, df_b])
df_new

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


Agregando bases de dados por concatenação de colunas

In [30]:
pd.concat([df_a, df_b], axis=1)

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner
3,4,Alice,Aoni,7,Bryce,Brice
4,5,Ayoung,Atiches,8,Betty,Btisan


Agregando dois dataframes considerando chave primaria

In [31]:
pd.merge(df_new, df_n, on='subject_id')

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,4,Billy,Bonder,61
5,5,Ayoung,Atiches,16
6,5,Brian,Black,16
7,7,Bryce,Brice,14
8,8,Betty,Btisan,15


Junte os dois quadros de dados ao longo das colunas

In [32]:
pd.concat([df_a, df_b], axis=1)

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner
3,4,Alice,Aoni,7,Bryce,Brice
4,5,Ayoung,Atiches,8,Betty,Btisan


Mesclar dois quadros de dados ao longo do valor subject_id

In [33]:
pd.merge(df_new, df_n, on='subject_id')

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,4,Billy,Bonder,61
5,5,Ayoung,Atiches,16
6,5,Brian,Black,16
7,7,Bryce,Brice,14
8,8,Betty,Btisan,15


Mesclar com junção externa (Outer Inner Join)

“A junção externa completa produz o conjunto de todos os registros na Tabela A e na Tabela B,
com registros correspondentes de ambos os lados, quando disponíveis. 
Se não houver correspondência, o lado ausente conterá nulo.”

In [34]:
pd.merge(df_a, df_b, on='subject_id', how='outer')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black
5,6,,,Bran,Balwner
6,7,,,Bryce,Brice
7,8,,,Betty,Btisan



Mesclar com junção interna (Inner Join)


“Junção interna produz apenas o conjunto de registros que correspondem na Tabela A e na Tabela B.”

In [35]:
pd.merge(df_a, df_b, on='subject_id', how='inner')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


Mesclar com junção direita (right join)

In [36]:
pd.merge(df_a, df_b, on='subject_id', how='right')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black
2,6,,,Bran,Balwner
3,7,,,Bryce,Brice
4,8,,,Betty,Btisan



Mesclar com junção à esquerda

"Junção externa esquerda produz um conjunto completo de registros da Tabela A, 
com os registros correspondentes (quando disponíveis) na Tabela B. 
Se não houver correspondência, o lado direito conterá nulo."


In [37]:
pd.merge(df_a, df_b, on='subject_id', how='left')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black




Mesclar e adicionar um sufixo para duplicar os nomes das colunas


In [38]:
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right'))

Unnamed: 0,subject_id,first_name_left,last_name_left,first_name_right,last_name_right
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black


Mesclar baseado em indices

In [39]:
pd.merge(df_a, df_b, right_index=True, left_index=True)

Unnamed: 0,subject_id_x,first_name_x,last_name_x,subject_id_y,first_name_y,last_name_y
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner
3,4,Alice,Aoni,7,Bryce,Brice
4,5,Ayoung,Atiches,8,Betty,Btisan


# Realizando o cruzamento de dados

Criando uma base de dados

In [40]:
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], 
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


Criando um agrupamento (groupby variable) de preTestScores por regiment

In [41]:

groupby_regiment = df['preTestScore'].groupby(df['regiment'])
groupby_regiment

<pandas.core.groupby.SeriesGroupBy object at 0x000001E3777936A0>

In [42]:
list(df['preTestScore'].groupby(df['regiment']))

[('Dragoons', 4     3
  5     4
  6    24
  7    31
  Name: preTestScore, dtype: int64), ('Nighthawks', 0     4
  1    24
  2    31
  3     2
  Name: preTestScore, dtype: int64), ('Scouts', 8     2
  9     3
  10    2
  11    3
  Name: preTestScore, dtype: int64)]

Estatistica descritivas 

In [43]:
df['preTestScore'].groupby(df['regiment']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
regiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Dragoons,4.0,15.5,14.153916,3.0,3.75,14.0,25.75,31.0
Nighthawks,4.0,15.25,14.45395,2.0,3.5,14.0,25.75,31.0
Scouts,4.0,2.5,0.57735,2.0,2.0,2.5,3.0,3.0


In [44]:
groupby_regiment.mean()

regiment
Dragoons      15.50
Nighthawks    15.25
Scouts         2.50
Name: preTestScore, dtype: float64

In [45]:
df['preTestScore'].groupby([df['regiment'], df['company']]).mean()

regiment    company
Dragoons    1st         3.5
            2nd        27.5
Nighthawks  1st        14.0
            2nd        16.5
Scouts      1st         2.5
            2nd         2.5
Name: preTestScore, dtype: float64

In [46]:
df['preTestScore'].groupby([df['regiment'], df['company']]).mean().unstack()

company,1st,2nd
regiment,Unnamed: 1_level_1,Unnamed: 2_level_1
Dragoons,3.5,27.5
Nighthawks,14.0,16.5
Scouts,2.5,2.5


In [47]:
df.groupby(['regiment', 'company']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,preTestScore,postTestScore
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1
Dragoons,1st,3.5,47.5
Dragoons,2nd,27.5,75.5
Nighthawks,1st,14.0,59.5
Nighthawks,2nd,16.5,59.5
Scouts,1st,2.5,66.0
Scouts,2nd,2.5,66.0


In [48]:
df.groupby(['regiment', 'company']).size()

regiment    company
Dragoons    1st        2
            2nd        2
Nighthawks  1st        2
            2nd        2
Scouts      1st        2
            2nd        2
dtype: int64

Interações sobre grupos

Agrupando por regimentos 

In [49]:
# agrupa o dataframe por regimento
for name, group in df.groupby('regiment'): 
    # imprime o nome do regimento
    print(name)
    # imprime a data do regimento
    print(group)

Dragoons
   regiment company    name  preTestScore  postTestScore
4  Dragoons     1st   Cooze             3             70
5  Dragoons     1st   Jacon             4             25
6  Dragoons     2nd  Ryaner            24             94
7  Dragoons     2nd    Sone            31             57
Nighthawks
     regiment company      name  preTestScore  postTestScore
0  Nighthawks     1st    Miller             4             25
1  Nighthawks     1st  Jacobson            24             94
2  Nighthawks     2nd       Ali            31             57
3  Nighthawks     2nd    Milner             2             62
Scouts
   regiment company   name  preTestScore  postTestScore
8    Scouts     1st  Sloan             2             62
9    Scouts     1st  Piger             3             70
10   Scouts     2nd  Riani             2             62
11   Scouts     2nd    Ali             3             70


In [50]:
# agrupa os regimentos por data
for name, group in df.groupby('regiment'): 
    # imprime a data do regimento
    print(group)

   regiment company    name  preTestScore  postTestScore
4  Dragoons     1st   Cooze             3             70
5  Dragoons     1st   Jacon             4             25
6  Dragoons     2nd  Ryaner            24             94
7  Dragoons     2nd    Sone            31             57
     regiment company      name  preTestScore  postTestScore
0  Nighthawks     1st    Miller             4             25
1  Nighthawks     1st  Jacobson            24             94
2  Nighthawks     2nd       Ali            31             57
3  Nighthawks     2nd    Milner             2             62
   regiment company   name  preTestScore  postTestScore
8    Scouts     1st  Sloan             2             62
9    Scouts     1st  Piger             3             70
10   Scouts     2nd  Riani             2             62
11   Scouts     2nd    Ali             3             70


Especificamente neste caso: agrupe pelos tipos de dados das colunas (ou seja, eixo = 1) e, em seguida, use list () para visualizar como esse agrupamento se parece

In [51]:
list(df.groupby(df.dtypes, axis=1))

[(dtype('int64'),     preTestScore  postTestScore
  0              4             25
  1             24             94
  2             31             57
  3              2             62
  4              3             70
  5              4             25
  6             24             94
  7             31             57
  8              2             62
  9              3             70
  10             2             62
  11             3             70),
 (dtype('O'),       regiment company      name
  0   Nighthawks     1st    Miller
  1   Nighthawks     1st  Jacobson
  2   Nighthawks     2nd       Ali
  3   Nighthawks     2nd    Milner
  4     Dragoons     1st     Cooze
  5     Dragoons     1st     Jacon
  6     Dragoons     2nd    Ryaner
  7     Dragoons     2nd      Sone
  8       Scouts     1st     Sloan
  9       Scouts     1st     Piger
  10      Scouts     2nd     Riani
  11      Scouts     2nd       Ali)]

No dataframe “df”, agrupe por “regimentos, pegue os valores médios das outras variáveis ​​para esses grupos e, em seguida, exiba-os com o prefixo_mean

In [52]:
df.groupby('regiment').mean().add_prefix('mean_')

Unnamed: 0_level_0,mean_preTestScore,mean_postTestScore
regiment,Unnamed: 1_level_1,Unnamed: 2_level_1
Dragoons,15.5,61.5
Nighthawks,15.25,59.5
Scouts,2.5,66.0


Criando uma função para obter as estatísticas dos grupos

In [53]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}

criando as categorias e definindo os valores 

In [54]:
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)

Aplicando a função get_stats() para cada categoria de postTestScore

In [55]:
df['postTestScore'].groupby(df['categories']).apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Low,2.0,25.0,25.0,25.0
Okay,0.0,,,
Good,8.0,70.0,63.75,57.0
Great,2.0,94.0,94.0,94.0


# Criando uma variável em um dataframe

Criando um dataframe vazio

In [56]:
df = pd.DataFrame()


df['nomes'] = ['John', 'Steve', 'Sarah']


df

Unnamed: 0,nomes
0,John
1,Steve
2,Sarah


criando uma nova coluna com as idades

In [58]:

df.assign(idade = [31, 32, 19])

Unnamed: 0,nomes,idade
0,John,31
1,Steve,32
2,Sarah,19


# Estatísticas Descritivas

Criando uma base de dados

In [None]:
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, columns = ['name', 'age', 'preTestScore', 'postTestScore'])
df

In [None]:
df['age'].sum()

In [None]:
df['preTestScore'].mean()

In [None]:
df['preTestScore'].cumsum()

In [None]:
df['preTestScore'].describe()

In [None]:
df['preTestScore'].count()

In [None]:
df['preTestScore'].min()

In [None]:
df['preTestScore'].max()

In [None]:
df['preTestScore'].median()

In [None]:
df['preTestScore'].var()

In [None]:
df['preTestScore'].std()

In [None]:
df['preTestScore'].skew()

In [None]:
df['preTestScore'].kurt()

In [None]:
df.corr()

In [None]:
df.cov()

=========================================================================================================

# Detectando Outliers

In [None]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

Criando dados simulados

In [None]:
X, _ = make_blobs(n_samples = 10,
                  n_features = 2,
                  centers = 1,
                  random_state = 1)

Substituindo os valores

In [None]:
X[0,0] = 10000
X[0,1] = 10000

In [None]:
X

In [None]:
# Criando um detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Ajustando o detector
outlier_detector.fit(X)

# Predizendo o outlier
outlier_detector.predict(X)

In [None]:

data2 = {'nome': ['Joao', 'Cassia', 'Tina', 'Julio', 'Maria'], 
        'idade': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
dfnovo = pd.DataFrame(data2, columns = ['nome', 'idade', 'preTestScore', 'postTestScore'])
dfnovo


In [None]:
import random

In [None]:
amostra = dfnovo.sample(2)

In [None]:
amostra
