# Estructuras de datos

## Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Wrap a NumPy array in a Series; with no index given, pandas labels the rows 0..n-1.
valores = np.arange(1, 6)
serie = pd.Series(data=valores)
serie

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
serie.index

RangeIndex(start=0, stop=5, step=1)

In [4]:
serie = pd.Series(valores, index = ['a','b','c','d','e'])
serie

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
serie.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [6]:
# A dict maps directly onto a Series: keys become the index labels, values the data.
d = dict(zip('abcde', range(1, 6)))
serie = pd.Series(d)
serie

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [9]:
serie = pd.Series(d, index = ['b','c','a','e','d'])
serie

b    2
c    3
a    1
e    5
d    4
dtype: int64

In [10]:
np.sum(serie)

15

In [11]:
np.max(serie)

5

In [12]:
serie

b    2
c    3
a    1
e    5
d    4
dtype: int64

In [13]:
serie['b']

2

In [16]:
# Positional access: use .iloc explicitly. Plain serie[0] on a string-labelled
# index relies on a deprecated integer-to-position fallback.
serie.iloc[0]

2

In [17]:
serie.dtype

dtype('int64')

In [18]:
array = serie.to_numpy()
print(type(array))
array

<class 'numpy.ndarray'>


array([2, 3, 1, 5, 4])

In [19]:
serie

b    2
c    3
a    1
e    5
d    4
dtype: int64

In [22]:
serie['e']

5

In [24]:
serie['f'] = 6
serie

b    2
c    3
a    1
e    5
d    4
f    6
dtype: int64

In [25]:
'e' in serie

True

In [26]:
'h' in serie

False

In [29]:
serie.keys()

Index(['b', 'c', 'a', 'e', 'd', 'f'], dtype='object')

In [33]:
serie.values

array([2, 3, 1, 5, 4, 6])

In [34]:
serie

b    2
c    3
a    1
e    5
d    4
f    6
dtype: int64

In [35]:
serie * 2

b     4
c     6
a     2
e    10
d     8
f    12
dtype: int64

In [36]:
serie.keys()

Index(['b', 'c', 'a', 'e', 'd', 'f'], dtype='object')

In [40]:
# Second Series sharing the same labels as `serie`, filled with random integers in [0, 10).
# NOTE(review): np.random.randint is called without a seed, so these values
# (and the sums computed from them in later cells) change on every run.
serie2 = pd.Series(np.random.randint(0,10,6), index = ['b', 'c', 'a', 'e', 'd', 'f'])
serie2

b    2
c    4
a    0
e    7
d    4
f    8
dtype: int64

In [41]:
serie + serie2

b     4
c     7
a     1
e    12
d     8
f    14
dtype: int64

In [42]:
serie2['g'] = 10

In [43]:
serie

b    2
c    3
a    1
e    5
d    4
f    6
dtype: int64

In [44]:
serie2

b     2
c     4
a     0
e     7
d     4
f     8
g    10
dtype: int64

In [45]:
serie + serie2

a     1.0
b     4.0
c     7.0
d     8.0
e    12.0
f    14.0
g     NaN
dtype: float64

## DataFrame

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Dict of Series -> DataFrame: every key becomes a column name.
d = dict(
    col1=pd.Series([1.0, 2.0, 3.0, 4.0]),
    col2=pd.Series([4.0, 3.0, 2.0, 1.0]),
)

df = pd.DataFrame(data=d)
df

Unnamed: 0,col1,col2
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [4]:
d = {'col1': [1., 2., 3., 4.],
     'col2': [4., 3., 2., 1.]}

df = pd.DataFrame(d, index = ['a','b','c','e'])
df

Unnamed: 0,col1,col2
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
e,4.0,1.0


In [5]:
# List of row dicts -> DataFrame. Only the last record carries 'col3', so the
# other rows are left empty (NaN) in that column.
l = [
    dict(col1=1, col2=4),
    dict(col1=2, col2=3),
    dict(col1=3, col2=2),
    dict(col1=4, col2=1, col3=3),
]

df = pd.DataFrame(l, index=list('abce'))
df

Unnamed: 0,col1,col2,col3
a,1,4,
b,2,3,
c,3,2,
e,4,1,3.0


In [6]:
df

Unnamed: 0,col1,col2,col3
a,1,4,
b,2,3,
c,3,2,
e,4,1,3.0


In [7]:
df['col2']

a    4
b    3
c    2
e    1
Name: col2, dtype: int64

In [70]:
type(df['col2'])

pandas.core.series.Series

In [69]:
# Derived column: element-wise sum of two existing columns, aligned on the row index.
df['col4'] = df['col1'].add(df['col2'])
df

Unnamed: 0,col1,col2,col3,col4
a,1,4,,5
b,2,3,,5
c,3,2,,5
e,4,1,3.0,5


In [71]:
del df['col3']
df

Unnamed: 0,col1,col2,col4
a,1,4,5
b,2,3,5
c,3,2,5
e,4,1,5


In [73]:
df.insert(0, 'col0', df['col4'] + df['col2'])
df

Unnamed: 0,col0,col1,col2,col4
a,9,1,4,5
b,8,2,3,5
c,7,3,2,5
e,6,4,1,5


In [74]:
df = df.assign(col3 = 3)
df

Unnamed: 0,col0,col1,col2,col4,col3
a,9,1,4,5,3
b,8,2,3,5,3
c,7,3,2,5,3
e,6,4,1,5,3


## Lectura y escritura

In [11]:
import numpy as np
import pandas as pd

In [12]:
from pathlib import Path
data_path = Path('./data')

In [13]:
dataset = pd.read_csv(data_path / 'dataset.csv', sep = ',')
dataset.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [14]:
pd.read_csv(data_path / 'dataset.csv', sep = ',', usecols=['Gender','Age']).head()


Unnamed: 0,Gender,Age
0,Male,41
1,Male,54
2,Male,42
3,Male,40
4,Male,46


In [15]:
# Persist the first 100 rows as JSON so the read cells below have their inputs:
#   - orient='records': a list of row objects
#   - orient='index':   an object keyed by row label
primeras_100 = dataset.head(100)
primeras_100.to_json(data_path / 'dataset_records.json', orient='records')

# dataset_index.json is read by the next cell but no visible cell writes it;
# create it only if it is missing so an existing file is left untouched.
ruta_index = data_path / 'dataset_index.json'
if not ruta_index.exists():
    primeras_100.to_json(ruta_index, orient='index')

In [16]:
ds = pd.read_json(data_path / 'dataset_index.json', orient='index')
ds.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367,No
1,2,Dallas,Male,54,45084,No
2,3,Dallas,Male,42,52483,No
3,4,Dallas,Male,40,40941,No
4,5,Dallas,Male,46,50289,No


In [19]:
# Intentional failure demo: the file was written with orient='records', so
# parsing it with orient='index' cannot work. Catch and print the error so the
# notebook still runs top to bottom; `ds` keeps its previous value.
try:
    pd.read_json(data_path / 'dataset_records.json', orient='index')
except (AttributeError, ValueError) as err:
    print(f'{type(err).__name__}: {err}')

AttributeError: 'list' object has no attribute 'values'

In [99]:
ds = pd.read_json(data_path / 'dataset_records.json', orient='records')
ds.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367,No
1,2,Dallas,Male,54,45084,No
2,3,Dallas,Male,42,52483,No
3,4,Dallas,Male,40,40941,No
4,5,Dallas,Male,46,50289,No


In [108]:
# Fixed-width layout: one character width per column, left to right.
widths = [1, 6, 4, 2, 7, 2]
# header=None treats the first line as data; column names are supplied at read time.
ds_fijo = pd.read_fwf(
    data_path / 'dataset_fijo',
    widths=widths,
    header=None,
    names=['Number', 'City', 'Gender', 'Age', 'Income', 'Illness'],
)
ds_fijo

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [114]:
ds_parquet = pd.read_parquet(data_path / 'dataset.parquet', engine='pyarrow')
ds_parquet.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [115]:
df_to_write = dataset.head(100)
df_to_write.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [116]:
df_to_write.to_csv(data_path / 'df_to_write.csv', index = False, sep = '#', header = False)

In [117]:
df_to_write.to_json(data_path / 'df_to_write.json', orient = 'split')

In [120]:
df_to_write.to_parquet(data_path / 'df_to_write.parquet', engine = 'pyarrow')

## Indexado y selección de datos - Parte I

In [20]:
import numpy as np
import pandas as pd

from pathlib import Path
data_path = Path('./data')

In [21]:
dataset = pd.read_csv(data_path / 'dataset.csv', sep = ',')
dataset.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [22]:
dataset['City'].value_counts()

City
New York City      50307
Los Angeles        32173
Dallas             19707
Mountain View      14219
Austin             12292
Boston              8301
Washington D.C.     8120
San Diego           4881
Name: count, dtype: int64

In [23]:
dataset['City'] == 'Dallas'

0          True
1          True
2          True
3          True
4          True
          ...  
149995    False
149996    False
149997    False
149998    False
149999    False
Name: City, Length: 150000, dtype: bool

In [24]:
dallas_city = dataset[dataset['City'] == 'Dallas']
dallas_city['City'].value_counts()

City
Dallas    19707
Name: count, dtype: int64

In [25]:
# Both conditions must hold: element-wise AND of the two boolean masks.
dallas_female = dataset.loc[dataset['City'].eq('Dallas') & dataset['Gender'].eq('Female')]
dallas_female['Gender'].value_counts()

Gender
Female    8705
Name: count, dtype: int64

In [26]:
dataset.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [27]:
dataset.loc[1]

Number           2
City        Dallas
Gender        Male
Age             54
Income     45084.0
Illness         No
Name: 1, dtype: object

In [28]:
dataset1 = dataset.set_index(np.arange(10, len(dataset)+10))
dataset1.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
10,1,Dallas,Male,41,40367.0,No
11,2,Dallas,Male,54,45084.0,No
12,3,Dallas,Male,42,52483.0,No
13,4,Dallas,Male,40,40941.0,No
14,5,Dallas,Male,46,50289.0,No


In [30]:
# Intentional failure demo: .loc looks up index labels, and dataset1's labels
# start at 10, so label 1 does not exist. Catch and print the error so the
# notebook still runs top to bottom.
try:
    dataset1.loc[1]
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: 1

In [51]:
dataset1.loc[10]

Number          1
City       Dallas
Gender       Male
Age            41
Income      40367
Illness        No
Name: 10, dtype: object

In [52]:
dataset1.loc[10, 'City']

'Dallas'

In [53]:
dataset1.loc[10, ['City', 'Age']]

City    Dallas
Age         41
Name: 10, dtype: object

In [54]:
dataset1.loc[:, ['City','Age']]

Unnamed: 0,City,Age
10,Dallas,41
11,Dallas,54
12,Dallas,42
13,Dallas,40
14,Dallas,46
...,...,...
150005,Austin,48
150006,Austin,25
150007,Austin,26
150008,Austin,25


In [55]:
dataset1.iloc[0]

Number          1
City       Dallas
Gender       Male
Age            41
Income      40367
Illness        No
Name: 10, dtype: object

In [56]:
dataset1.loc[10]

Number          1
City       Dallas
Gender       Male
Age            41
Income      40367
Illness        No
Name: 10, dtype: object

In [66]:
dataset1.iloc[0, 1]

'Dallas'

In [68]:
dataset.iloc[0, 1:3]

City      Dallas
Gender      Male
Name: 0, dtype: object

In [74]:
dataset.iat[0,1]

'Dallas'

In [80]:
dataset.loc[0, 'City']

'Dallas'

In [81]:
dataset.at[0, 'City']

'Dallas'

In [57]:
dataset.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [61]:
dataset.loc[1:3]

Unnamed: 0,Number,City,Gender,Age,Income,Illness
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No


In [59]:
dataset.loc[4:]

Unnamed: 0,Number,City,Gender,Age,Income,Illness
4,5,Dallas,Male,46,50289.0,No
5,6,Dallas,Female,36,50786.0,No
6,7,Dallas,Female,32,33155.0,No
7,8,Dallas,Male,39,30914.0,No
8,9,Dallas,Male,51,68667.0,No
...,...,...,...,...,...,...
149995,149996,Austin,Male,48,93669.0,No
149996,149997,Austin,Male,25,96748.0,No
149997,149998,Austin,Male,26,111885.0,No
149998,149999,Austin,Male,25,111878.0,No


In [60]:
dataset.loc[:3]

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No


## Indexado y selección de datos - Parte II

In [76]:
# Draw a 10% random sample of the rows. A fixed random_state makes the same
# rows come back on every run, so the notebook is reproducible.
r = dataset.sample(frac=0.1, random_state=42)
r.shape

(15000, 6)

In [83]:
dataset.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [84]:
dataset[dataset['Age'].isin([41,42])]

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
2,3,Dallas,Male,42,52483.0,No
13,14,Dallas,Female,42,50894.0,No
35,36,Dallas,Male,41,50312.0,No
37,38,Dallas,Female,41,29538.0,No
...,...,...,...,...,...,...
149805,149806,Austin,Female,41,71121.0,No
149839,149840,Austin,Male,41,92839.0,No
149890,149891,Austin,Male,41,87493.0,No
149923,149924,Austin,Female,42,80570.0,No


In [85]:
dataset['Age'].where(dataset['Age'] == 41)

0         41.0
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
149995     NaN
149996     NaN
149997     NaN
149998     NaN
149999     NaN
Name: Age, Length: 150000, dtype: float64

In [86]:
dataset['Age'].where(dataset['Age'] == 41, -1)

0         41
1         -1
2         -1
3         -1
4         -1
          ..
149995    -1
149996    -1
149997    -1
149998    -1
149999    -1
Name: Age, Length: 150000, dtype: int64

In [87]:
dataset.query('Age == 41')

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
35,36,Dallas,Male,41,50312.0,No
37,38,Dallas,Female,41,29538.0,No
41,42,Dallas,Male,41,68522.0,No
74,75,Dallas,Female,41,27897.0,No
...,...,...,...,...,...,...
149715,149716,Austin,Male,41,94296.0,No
149805,149806,Austin,Female,41,71121.0,No
149839,149840,Austin,Male,41,92839.0,No
149890,149891,Austin,Male,41,87493.0,No


In [88]:
dataset.query('City == "Dallas" and Age == 41')

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
35,36,Dallas,Male,41,50312.0,No
37,38,Dallas,Female,41,29538.0,No
41,42,Dallas,Male,41,68522.0,No
74,75,Dallas,Female,41,27897.0,No
...,...,...,...,...,...,...
19524,19525,Dallas,Female,41,32583.0,No
19543,19544,Dallas,Female,41,38234.0,No
19573,19574,Dallas,Male,41,56297.0,No
19594,19595,Dallas,Male,41,49911.0,No


In [89]:
dataset.query('index < 4')

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No


In [90]:
dataset.query('Age in (41,42)')

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
2,3,Dallas,Male,42,52483.0,No
13,14,Dallas,Female,42,50894.0,No
35,36,Dallas,Male,41,50312.0,No
37,38,Dallas,Female,41,29538.0,No
...,...,...,...,...,...,...
149805,149806,Austin,Female,41,71121.0,No
149839,149840,Austin,Male,41,92839.0,No
149890,149891,Austin,Male,41,87493.0,No
149923,149924,Austin,Female,42,80570.0,No


In [92]:
ciudad_buscada = "Dallas"

# Reference the Python variable with @ instead of splicing it into the query
# text: no manual quoting is needed, and a value containing quotes cannot
# break or alter the expression.
dataset.query('City == @ciudad_buscada')

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No
...,...,...,...,...,...,...
19702,19703,Dallas,Female,59,30021.0,No
19703,19704,Dallas,Male,33,34643.0,No
19704,19705,Dallas,Male,33,53190.0,No
19705,19706,Dallas,Male,37,54265.0,No


In [93]:
f'City == "{ciudad_buscada}"'

'City == "Dallas"'

In [95]:
# One row per city: keeps the first occurrence and returns a new frame
# (both are the defaults), leaving `dataset` unchanged.
dataset.drop_duplicates(subset='City')

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
19707,19708,New York City,Male,49,112226.0,No
70014,70015,Los Angeles,Male,34,102868.0,No
102187,102188,Mountain View,Male,31,150367.0,No
116406,116407,Boston,Female,57,87004.0,No
124707,124708,Washington D.C.,Male,38,62295.0,No
132827,132828,San Diego,Female,39,105138.0,No
137708,137709,Austin,Male,53,103971.0,No


## Multiindex

In [2]:
import numpy as np
import pandas as pd

In [3]:
s = pd.Series(np.arange(12))
s

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
dtype: int64

In [5]:
s.index.values

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [10]:
# Two parallel label lists (outer level, inner level); pairing them position
# by position yields one (outer, inner) tuple per row.
array = [
    list('aaaabbbbcccc'),
    list('abcd') * 3,
]

t = [(primero, segundo) for primero, segundo in zip(*array)]
t

[('a', 'a'),
 ('a', 'b'),
 ('a', 'c'),
 ('a', 'd'),
 ('b', 'a'),
 ('b', 'b'),
 ('b', 'c'),
 ('b', 'd'),
 ('c', 'a'),
 ('c', 'b'),
 ('c', 'c'),
 ('c', 'd')]

In [11]:
mi = pd.MultiIndex.from_tuples(t, names=['primero','segundo'])
mi

MultiIndex([('a', 'a'),
            ('a', 'b'),
            ('a', 'c'),
            ('a', 'd'),
            ('b', 'a'),
            ('b', 'b'),
            ('b', 'c'),
            ('b', 'd'),
            ('c', 'a'),
            ('c', 'b'),
            ('c', 'c'),
            ('c', 'd')],
           names=['primero', 'segundo'])

In [12]:
s = pd.Series(np.arange(12), index = mi)
s

primero  segundo
a        a           0
         b           1
         c           2
         d           3
b        a           4
         b           5
         c           6
         d           7
c        a           8
         b           9
         c          10
         d          11
dtype: int64

In [48]:
s.loc[('a','b')]

1

In [50]:
s.loc['a']

segundo
a    0
b    1
c    2
d    3
dtype: int64

In [52]:
s.loc[:,'a']

primero
a    0
b    4
c    8
dtype: int64

In [13]:
from pathlib import Path
data_path = Path('./data')

In [14]:
dataset = pd.read_csv(data_path / 'dataset.csv', sep = ',')
dataset.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [15]:
# Two-level column labels: the outer level groups the columns, the inner level
# is the original column name. Order matches the column order of the dataset.
tuples = list(zip(
    ['Columnas_no_importantes', 'Columnas_importantes', 'Columnas_no_importantes',
     'Columnas_importantes', 'Columnas_importantes', 'Columnas_no_importantes'],
    ['Number', 'City', 'Gender', 'Age', 'Income', 'Illness'],
))

multiindex = pd.MultiIndex.from_tuples(tuples)
multiindex

MultiIndex([('Columnas_no_importantes',  'Number'),
            (   'Columnas_importantes',    'City'),
            ('Columnas_no_importantes',  'Gender'),
            (   'Columnas_importantes',     'Age'),
            (   'Columnas_importantes',  'Income'),
            ('Columnas_no_importantes', 'Illness')],
           )

In [16]:
dataset.columns = multiindex
dataset.head()

Unnamed: 0_level_0,Columnas_no_importantes,Columnas_importantes,Columnas_no_importantes,Columnas_importantes,Columnas_importantes,Columnas_no_importantes
Unnamed: 0_level_1,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [18]:
dataset['Columnas_importantes'].head()

Unnamed: 0,City,Age,Income
0,Dallas,41,40367.0
1,Dallas,54,45084.0
2,Dallas,42,52483.0
3,Dallas,40,40941.0
4,Dallas,46,50289.0


In [23]:
dataset.columns.levels

FrozenList([['Columnas_importantes', 'Columnas_no_importantes'], ['Age', 'City', 'Gender', 'Illness', 'Income', 'Number']])

In [24]:
dataset.columns.get_level_values(0)

Index(['Columnas_no_importantes', 'Columnas_importantes',
       'Columnas_no_importantes', 'Columnas_importantes',
       'Columnas_importantes', 'Columnas_no_importantes'],
      dtype='object')

In [20]:
multiindex.get_level_values(1)

Index(['Number', 'City', 'Gender', 'Age', 'Income', 'Illness'], dtype='object')

In [28]:
dataset.loc[:,('Columnas_importantes','City')]

0         Dallas
1         Dallas
2         Dallas
3         Dallas
4         Dallas
           ...  
149995    Austin
149996    Austin
149997    Austin
149998    Austin
149999    Austin
Name: (Columnas_importantes, City), Length: 150000, dtype: object

In [26]:
dataset[('Columnas_importantes','City')]

0         Dallas
1         Dallas
2         Dallas
3         Dallas
4         Dallas
           ...  
149995    Austin
149996    Austin
149997    Austin
149998    Austin
149999    Austin
Name: (Columnas_importantes, City), Length: 150000, dtype: object

In [29]:
dataset.loc[:,[('Columnas_importantes','City'),('Columnas_importantes','Age')]]

Unnamed: 0_level_0,Columnas_importantes,Columnas_importantes
Unnamed: 0_level_1,City,Age
0,Dallas,41
1,Dallas,54
2,Dallas,42
3,Dallas,40
4,Dallas,46
...,...,...
149995,Austin,48
149996,Austin,25
149997,Austin,26
149998,Austin,25


In [33]:
# Transpose the first five rows so the two-level column labels become the row index.
# NOTE(review): `_` is also the name IPython uses for the last cell output;
# assigning to it here shadows that, and a later cell reads `_` again, so a
# descriptive name shared by both cells would be clearer.
_ = dataset.head(5).T
_

Unnamed: 0,Unnamed: 1,0,1,2,3,4
Columnas_no_importantes,Number,1,2,3,4,5
Columnas_importantes,City,Dallas,Dallas,Dallas,Dallas,Dallas
Columnas_no_importantes,Gender,Male,Male,Male,Male,Male
Columnas_importantes,Age,41,54,42,40,46
Columnas_importantes,Income,40367,45084,52483,40941,50289
Columnas_no_importantes,Illness,No,No,No,No,No


In [34]:
_.index

MultiIndex([('Columnas_no_importantes',  'Number'),
            (   'Columnas_importantes',    'City'),
            ('Columnas_no_importantes',  'Gender'),
            (   'Columnas_importantes',     'Age'),
            (   'Columnas_importantes',  'Income'),
            ('Columnas_no_importantes', 'Illness')],
           )

In [37]:
dataset.head()

Unnamed: 0_level_0,Columnas_no_importantes,Columnas_importantes,Columnas_no_importantes,Columnas_importantes,Columnas_importantes,Columnas_no_importantes
Unnamed: 0_level_1,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [45]:
dataset.sort_index(level=0, axis = 1).head()

Unnamed: 0_level_0,Columnas_importantes,Columnas_importantes,Columnas_importantes,Columnas_no_importantes,Columnas_no_importantes,Columnas_no_importantes
Unnamed: 0_level_1,Age,City,Income,Gender,Illness,Number
0,41,Dallas,40367.0,Male,No,1
1,54,Dallas,45084.0,Male,No,2
2,42,Dallas,52483.0,Male,No,3
3,40,Dallas,40941.0,Male,No,4
4,46,Dallas,50289.0,Male,No,5


In [46]:
dataset.sort_index(level=1, axis = 1).head()

Unnamed: 0_level_0,Columnas_importantes,Columnas_importantes,Columnas_no_importantes,Columnas_no_importantes,Columnas_importantes,Columnas_no_importantes
Unnamed: 0_level_1,Age,City,Gender,Illness,Income,Number
0,41,Dallas,Male,No,40367.0,1
1,54,Dallas,Male,No,45084.0,2
2,42,Dallas,Male,No,52483.0,3
3,40,Dallas,Male,No,40941.0,4
4,46,Dallas,Male,No,50289.0,5


## Timeseries y Timestamp

In [31]:
import pandas as pd
import numpy as np
import datetime

In [32]:
ts = pd.Timestamp(2020, 1, 1)
ts

Timestamp('2020-01-01 00:00:00')

In [33]:
ts = pd.Timestamp('2020-01-01')
ts

Timestamp('2020-01-01 00:00:00')

In [34]:
dt = pd.to_datetime(['01/01/2020', np.datetime64('2020-01-02'), datetime.datetime(2020, 1, 3)])
dt

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)

In [35]:
dt_int = pd.date_range('2020-01-01', periods=5, freq='D')
dt_int

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05'],
              dtype='datetime64[ns]', freq='D')

In [36]:
dt_int = pd.date_range(start='2020-01-01', end='2020-01-10', freq='D')
dt_int

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10'],
              dtype='datetime64[ns]', freq='D')

In [37]:
s = pd.Series(np.arange(10), index = dt_int)
s


2020-01-01    0
2020-01-02    1
2020-01-03    2
2020-01-04    3
2020-01-05    4
2020-01-06    5
2020-01-07    6
2020-01-08    7
2020-01-09    8
2020-01-10    9
Freq: D, dtype: int32

In [38]:
s.resample('2D').mean()

2020-01-01    0.5
2020-01-03    2.5
2020-01-05    4.5
2020-01-07    6.5
2020-01-09    8.5
Freq: 2D, dtype: float64

In [39]:
# Monthly mean: all ten days fall in January, so a single month-end bucket results.
# NOTE(review): newer pandas releases reportedly deprecate the 'M' alias in
# favour of 'ME'; confirm against the installed version before changing it.
s.resample('1M').mean()

2020-01-31    4.5
Freq: M, dtype: float64

In [42]:
# Intentional failure demo: the two strings use different date formats, which
# to_datetime rejects when it infers a single format. Catch and print the
# error so the notebook still runs top to bottom.
try:
    pd.to_datetime(pd.Series(['Aug 01, 2020', '2020-01-02']))
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: time data "2020-01-02" doesn't match format "%b %d, %Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [43]:
# Default parsing reads an ambiguous 'xx/yy/yyyy' string month-first.
# A named variable is used instead of `_`, which IPython reserves for the
# last cell output.
fecha = pd.to_datetime('04/08/2020')
fecha.day

8

In [44]:
# dayfirst=True makes the same string parse day-first.
# A named variable is used instead of `_`, which IPython reserves for the
# last cell output.
fecha = pd.to_datetime('04/08/2020', dayfirst=True)
fecha.day

4

In [45]:
# An explicit format removes the ambiguity: here the first field is the day.
# A named variable is used instead of `_`, which IPython reserves for the
# last cell output.
fecha = pd.to_datetime('04/08/2020', format='%d/%m/%Y')
print(fecha)
fecha.day

2020-08-04 00:00:00


4

In [46]:
# Same string, but the explicit format now says the first field is the month.
# A named variable is used instead of `_`, which IPython reserves for the
# last cell output.
fecha = pd.to_datetime('04/08/2020', format='%m/%d/%Y')
print(fecha)
fecha.day

2020-04-08 00:00:00


8

In [49]:
# Intentional failure demo: 30 February does not exist, so parsing raises.
# Catch and print the error so the notebook still runs top to bottom.
try:
    pd.to_datetime('30/02/2020', format='%d/%m/%Y')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: day is out of range for month, at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [97]:
pd.to_datetime('30/02/2020', format='%d/%m/%Y', errors = 'coerce')

NaT

In [98]:
pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit='s')

DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05',
               '2012-10-10 18:15:05', '2012-10-11 18:15:05',
               '2012-10-12 18:15:05'],
              dtype='datetime64[ns]', freq=None)

In [114]:
# Custom business-day calendar: only Mondays, Tuesdays and Wednesdays count
# as business days (freq='C' enables the custom weekmask).
week_mask = 'Mon Tue Wed'

# A named variable is used instead of `_`, which IPython reserves for the
# last cell output.
dr = pd.bdate_range('2020-01-01', '2020-01-30', freq='C', weekmask=week_mask)
dr

DatetimeIndex(['2020-01-01', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-20',
               '2020-01-21', '2020-01-22', '2020-01-27', '2020-01-28',
               '2020-01-29'],
              dtype='datetime64[ns]', freq='C')

In [115]:
pd.to_datetime('2020-01-01').day_name()

'Wednesday'

In [6]:
week_mask = 'Mon Tue Wed'

dr = pd.bdate_range('2020-01-01', '2020-01-30', freq='C', weekmask=week_mask)
dr

DatetimeIndex(['2020-01-01', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-20',
               '2020-01-21', '2020-01-22', '2020-01-27', '2020-01-28',
               '2020-01-29'],
              dtype='datetime64[ns]', freq='C')

In [9]:
# isocalendar: returns a DataFrame with the ISO calendar year, week number and
# weekday (columns year / week / day) for every date in the range.
ywd = dr.isocalendar()
ywd

Unnamed: 0,year,week,day
2020-01-01,2020,1,3
2020-01-06,2020,2,1
2020-01-07,2020,2,2
2020-01-08,2020,2,3
2020-01-13,2020,3,1
2020-01-14,2020,3,2
2020-01-15,2020,3,3
2020-01-20,2020,4,1
2020-01-21,2020,4,2
2020-01-22,2020,4,3


In [11]:
dr.year

Int64Index([2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
            2020, 2020],
           dtype='int64')

In [13]:
dr.month

Int64Index([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='int64')

In [14]:
dr.day

Int64Index([1, 6, 7, 8, 13, 14, 15, 20, 21, 22, 27, 28, 29], dtype='int64')

In [15]:
dr.hour

Int64Index([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64')

In [16]:
dr.dayofyear

Int64Index([1, 6, 7, 8, 13, 14, 15, 20, 21, 22, 27, 28, 29], dtype='int64')

In [17]:
dr.weekday

Int64Index([2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2], dtype='int64')

In [20]:
dr.day_name()

Index(['Wednesday', 'Monday', 'Tuesday', 'Wednesday', 'Monday', 'Tuesday',
       'Wednesday', 'Monday', 'Tuesday', 'Wednesday', 'Monday', 'Tuesday',
       'Wednesday'],
      dtype='object')

## Timedelta

In [26]:
import pandas as pd
import numpy as np
import datetime

In [27]:
td = pd.Timedelta('10 days')
td

Timedelta('10 days 00:00:00')

In [61]:
td = pd.Timedelta(10, unit='d')
td

Timedelta('10 days 00:00:00')

In [62]:
td = pd.Timedelta('10 days 2 hours 20 minutes')
td

Timedelta('10 days 02:20:00')

In [63]:
dt_int = pd.date_range(start='2020-01-01', end='2020-03-10', freq='W')
dt_int

DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26',
               '2020-02-02', '2020-02-09', '2020-02-16', '2020-02-23',
               '2020-03-01', '2020-03-08'],
              dtype='datetime64[ns]', freq='W-SUN')

In [64]:
dt_int + td

DatetimeIndex(['2020-01-15 02:20:00', '2020-01-22 02:20:00',
               '2020-01-29 02:20:00', '2020-02-05 02:20:00',
               '2020-02-12 02:20:00', '2020-02-19 02:20:00',
               '2020-02-26 02:20:00', '2020-03-04 02:20:00',
               '2020-03-11 02:20:00', '2020-03-18 02:20:00'],
              dtype='datetime64[ns]', freq=None)

In [65]:
# Ten consecutive days starting on 2020-01-10 (i.e. through 2020-01-19).
dt_int2 = pd.date_range('2020-01-10', periods=10, freq='D')
dt_int2

DatetimeIndex(['2020-01-10', '2020-01-11', '2020-01-12', '2020-01-13',
               '2020-01-14', '2020-01-15', '2020-01-16', '2020-01-17',
               '2020-01-18', '2020-01-19'],
              dtype='datetime64[ns]', freq='D')

In [66]:
diff = dt_int - dt_int2
diff

TimedeltaIndex(['-5 days',  '1 days',  '7 days', '13 days', '19 days',
                '25 days', '31 days', '37 days', '43 days', '49 days'],
               dtype='timedelta64[ns]', freq=None)

In [69]:
diff.min()

Timedelta('-5 days +00:00:00')

In [70]:
diff.median()

Timedelta('22 days 00:00:00')

In [71]:
diff.max()

Timedelta('49 days 00:00:00')

In [72]:
diff

TimedeltaIndex(['-5 days',  '1 days',  '7 days', '13 days', '19 days',
                '25 days', '31 days', '37 days', '43 days', '49 days'],
               dtype='timedelta64[ns]', freq=None)

In [81]:
s = pd.Series(np.arange(10), index = diff)
s

-5 days    0
1 days     1
7 days     2
13 days    3
19 days    4
25 days    5
31 days    6
37 days    7
43 days    8
49 days    9
dtype: int64

In [85]:
s['7 days']

2

In [86]:
s['7 days':'25 days']

7 days     2
13 days    3
19 days    4
25 days    5
dtype: int64