In [2]:
import pandas as pd
# A float Series with an explicit string index 'a'..'d'.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
data['b']

0.5

In [4]:
'a' in data

True

In [5]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
list(data.items())


[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [7]:
data['e']  =1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [8]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [9]:
#masking

In [10]:
#masking
data[(data>0.3) &(data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [11]:
# fancy indexing
data[['a','e']]

a    0.25
e    1.25
dtype: float64

In [12]:
data = pd.Series(['a','b','c'], index = [1,3,5])
data

1    a
3    b
5    c
dtype: object

In [13]:
#explicit index when indexing
data[1]

'a'

In [14]:
#implicit index when slicing 
data[1:3]

3    b
5    c
dtype: object

In [15]:
data.loc[1]

'a'

In [16]:
data.loc[1:3]

1    a
3    b
dtype: object

In [17]:
data.iloc[1]

'b'

In [18]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [20]:
# Per-state area and population, sharing one state index, combined column-wise.
area = pd.Series([423967, 695662, 141297, 170312, 149995],
                 index=['California', 'Texas', 'New York', 'Florida', 'Illinois'])
pop = pd.Series([38332521, 26448193, 19651127, 19552860, 12882135],
                index=area.index)
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [21]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [22]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [23]:
data.area is data['area']

True

In [24]:
data.pop is data['pop']

False

In [25]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [26]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [27]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [28]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [29]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [32]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [33]:
data.loc[:'Illinois', : 'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [37]:
# `.ix` is no longer available on DataFrame (it raised AttributeError here).
# Reproduce the hybrid slice explicitly: positional rows via .iloc, then
# label-based columns via .loc (label slices include the end label 'pop').
data.iloc[:3].loc[:, :'pop']

In [38]:
data.loc[data.density > 100, ['pop', 'density']]


Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [39]:
data.iloc[0,2] = 90

In [40]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [42]:
data['Florida' :'Illinois']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [43]:
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [44]:
data[data.density>100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [45]:
import pandas as pd
import numpy as np

In [46]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [47]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),
                 columns = ['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [48]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [49]:
np.sin(df * np.pi /4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [50]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
 'New York': 19651127}, name='population')


In [51]:
population /area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [52]:
# Set union of the two label indexes. Use Index.union rather than `|`:
# the operator form is deprecated for set operations on Index objects.
area.index.union(population.index)


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [53]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [54]:
A.add(B, fill_value=0)


0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [56]:
A = pd.DataFrame(rng.randint(0,20,(2,2)),
                columns = list('AB'))
A

Unnamed: 0,A,B
0,0,11
1,11,16


In [5]:
import numpy as np
import pandas as pd
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [6]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [7]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [8]:
np.sin(df * np.pi / 4)


Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [9]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
    'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
     'New York': 19651127}, name='population')


In [10]:
population /area 

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [11]:
# NOTE(review): `|` as an Index set-union is deprecated in pandas; the next
# cell shows the Index.union equivalent, which should be preferred.
area.index | population.index

  area.index | population.index


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [13]:
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [14]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [16]:
A.add(B,fill_value = 0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [17]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [18]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [19]:
A + B


Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [20]:
fill = A.stack().mean()
A.add(B,fill_value = fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


In [21]:
A = rng.randint(10, size=(3, 4))
A
Z = rng.randint(5, size=(3, 4))
Z

array([[3, 1, 1, 0],
       [1, 4, 1, 3],
       [3, 3, 3, 4]])

In [22]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-1, -2,  2,  4],
       [ 3, -7,  1,  4]])

In [28]:
df = pd.DataFrame(A,columns = list('QRST'))
print (df)
print (df.iloc)
df - df.iloc[0]

   Q  R  S  T
0  3  8  2  4
1  2  6  4  8
2  6  1  3  8
<pandas.core.indexing._iLocIndexer object at 0x000001E6625E7900>


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-1,-2,2,4
2,3,-7,1,4


In [26]:
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,-5,0,-6,-4
1,-4,0,-2,2
2,5,0,2,7


In [29]:
halfrow = df.iloc[0, ::2]
halfrow

Q    3
S    2
Name: 0, dtype: int32

In [30]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-1.0,,2.0,
2,3.0,,1.0,


In [31]:
import numpy as np
import pandas as pd


In [32]:
vals1 = np.array([1,None,3,4])
vals1

array([1, None, 3, 4], dtype=object)

In [33]:
for dtype in ['object', 'int']:
    print("dtype =" , dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype = object
299 ms ± 24.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

dtype = int
7.42 ms ± 78 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [34]:
vals1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [35]:
vals2 = np.array([1,np.nan,3,4])
vals2.dtype

dtype('float64')

In [36]:
1 + np.nan

nan

In [37]:
0 * np.nan

nan

In [38]:
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [39]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)


(8.0, 1.0, 4.0)

In [40]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [41]:
x = pd.Series(range(2),dtype = int)
x

0    0
1    1
dtype: int32

In [42]:
data = pd.Series([1,np.nan,'hello',None])

In [43]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [44]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [45]:
data.dropna()

0        1
2    hello
dtype: object

In [46]:
df = pd.DataFrame([[1,np.nan,2],
                 [2,  3,     5],
                 [np.nan,4,  6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [47]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [48]:
df.dropna(axis ='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [49]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [50]:
df.dropna(axis = 'columns',how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [52]:
df.dropna(axis = 'rows',thresh  =3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [54]:
data = pd.Series([1,np.nan,2,None,3],index = list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [55]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [62]:
# forward-fill: propagate the previous valid value into each missing slot
# (fillna(method=...) is deprecated; .ffill() is the direct equivalent)
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [59]:
# back-fill: propagate the next valid value backward into each missing slot
# (fillna(method=...) is deprecated; .bfill() is the direct equivalent)
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [63]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [64]:
# Forward-fill along each row (axis=1); a leading NaN in a row stays NaN
# because there is no earlier value to propagate.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [65]:
import pandas as pd
import numpy as np

In [66]:
#the bad way
index = [('California', 2000), ('California', 2010),
        ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
             18976457, 19378102,
             20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [68]:
pop[('California',2010) : ('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [69]:
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [72]:
#the better way
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [73]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [75]:
pop[: , 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [76]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [77]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [78]:
pop_df = pd.DataFrame({'total':pop,
                      'under18': [926789,9284694,
                                 4687374,4318033,
                                 5906301,6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,926789
California,2010,37253956,9284694
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [79]:
df = pd.DataFrame(np.random.rand(4,2),
                 index = [['a','a','b', 'b'],[1,2,1,2]],
                 columns = ['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.46048,0.974922
a,2,0.882727,0.914559
b,1,0.154623,0.746279
b,2,0.661343,0.002926


In [80]:
data = {('California', 2000): 33871648,
         ('California', 2010): 37253956,
     ('Texas', 2000): 20851820,
     ('Texas', 2010): 25145561,
     ('New York', 2000): 18976457,
     ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [81]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])


MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [83]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [84]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [86]:
# Build the MultiIndex directly from its levels and the integer positions
# into those levels. The keyword is `codes`; the old `labels` name is no
# longer accepted (it raised TypeError here).
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [87]:
pop.index.names = ['state','year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [90]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013,2014],[1,2]],
                                  names = ['year' , 'vist'])
columns = pd.MultiIndex.from_product([['Bob','Guido', 'Sue'], ['HR', 'Temp']],
                                        names = ['subject' , 'type'])

# mock some data
data = np.round(np.random.randn(4,6),1)
data[:,::2] *=10
data += 37


#create the DataFrame
health_data = pd.DataFrame(data, index = index,columns = columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,vist,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,51.0,34.9,32.0,37.1,44.0,37.3
2013,2,46.0,37.8,29.0,36.1,49.0,37.4
2014,1,28.0,34.7,37.0,37.4,34.0,37.4
2014,2,35.0,37.8,24.0,37.5,40.0,38.2


In [92]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,vist,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,32.0,37.1
2013,2,29.0,36.1
2014,1,37.0,37.4
2014,2,24.0,37.5


In [93]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [94]:
pop['California' , 2000]

33871648

In [95]:
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [100]:
pop['California' : 'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [102]:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [103]:
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [104]:
pop[pop >22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [105]:
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,vist,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,51.0,34.9,32.0,37.1,44.0,37.3
2013,2,46.0,37.8,29.0,36.1,49.0,37.4
2014,1,28.0,34.7,37.0,37.4,34.0,37.4
2014,2,35.0,37.8,24.0,37.5,40.0,38.2


In [107]:
health_data['Guido' ,'HR']

year  vist
2013  1       32.0
      2       29.0
2014  1       37.0
      2       24.0
Name: (Guido, HR), dtype: float64

In [111]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,vist,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,51.0,34.9
2013,2,46.0,37.8


In [112]:
health_data.loc[:,('Bob', 'HR')]

year  vist
2013  1       51.0
      2       46.0
2014  1       28.0
      2       35.0
Name: (Bob, HR), dtype: float64

In [113]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]


Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,vist,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,51.0,32.0,44.0
2014,1,28.0,37.0,34.0


In [114]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data


char  int
a     1      0.887431
      2      0.888680
c     1      0.784430
      2      0.238775
b     1      0.137017
      2      0.720737
dtype: float64

In [116]:
try:
    data['a' : 'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [117]:
data = data.sort_index()
data

char  int
a     1      0.887431
      2      0.888680
b     1      0.137017
      2      0.720737
c     1      0.784430
      2      0.238775
dtype: float64

In [118]:
data['a':'b']

char  int
a     1      0.887431
      2      0.888680
b     1      0.137017
      2      0.720737
dtype: float64

In [119]:
pop


state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [120]:
pop.unstack(level = 0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [121]:
pop.unstack(level =1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [122]:
pop.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [123]:
pop_flat= pop.reset_index(name = 'population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [124]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [125]:
# Average the rows within each year. DataFrame.mean(level=...) is deprecated,
# so group on the 'year' index level explicitly and reduce with mean().
data_mean = health_data.groupby(level='year').mean()
data_mean


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,48.5,36.35,30.5,36.6,46.5,37.35
2014,31.5,36.25,30.5,37.45,37.0,37.8


In [126]:
# Average across subjects for each measurement type. The column level is
# grouped by transposing, grouping on the 'type' index level, and transposing
# back, which avoids the deprecated mean(axis=1, level=...) form.
data_mean.T.groupby(level='type').mean().T


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,41.833333,36.766667
2014,33.0,37.166667


In [127]:
import pandas as pd
import numpy as np

In [128]:
def make_df(cols, ind):
    """Build a small labelled DataFrame for demonstrations.

    Each column ``c`` in ``cols`` holds the strings ``str(c) + str(i)`` for
    every ``i`` in ``ind``; ``ind`` is also used as the row index.
    """
    columns = {}
    for col in cols:
        columns[col] = ["".join((str(col), str(row))) for row in ind]
    return pd.DataFrame(columns, index=ind)

#example DataFrame
make_df('ABC',range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [129]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [132]:
x = [[1,2], [3,4]]
np.concatenate([x,x],axis = 1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [133]:
# Reference only (not executable): pd.concat signature as of pandas v0.18.
# Running it verbatim raised NameError because `objs` is just a placeholder.
# NOTE(review): `join_axes` has since been removed from pd.concat; check the
# current documentation before relying on this parameter list.
#
# pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
#           keys=None, levels=None, names=None, verify_integrity=False,
#           copy=True)

In [134]:
ser1 = pd.Series(['A','B','C'], index = [1,2,3])
ser2 = pd.Series(['D','E','F'], index = [4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [3]:
import pandas as pd
import numpy as np

In [5]:
df1 = pd.DataFrame({'employee' :['Bob','Jake','Lisa','Sue'],
                   'group': ['Accounting','Engineering','Engineering','HR']})
df2 = pd.DataFrame({'employee' : ['Lisa', 'Bob','Jake','Sue'],
                   'hire_date' : [2004,2008,2012,2014]})
print(df1);print(df2)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014


In [6]:
df3 = pd.merge(df1,df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [7]:
df4 = pd.DataFrame({'group' :['Accounting' , 'Engineering' , 'HR'],
                   'supervisor' : ['Carly', 'Guido', 'Steve']})
print(df3); print(df4);print(pd.merge(df3,df4))

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
  employee        group  hire_date supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve


In [8]:
df5 = pd.DataFrame({'group': ['Accounting','Accounting',
                             'Engineering','Engineering','HR','HR'],
                   'skills' : ['math','spreadsheets','coding','linux',
                              'spreadsheets','organization']})
print(df1);print(df5);print(pd.merge(df1,df5))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization
  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting  spreadsheets
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR  spreadsheets
7      Sue           HR  organization


In [10]:
print(df1);print(df2);print(pd.merge(df1,df2, on ='employee'))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014


In [13]:
df3 = pd.DataFrame({'name': [ 'Bob','Jake', 'Lisa','Sue'],
             'salary': [70000,80000,120000,90000]})
print(df1); print(df3);
print(pd.merge(df1,df3,left_on = "employee", right_on = "name"))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000


In [14]:
pd.merge(df1,df3,left_on = "employee" , right_on = "name").drop('name',axis = 1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


In [15]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a); print (df2a)

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014


In [17]:
print(df1a);print(df2a)
print(pd.merge(df1a,df2a,left_index = True,right_index = True))

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014
                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [18]:
print(df1); print(df2); print(df1a.join(df2a))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [19]:
print(df1a); print(df3);
print(pd.merge(df1a,df3,left_index  =True, right_on = "name"))

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
         group  name  salary
0   Accounting   Bob   70000
1  Engineering  Jake   80000
2  Engineering  Lisa  120000
3           HR   Sue   90000


In [20]:
df6 = pd.DataFrame({'name' : ['Peter','Paul','Mary'],
                   'food':['fish', 'beans', 'bread']},
                  columns = ['name' , 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                   'drink' : ['wine', 'beer']},
                  columns = ['name', 'drink'])
print(df6);print(df7);print(pd.merge(df6,df7))

    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
     name drink
0    Mary  wine
1  Joseph  beer
   name   food drink
0  Mary  bread  wine


In [21]:
pd.merge(df6,df7,how= 'inner')

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [22]:
print(df6); print (df7); print(pd.merge(df6,df7,how = 'outer'))

    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
     name drink
0    Mary  wine
1  Joseph  beer
     name   food drink
0   Peter   fish   NaN
1    Paul  beans   NaN
2    Mary  bread  wine
3  Joseph    NaN  beer


In [23]:
print(df6) ; print(df7); print(pd.merge(df6,df7,how= "left"))

    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
     name drink
0    Mary  wine
1  Joseph  beer
    name   food drink
0  Peter   fish   NaN
1   Paul  beans   NaN
2   Mary  bread  wine


In [24]:
df8 = pd.DataFrame({'name': ['Bob','Jake', 'Lisa','Sue'],
                    'rank' : [1,2,3,4]})
df9 = pd.DataFrame({'name' : ['Bob' , 'Jake','Lisa' , 'Sue'],
                   'rank' : [3,1,4,2]})
print(df8); print(df9); print(pd.merge(df8,df9,on = "name"))

   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4
   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2
   name  rank_x  rank_y
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2


In [25]:
print(df8);print(df9);
print(pd.merge(df8,df9, on= "name" , suffixes = ["_L", "_R"]))

   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4
   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2
   name  rank_L  rank_R
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2


In [29]:
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
print(pop.head()); print(areas.head()); print(abbrevs.head())


  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


In [31]:
# Outer-join populations with the abbreviation table so that rows without a
# matching abbreviation are kept (they surface as nulls in 'state').
merged = pd.merge(pop, abbrevs, how='outer',
                  left_on='state/region', right_on='abbreviation')
# Drop the duplicate key column. The keyword form is used because passing the
# axis positionally to drop() is deprecated.
merged = merged.drop(columns='abbreviation')
merged.head()


Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [32]:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [33]:
merged[merged['population'].isnull()].head()

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,


In [34]:
merged.loc[merged['state'].isnull(),'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [35]:
merged.loc[merged['state/region'] == "PR", 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA' , 'state'] = 'United States'

In [36]:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [37]:
final = pd.merge(merged,areas, on = 'state' , how= 'left')
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [38]:
final.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [39]:
# States whose area is missing after the left join; a single .loc selection
# (row mask + column) replaces the chained [][] indexing.
final.loc[final['area (sq. mi)'].isnull(), 'state'].unique()

array(['United States'], dtype=object)

In [41]:
# Drop every row that still has a missing value; rebind the name instead of
# mutating the frame in place.
final = final.dropna()
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [47]:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()


Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [48]:
# Index by state so the density Series is labelled with state names.
# Rebinding (rather than inplace=True) avoids mutating a frame derived
# from a query() selection.
data2010 = data2010.set_index('state')
density = data2010['population'] / data2010['area (sq. mi)']
# Sort from most to least dense. The stray "In[32]:" prompt that had been
# pasted into this line is removed.
density = density.sort_values(ascending=False)
density.head()

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [49]:
density.tail()

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64

In [52]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [53]:
ser.sum()

2.811925491708157

In [54]:
ser.mean()

0.5623850983416314

In [58]:
# Six rows: keys cycle A, B, C twice; data counts 0..5.
df = pd.DataFrame({'key': list('ABCABC'),
                   'data': list(range(6))})
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [59]:
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C294EB14C0>

In [60]:
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [61]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                 'data1': range(6),
                 'data2': rng.randint(0, 10, 6)},
                 columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [62]:
# Several reductions per column at once. String names are used throughout
# (instead of mixing 'min' with np.median and the builtin max) so pandas
# dispatches to its own groupby implementations; column labels are unchanged.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [63]:
df.groupby('key').aggregate({'data1' : 'min',
                            'data2' : 'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [64]:
def filter_func(x):
    """Return True when the standard deviation of x['data2'] is greater than 4."""
    spread = x['data2'].std()
    return spread > 4

# Print, in order: the full frame, the per-key standard deviations, and the
# rows belonging to groups for which filter_func returns True.
print(df)
print(df.groupby('key').std())
print(df.groupby('key').filter(filter_func))
               

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641
  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
# load_dataset goes over the network; in the recorded run it failed with
# URLError, so `titanic` was never bound.
titanic = sns.load_dataset('titanic')


URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [2]:
# Preview the first rows. The recorded run raised NameError here because the
# dataset load had failed and `titanic` did not exist.
titanic.head()


NameError: name 'titanic' is not defined

In [3]:
import numpy as np
# Arithmetic on a NumPy array is element-wise: every entry is doubled.
X = np.array([2,3,5,7,11,13])
X * 2

array([ 4,  6, 10, 14, 22, 26])

In [5]:
# Names in inconsistent letter case, normalised one by one in plain Python.
data = [
    'peter',
    'Paul',
    'MARY',
    'gUIDO',
]
[name.capitalize() for name in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [6]:
# Same comprehension with a missing value in the list: None has no
# .capitalize(), so this cell raises AttributeError (presumably kept on purpose
# to contrast with the pandas .str version).
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
[s.capitalize() for s in data]

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [7]:
import pandas as pd
# Wrap the list (None included) in a Series so the .str accessor can be used.
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [8]:
# The .str accessor capitalises each string and passes the missing entry
# through unchanged instead of raising.
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [9]:
# Six full names used as sample data for the string-method examples.
monte = pd.Series(
    [
        'Graham Chapman',
        'John Cleese',
        'Terry Gilliam',
        'Eric Idle',
        'Terry Jones',
        'Michael Palin',
    ]
)


In [10]:
# Display the Series of names.
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [11]:
# Lower-case every name.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [13]:
# Character count of each name (the space is counted).
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [14]:
# Boolean mask: does the name begin with a capital 'T'?
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [16]:
# Split each name on whitespace, giving a list of words per row.
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [17]:
# Capture the first run of ASCII letters in each name (the first name).
# With the default arguments the result is a one-column DataFrame.
monte.str.extract('([A-Za-z]+)')

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


In [18]:
# Whole-string matches that start with anything but an uppercase vowel and
# end with anything but a lowercase vowel; non-matching rows give [].
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [20]:
# Element-wise slicing: the first three characters of each name.
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [21]:
# Last word of each name: split on whitespace, then take element -1 per row.
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [22]:
# Pair each name with a '|'-delimited string of single-letter codes.
full_monte = pd.DataFrame(
    {
        'name': monte,
        'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D'],
    }
)
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [23]:
# Expand the '|'-delimited codes into one 0/1 indicator column per distinct code.
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [24]:
# Spice names to search for.
spice_list = [
    'salt',
    'pepper',
    'oregano',
    'sage',
    'parsley',
    'rosemary',
    'tarragon',
    'thyme',
    'paprika',
    'cumin',
]

In [25]:
import re
# One boolean column per spice: True where recipes.ingredients contains it.
# `flags` must be given by keyword: positionally, re.IGNORECASE would land in
# str.contains' `case` parameter and the search would stay case-sensitive.
# NOTE(review): `recipes` is not defined in the recorded run (NameError).
spice_df = pd.DataFrame({
    spice: recipes.ingredients.str.contains(spice, flags=re.IGNORECASE)
    for spice in spice_list
})
spice_df.head()

NameError: name 'recipes' is not defined

In [30]:
import datetime as dailytime

In [34]:
from datetime import datetime
# Build a datetime from keyword fields; the time part comes out as 00:00.
datetime(year=2015, month=7, day=4)

datetime.datetime(2015, 7, 4, 0, 0)

In [36]:
from dateutil import parser
# dateutil parses the free-form English date string into a datetime.
date = parser.parse("2nd of December, 2021")
date

datetime.datetime(2021, 12, 2, 0, 0)

In [37]:
# %A formats the full weekday name.
date.strftime('%A')

'Thursday'

In [39]:
import numpy as np
# NumPy array holding a single date; the resulting dtype is datetime64[D].
date = np.array('2021-12-03',dtype = np.datetime64)
date

array('2021-12-03', dtype='datetime64[D]')

In [40]:
# Adding the integers 0..11 to a day-resolution date gives 12 consecutive days.
date + np.arange(12)

array(['2021-12-03', '2021-12-04', '2021-12-05', '2021-12-06',
       '2021-12-07', '2021-12-08', '2021-12-09', '2021-12-10',
       '2021-12-11', '2021-12-12', '2021-12-13', '2021-12-14'],
      dtype='datetime64[D]')

In [41]:
# A date-only string produces a date-only datetime64.
np.datetime64('2015-07-04')


numpy.datetime64('2015-07-04')

In [42]:
# Including hours and minutes keeps the time down to the minute.
np.datetime64('2015-07-04 12:00')


numpy.datetime64('2015-07-04T12:00')

In [43]:
# The explicit 'ns' unit forces nanosecond resolution (nine fractional digits).
np.datetime64('2015-07-04 12:59:59.50', 'ns')


numpy.datetime64('2015-07-04T12:59:59.500000000')

In [44]:
import pandas as pd
# pandas parses the free-form string into a Timestamp.
date = pd.to_datetime("4th of July, 2015")
date

Timestamp('2015-07-04 00:00:00')

In [45]:
# Timestamp supports strftime as well; %A gives the full weekday name.
date.strftime('%A')

'Saturday'

In [46]:
# Adding day offsets 0..11 to the Timestamp yields a 12-entry DatetimeIndex.
date + pd.to_timedelta(np.arange(12),'D')

DatetimeIndex(['2015-07-04', '2015-07-05', '2015-07-06', '2015-07-07',
               '2015-07-08', '2015-07-09', '2015-07-10', '2015-07-11',
               '2015-07-12', '2015-07-13', '2015-07-14', '2015-07-15'],
              dtype='datetime64[ns]', freq=None)

In [47]:
# A small integer Series indexed by four dates spanning two years.
index = pd.DatetimeIndex(
    ['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04']
)
data = pd.Series(data=[0, 1, 2, 3], index=index)
data


2014-07-04    0
2014-08-04    1
2015-07-04    2
2015-08-04    3
dtype: int64

In [48]:
# Label slicing with date strings; the end label is included in the result.
data['2014-07-04':'2015-07-04']

2014-07-04    0
2014-08-04    1
2015-07-04    2
dtype: int64

In [49]:
# Partial-string indexing: a bare year selects every entry dated in that year.
data['2015']

2015-07-04    2
2015-08-04    3
dtype: int64

In [50]:
# to_datetime takes a list mixing a datetime object with differently formatted
# date strings and returns a single DatetimeIndex.
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                     '2015-Jul-6', '07-07-2015', '20150708'])
dates

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
               '2015-07-08'],
              dtype='datetime64[ns]', freq=None)

In [51]:
# Convert each timestamp to a daily Period, giving a PeriodIndex.
dates.to_period('D')


PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
             '2015-07-08'],
            dtype='period[D]')

In [52]:
# Subtracting the first date from every date gives a TimedeltaIndex of elapsed days.
dates - dates[0]


TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)

In [53]:
# Daily range between two dates; both endpoints are included.
pd.date_range('2015-07-03', '2015-07-10')

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [54]:
# The same eight days, specified as a start date plus a number of periods.
pd.date_range('2015-07-03', periods=8)

DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-05', '2015-07-06',
               '2015-07-07', '2015-07-08', '2015-07-09', '2015-07-10'],
              dtype='datetime64[ns]', freq='D')

In [55]:
# Eight hourly timestamps starting at midnight. The lowercase 'h' alias is used
# because recent pandas deprecates the uppercase 'H' spelling for hours.
pd.date_range('2015-07-03', periods=8, freq='h')

DatetimeIndex(['2015-07-03 00:00:00', '2015-07-03 01:00:00',
               '2015-07-03 02:00:00', '2015-07-03 03:00:00',
               '2015-07-03 04:00:00', '2015-07-03 05:00:00',
               '2015-07-03 06:00:00', '2015-07-03 07:00:00'],
              dtype='datetime64[ns]', freq='H')

In [56]:
# Eight consecutive monthly periods starting with July 2015.
pd.period_range('2015-07', periods=8, freq='M')

PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
             '2016-01', '2016-02'],
            dtype='period[M]')

In [57]:
# Ten durations from 0 to 9 hours in one-hour steps. The lowercase 'h' alias is
# used because recent pandas deprecates the uppercase 'H' spelling for hours.
pd.timedelta_range(0, periods=10, freq='h')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',
                '0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
                '0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
                '0 days 09:00:00'],
               dtype='timedelta64[ns]', freq='H')

In [1]:
import numpy as np


In [None]:
# The integers 0 through 11 as a NumPy array.
np.arange(12)