In [1]:
import numpy as np
import pandas as pd

In [6]:
# A Series built from a bare list of values receives the default integer index.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# Same values as before, this time labelled with an explicit string index.
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
    index=list("abcd"),
)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [11]:
# State populations keyed by state name; the dict keys become the Series index.
population_dict = {
    "California": 38332521,
    "Texas": 26448193,
    "New York": 19651127,
    "Florida": 19552860,
    "Illinois": 12882135,
}
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [12]:
# Build a Series from a plain list; the recorded output shows a default 0..2 integer index.
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [13]:
# A scalar value is repeated for every label of the supplied index.
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [14]:
# Dict keys become the index; the recorded output keeps the dict's order (2, 1, 3) rather than sorting.
pd.Series({2:"a",1:"b",3:"c"})

2    a
1    b
3    c
dtype: object

In [17]:
# State areas keyed by state name, using the same states as `population`.
area_dict = {
    "California": 423967,
    "Texas": 695662,
    "New York": 141297,
    "Florida": 170312,
    "Illinois": 149995,
}
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [18]:
# Combine the two Series into one DataFrame; each dict key becomes a column name
# and the Series share the state-name index.
states = pd.DataFrame(
    data={
        "population": population,
        "area": area,
    }
)
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [23]:
# Build a DataFrame from a list of dicts; keys missing from a record show up as
# missing values (blank cells in the recorded output).
pd.DataFrame([{"a" : 1, "b" : 2},{"b":3,"c":4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [24]:
# Area per state, indexed by state name.
area = pd.Series(
    data={
        'California': 423967,
        'Texas': 695662,
        'New York': 141297,
        'Florida': 170312,
        'Illinois': 149995,
    }
)

In [25]:
# Population per state, indexed by state name.
pop = pd.Series(
    data={
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
        'Florida': 19552860,
        'Illinois': 12882135,
    }
)

In [26]:
# Assemble the working DataFrame with an "area" and a "pop" column.
data = pd.DataFrame(
    data={
        "area": area,
        "pop": pop,
    }
)
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [31]:
# Show the same column twice: dictionary-style access first, attribute-style
# access second, separated by two blank lines.
print(data["area"], data.area, sep="\n\n\n")

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [32]:
# Attribute-style and dictionary-style access returned the very same object
# here (recorded output: True).
# NOTE(review): object identity presumably depends on the pandas version and
# its copy-on-write setting — confirm before relying on it.
data.area is data["area"]

True

In [33]:
# Derive a population-density column: population divided element-wise by area.
data["density"] = data["pop"].div(data["area"])
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [34]:
# Underlying data as a 2-D NumPy array; the recorded output shows the integer
# and float columns together in a single float array.
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [35]:
# Transpose the frame: states become columns, the former columns become rows.
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [36]:
# First row of the underlying array (area, pop, density of California).
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [50]:
# Positional selection: first three rows, first two columns.
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [44]:
# Label-based selection; both end labels ("Illinois" and "pop") are included
# in the result, as the recorded output shows.
data.loc[:"Illinois", :"pop"]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [41]:
# The boolean mask picks the rows (density above 100); the list picks the columns.
data.loc[data.density > 100, ["pop", "density"]]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [48]:
# Assign by integer position: row 0, column 2 (California's density in the
# output below). This modifies `data` in place.
data.iloc[0,2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


**Note: while indexing refers to columns, slicing refers to rows.**

In [60]:
# Plain [] on a DataFrame selects rows when given a slice or a boolean mask.
# Each view is followed by two blank lines, except the last one.
print(data["Florida":"Illinois"], end="\n\n\n")  # slice by row label
print(data[1:3], end="\n\n\n")                   # slice by row position
print(data[data.density > 100])                  # filter rows with a boolean mask

            area       pop     density
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763


            area       pop     density
Texas     695662  26448193   38.018740
New York  141297  19651127  139.076746


            area       pop     density
New York  141297  19651127  139.076746
Florida   170312  19552860  114.806121


Handling missing data (NaN)

In [61]:
# Series containing both np.nan and None; the recorded output shows both as NaN
# in a float64 Series.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=["a", "b", "c", "d", "e"],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [62]:
# Replace every missing entry with 0; the result is only displayed, not assigned.
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [64]:
# Forward-fill: each missing entry takes the last valid value before it.
# Uses Series.ffill() because the `method=` keyword of fillna() is deprecated
# in recent pandas releases; the result is the same.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [65]:
# Backward-fill: each missing entry takes the next valid value after it.
# Uses Series.bfill() because the `method=` keyword of fillna() is deprecated
# in recent pandas releases; the result is the same.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [68]:
# Small 3x3 frame with one missing value in column 0 and one in column 1.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [69]:
# By default dropna() drops every row that contains at least one missing value.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [70]:
# Drop columns (instead of rows) that contain any missing value.
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [71]:
# Add a column labelled 3 that consists entirely of missing values.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [72]:
# how="all" drops only the columns whose values are all missing (column 3 here).
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [75]:
# thresh=3 keeps only the rows that have at least three non-missing values.
df.dropna(axis= "rows", thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [76]:
# Display df again: the dropna() calls above returned new frames and left df unchanged.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [77]:
# Forward-fill along axis=1, i.e. across the columns within each row. A missing
# value in the first column has nothing before it and therefore stays missing.
# Uses DataFrame.ffill() because the `method=` keyword of fillna() is
# deprecated in recent pandas releases; the result is the same.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


Hierarchical Indexing

In [80]:
# (state, year) tuples used as index labels, in state-major order.
index = [
    (state, year)
    for state in ('California', 'New York', 'Texas')
    for year in (2000, 2010)
]
# Populations, in the same order as the labels above.
populations = [
    33871648, 37253956,  # California 2000, 2010
    18976457, 19378102,  # New York   2000, 2010
    20851820, 25145561,  # Texas      2000, 2010
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [81]:
# Slice between two tuple labels; both end labels appear in the result.
pop[("California", 2010):("Texas", 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

The bad way: select the 2010 entries by filtering the tuple index with a list comprehension

In [82]:
# Select the 2010 entries by testing the second element of every tuple label
# in a Python list comprehension.
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

The better way: use a pandas MultiIndex

In [86]:
# Convert the list of (state, year) tuples into a MultiIndex.
# Note: this rebinds `index` from the list of tuples to the MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [87]:
# Reindex the Series with the MultiIndex; the recorded output shows the same
# values under a two-level (state, year) index.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [88]:
# Partial indexing on the MultiIndex: every first-level label, second level equal to 2010.
pop[:, 2010]
# NOTE(review): the marker below ("149") is presumably a page/bookmark reference — confirm or remove.
#149

California    37253956
New York      19378102
Texas         25145561
dtype: int64