## 5.1 Introduction to pandas Data Structures

In [1]:
import numpy as np
import pandas as pd

### Series
A Series is a one-dimensional array-like object containing a sequence of values of the same type and an associated array of data labels, called its index.

In [2]:
# With no explicit index, pandas labels the values 0..n-1.
obj = pd.Series(data=[4, 7, 9, -3])
obj

0    4
1    7
2    9
3   -3
dtype: int64

In [3]:
# .array gives the values backing the Series (displayed below as a PandasArray).
obj.array

<PandasArray>
[4, 7, 9, -3]
Length: 4, dtype: int64

In [4]:
# No index was supplied at construction, so pandas created a default RangeIndex 0..3.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Same values as before, but labelled with an explicit string index.
obj2 = pd.Series(
    data=[4, 7, 9, -3],
    index=list("dcab"),
)
obj2

d    4
c    7
a    9
b   -3
dtype: int64

In [6]:
# The index now holds the string labels passed at construction.
obj2.index

Index(['d', 'c', 'a', 'b'], dtype='object')

In [7]:
# Select a single value by its index label.
obj2["a"]

9

In [8]:
# Assign to an existing label, then select several labels at once with a list;
# the result follows the order of the list, not the order of the index.
obj2["d"] = 6
obj2[["c", "a", "d"]]

c    7
a    9
d    6
dtype: int64

In [9]:
# Boolean-mask filtering keeps the index labels attached to the surviving values.
obj2[obj2 > 2]

d    6
c    7
a    9
dtype: int64

In [10]:
# Scalar arithmetic is applied element-wise; the index is preserved.
obj2 * 2

d    12
c    14
a    18
b    -6
dtype: int64

In [11]:
# NumPy ufuncs also work element-wise on a Series and keep its index (the result is float64).
np.exp(obj2)

d     403.428793
c    1096.633158
a    8103.083928
b       0.049787
dtype: float64

In [12]:
"b" in obj2

True

In [13]:
"e" in obj2

False

In [14]:
# A dict maps straight onto a Series: keys become the index, values the data.
Sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(Sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [15]:
# Convert the Series back into a plain dict (index labels become the keys).
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [16]:
# Passing an explicit index selects and orders the dict entries by those labels:
# "California" has no entry in Sdata, so it becomes NaN; "Utah" is not in `states`, so it is left out.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(Sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [17]:
# Flag missing values: True where the entry is NaN.
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [18]:
# The inverse check: True where a value is present.
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [19]:
# Redisplay obj3 for comparison with obj4 before the two are added.
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [20]:
# Redisplay obj4; note that its labels differ from those of obj3.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [21]:
# Arithmetic aligns on index labels; labels without a value in both operands
# ("California", "Utah") come out as NaN.
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [22]:
# Both the Series itself and its index can carry a name; both appear in the repr below.
obj4.name = "Population"
obj4.index.name = "States"
obj4

States
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [23]:
# Show obj with its default integer index before relabelling it.
obj

0    4
1    7
2    9
3   -3
dtype: int64

In [24]:
# Assigning to .index relabels the Series in place; the values are unchanged.
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
obj

Bob      4
Steve    7
Jeff     9
Ryan    -3
dtype: int64

### DataFrame
A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type.

In [25]:
# Column-oriented input: each dict key becomes a column, each list its values.
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


For larger dataframes use .head() to select the first five rows

In [26]:
# head() with no argument returns the first five rows.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


And use .tail() to select the last five rows

In [27]:
# tail() with no argument returns the last five rows.
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


If you specify a sequence of columns, the DataFrame's columns will be arranged in that order:

In [28]:
# `columns` controls the order in which the dict's keys appear as columns.
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


If a specified column is not contained in the data, it will appear in the DataFrame filled with missing values (NaN):

In [29]:
# "debt" is not a key in `data`, so that column is created filled with missing values.
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [30]:
# List the column labels of frame2.
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [31]:
# Dictionary-style access returns the column as a Series.
frame2["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [32]:
# Attribute-style access also returns the column as a Series.
frame2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [33]:
# .loc selects a row by its index label (here the label 1).
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [34]:
# .iloc selects a row by its integer position (here the third row).
frame2.iloc[2]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

Columns can be modified by assignment:

In [35]:
# Assigning a scalar broadcasts it to every row of the column.
frame2["debt"] = 1.6
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,1.6
1,2001,Ohio,1.7,1.6
2,2002,Ohio,3.6,1.6
3,2001,Nevada,2.4,1.6
4,2002,Nevada,2.9,1.6
5,2003,Nevada,3.2,1.6


In [36]:
# Assigning an array sets the column element by element; np.arange(6.) supplies one float per row.
frame2["debt"] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


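When a Series is assigned to a column, its labels are realigned to the index of the DataFrame, and any index label missing from the Series gets NaN. In the example below none of the Series labels ("two", "four", "five") match the DataFrame's integer index, so the whole column becomes NaN: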

In [37]:
# NOTE(review): the labels "two", "four", "five" do not occur in frame2's
# integer index (0..5), so alignment finds no matches and "debt" becomes all NaN
# (see the output below). Use labels such as [2, 4, 5] to see values filled in.
val = pd.Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
frame2["debt"] = val
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


The `del` keyword deletes a column. To demonstrate, first add a new boolean column, `eastern`:

In [38]:
# Assigning to a column name that does not exist yet creates the column;
# here a boolean column marking the rows whose state is "Ohio".
frame2["eastern"] = frame2["state"] == "Ohio"
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False
5,2003,Nevada,3.2,,False


In [39]:
# Remove the "eastern" column in place and confirm that it is gone.
del frame2["eastern"]
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

Passing a nested dictionary to a DataFrame:

In [40]:
# Nested dict input: outer keys become the columns, inner keys the row index.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
frame3 = pd.DataFrame(data=populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


To transpose the DataFrame, use .T:

In [41]:
# .T swaps rows and columns.
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


The inner keys of the nested dictionary are used to form the index of the DataFrame, and the outer keys become the columns. This is only true if no explicit index is specified for the DataFrame.

In [42]:
# With an explicit index only those labels are used: 2000 is dropped and 2003 (absent from the data) is all NaN.
pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


Dictionaries of Series are treated the same way:

In [43]:
# Slice each column by position with .iloc so the intent is explicit:
# on an integer-labelled index, bare [] slicing is easy to misread as label-based.
pdata = {
    "Ohio": frame3["Ohio"].iloc[:-1],     # every row except the last
    "Nevada": frame3["Nevada"].iloc[:2],  # the first two rows
}
pdata

{'Ohio': 2000    1.5
 2001    1.7
 Name: Ohio, dtype: float64,
 'Nevada': 2000    NaN
 2001    2.4
 Name: Nevada, dtype: float64}

In [44]:
# Dict keys become the columns and the Series' index labels become the rows.
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


Setting the name attributes of the index and the columns:

In [45]:
# Name the row index and the column index; both names appear in the display below.
frame3.index.name = "Year"
frame3.columns.name = "State"
frame3

State,Ohio,Nevada
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


The .to_numpy() method returns the dataframe as a 2-dimensional ndarray:

In [46]:
# to_numpy() returns only the values as a 2-D ndarray; index and column labels are not included.
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

### Index Objects
pandas's Index objects are responsible for holding the axis labels and other metadata.

In [47]:
# Grab the Index object of a small Series so it can be inspected on its own.
obj = pd.Series(np.arange(3), index=list("abc"))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [48]:
# Slicing an Index returns another Index.
index[1:]

Index(['b', 'c'], dtype='object')

Index objects are immutable and thus cannot be modified in place:

In [49]:
# Uncommenting the next line raises a TypeError, because an Index cannot be changed in place:
# index[1] = "d"

In [50]:
# Build an Index directly from an integer array.
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [51]:
# Reuse the prebuilt `labels` Index as the index of this Series.
obj2 = pd.Series(data=[-1.5, 2.5, 0], index=labels)
obj2

0   -1.5
1    2.5
2    0.0
dtype: float64

In [52]:
# The Series reuses the very same Index object that was passed in (identity, not just equality).
obj2.index is labels

True

### Essential Functionality

#### Reindexing
Reindexing creates a new object with the values rearranged to align with the new index.

In [53]:
# A Series whose labels are deliberately out of alphabetical order.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=list("dbac"),
)
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [55]:
# reindex returns a new Series ordered by the given labels; "e" has no value in obj, so it becomes NaN.
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [56]:
# A Series with gaps in its integer labels (0, 2, 4), used to demonstrate filling.
obj3 = pd.Series(
    data=["blue", "purple", "yellow"],
    index=[0, 2, 4],
)
obj3

0      blue
2    purple
4    yellow
dtype: object

In [58]:
# method="ffill" forward-fills: each new label takes the value of the closest preceding label.
# Note: np.arange(6.) is a float array, so the resulting index is 0.0..5.0 rather than integers.
obj3.reindex(np.arange(6.), method="ffill")

0.0      blue
1.0      blue
2.0    purple
3.0    purple
4.0    yellow
5.0    yellow
dtype: object

In [59]:
# A 3x3 integer frame whose row labels skip "b", used for the reindexing examples.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list("acd"),
    columns=["Ohio", "Texas", "California"],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [61]:
# Reindex the rows; the new label "b" yields a row of NaN, which turns the integer columns into floats.
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [62]:
# Reindex the columns instead: "Utah" is new (all NaN) and "Ohio" is dropped.
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [63]:
# Equivalent spelling: pass the labels positionally and name the axis.
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


Reindexing using the .loc indexer:

In [64]:
# .loc with lists of existing row and column labels selects and reorders both axes in one step.
frame.loc[["a", "d", "c"], ["California", "Texas"]]

Unnamed: 0,California,Texas
a,2,1
d,8,7
c,5,4


#### Dropping Entries from an Axis