In [3]:
import pandas as pd
import numpy as np

# Basic data structures in Pandas
Pandas provides two types of classes for handling data:

- `Series`: a one-dimensional labeled array holding data of any type
- `DataFrame`: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

# Object Creation 
## Series

`s = pd.Series(data, index=index)`

Here, data can be many different things:
- a Python dict
- an ndarray
- a scalar value

In [7]:
# Draw the example values from a seeded Generator so that Restart & Run All
# reproduces the same numbers; np.random.randn uses unseeded global state and
# yields different values on every run.
rng = np.random.default_rng(42)
s = pd.Series(rng.standard_normal(5), index=["a", "b", "c", "d", "e"])
print(s)
print("-------- \n if no index then:")
# With no index argument, the Series is labeled with the integers 0..4.
pd.Series(rng.standard_normal(5))

a   -0.458348
b    0.472336
c   -1.891171
d   -0.256879
e    0.129137
dtype: float64
-------- 
 if no index then:


0   -0.369504
1    1.425498
2   -1.883277
3    1.924055
4    0.188907
dtype: float64

In [8]:
# A dict can be passed as the data: its keys become the index labels (in
# insertion order) and its values become the Series values.
d = dict(b=1, a=0, c=2)
pd.Series(data=d)

b    1
a    0
c    2
dtype: int64

In [9]:
# When an index is passed along with a dict, the values are pulled out by
# label in the order the index lists them; a label with no matching key
# (here "d") is filled with NaN.
d = dict(zip("abc", (0.0, 1.0, 2.0)))
pd.Series(d, index=list("bcda"))

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [10]:
# From a scalar: the single value is repeated for every label in the index.
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [11]:
# A Series is ndarray-like, but when an actual NumPy ndarray is required,
# call Series.to_numpy(), which returns just the values as an array.
s.to_numpy()

array([-0.45834757,  0.47233592, -1.89117096, -0.25687897,  0.12913703])

## DataFrame
DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. Like Series, DataFrame accepts many different kinds of input:

- Dict of 1D ndarrays, lists, dicts, or Series
- 2-D numpy.ndarray
- Structured or record ndarray
- A Series
- Another DataFrame

In [13]:
# From a dict of Series: each key becomes a column. The row index is the
# union of the Series indexes, so "one" (which has no "d" label) holds NaN
# in row "d".
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [14]:
# From a dict of equal-length lists (or ndarrays): each key becomes a column
# and, with no index given, the rows are labeled 0..3.
d = dict(
    one=[1.0, 2.0, 3.0, 4.0],
    two=[4.0, 3.0, 2.0, 1.0],
)
pd.DataFrame(data=d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [15]:
# Reuse the dict of lists `d`, this time supplying explicit row labels in
# place of the default integer index.
pd.DataFrame(d, index=["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [16]:
# From a list of dicts: every dict is one row and the keys are the columns.
# A key missing from a record (here "c" in the first one) becomes NaN.
data2 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
pd.DataFrame(data=data2, index=["first", "second"])


Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [17]:
# `columns` restricts the frame to the listed keys, so "c" from the second
# record in data2 is left out.
pd.DataFrame(data2, columns=["a", "b"])

Unnamed: 0,a,b
0,1,2
1,5,10


In [18]:
# From a dict of tuples: the tuple keys of the outer dict become two-level
# column labels and the tuple keys of the inner dicts become two-level row
# labels. Row/column combinations missing from an inner dict show up as NaN.
pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


# Column selection, addition, deletion

## Series
Series acts very similarly to an ndarray and is a valid argument to most NumPy functions. However, operations such as slicing will also slice the index.

In [20]:
# Use a seeded Generator so every Restart & Run All produces the same Series;
# np.random.randn draws from unseeded global state and changes on each run.
rng = np.random.default_rng(7)
s = pd.Series(rng.standard_normal(5), index=["a", "b", "c", "d", "e"])
print(s)


0.26120983395723774

In [23]:
# A single integer position returns the scalar stored there.
print(s.iloc[0], "\n------")
# A positional slice returns a Series; the index labels are sliced along with the values.
print(s.iloc[:3], "\n------")
# A list of positions returns those entries in the order requested.
print(s.iloc[[4, 3, 1]])

0.26120983395723774 
------
a    0.261210
b   -1.207441
c    2.294983
dtype: float64 
------
e   -2.282958
d   -0.265981
b   -1.207441
dtype: float64


In [24]:
# Values only, as a NumPy array; the index labels are not included.
s.to_numpy()

array([ 0.26120983, -1.20744076,  2.29498253, -0.26598149, -2.28295766])

A Series is also like a fixed-size dict in that you can get and set values by index label:

In [25]:
# Dict-style lookup: fetch the value stored under the label "a".
s["a"]

0.26120983395723774

In [26]:
# Dict-style assignment: overwrite the value stored under the existing label "e".
s["e"] = 12.0
s

a     0.261210
b    -1.207441
c     2.294983
d    -0.265981
e    12.000000
dtype: float64

In [27]:
"e" in s

True

In [28]:
"f" in s

False

In [29]:
# Like dict.get: "f" is not a label in s, so the supplied default (NaN) is
# returned rather than an error being raised.
s.get("f", np.nan)

nan

## DataFrame

In [62]:
# Rebuild the two-column example frame from a dict of Series. "one" has no
# "d" label, so that cell is NaN once the indexes are combined.
d = {
    "one": pd.Series({"a": 1.0, "b": 2.0, "c": 3.0}),
    "two": pd.Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0}),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [63]:
# Selecting a single column by label returns it as a Series named after the column.
df["one"]

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [64]:
# New columns can be derived from existing ones with element-wise operations.
# Row "d" has NaN in "one", so its product is NaN and its comparison is False.
df["three"] = df["one"] * df["two"]
df["flag"] = df["one"] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [65]:
# Columns can be deleted dict-style; this removes "two" from df itself.
del df["two"]
df

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [66]:
# pop removes the column from df and hands it back as a Series.
three = df.pop("three")
three

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [67]:
# A scalar is broadcast to every row of the new column.
df["foo"] = "bar"
# Only the first two entries of "one" are assigned; they are aligned to df's
# index by label, so the remaining rows ("c", "d") become NaN.
df["one_trunc"] = df["one"][:2]
# insert places the new column at a chosen position (here index 1, right
# after "one") instead of appending it at the end.
df.insert(1, "bar", 1)
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1,False,bar,1.0
b,2.0,1,False,bar,2.0
c,3.0,1,True,bar,
d,,1,False,bar,
