In [3]:
import pandas as pd
import numpy as np

# Series -> one-dimensional array of values with a label (index) for each value

In [4]:
# Sample inputs reused by the Series examples: the same three values as a
# list, a NumPy array, and a label -> value dict.
labels = list("abc")
data = [10, 20, 30]
arr = np.array(data)
d = dict(zip(labels, data))

In [5]:
pd.Series(data = data, index=labels)

a    10
b    20
c    30
dtype: int64

In [6]:
pd.Series(d)["a"]

10

# Data frames

In [7]:
np.random.seed(101)

In [8]:
from numpy.random import randn

In [9]:
# 5x4 frame of standard-normal draws: rows labelled a-e, columns labelled K-N.
df = pd.DataFrame(
    np.random.randn(5, 4),
    index=list("abcde"),
    columns=list("KLMN"),
)

In [10]:
df

Unnamed: 0,K,L,M,N
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


# indexing

In [11]:
df["N"]

a    0.503826
b    0.605965
c   -0.589001
d    0.955057
e    0.683509
Name: N, dtype: float64

In [12]:
type(df)

pandas.core.frame.DataFrame

In [13]:
df[["K","L"]]

Unnamed: 0,K,L
a,2.70685,0.628133
b,0.651118,-0.319318
c,-2.018168,0.740122
d,0.188695,-0.758872
e,0.190794,1.978757


### creating new columns

In [14]:
df["O"] = df["K"] + df["L"]

In [15]:
df

Unnamed: 0,K,L,M,N,O
a,2.70685,0.628133,0.907969,0.503826,3.334983
b,0.651118,-0.319318,-0.848077,0.605965,0.3318
c,-2.018168,0.740122,0.528813,-0.589001,-1.278046
d,0.188695,-0.758872,-0.933237,0.955057,-0.570177
e,0.190794,1.978757,2.605967,0.683509,2.169552


# Removing

#### Columns 

In [16]:
# drop() returns a new frame without column "O"; rebinding `df` avoids
# inplace=True, and `columns=` states the axis explicitly.
df = df.drop(columns="O")

In [17]:
df

Unnamed: 0,K,L,M,N
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


#### Rows

In [18]:
# drop() returns a new frame without row "e"; rebinding `df` avoids
# inplace=True, and `index=` states that a row label is being removed.
df = df.drop(index="e")

In [19]:
df

Unnamed: 0,K,L,M,N
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057


### Selecting Rows

##  range

In [20]:
df.loc["a":"c", "L":"M"]

Unnamed: 0,L,M
a,0.628133,0.907969
b,-0.319318,-0.848077
c,0.740122,0.528813


### Subset

In [21]:
df.loc[["a","c"], ["L", "N"]]

Unnamed: 0,L,N
a,0.628133,0.503826
c,0.740122,-0.589001


In [22]:
df.iloc[0]

K    2.706850
L    0.628133
M    0.907969
N    0.503826
Name: a, dtype: float64

In [23]:
df.loc["a", "M"]

0.9079694464765431

# conditional Selection

In [24]:
bool_df = df > 0

In [25]:
bool_df

Unnamed: 0,K,L,M,N
a,True,True,True,True
b,True,False,False,True
c,False,True,True,False
d,True,False,False,True


In [26]:
df[bool_df]

Unnamed: 0,K,L,M,N
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057


In [27]:
df[df["L"] > 0]

Unnamed: 0,K,L,M,N
a,2.70685,0.628133,0.907969,0.503826
c,-2.018168,0.740122,0.528813,-0.589001


## Combining conditions: AND (`&`) / OR (`|`), with each condition in parentheses

In [28]:
d

{'a': 10, 'b': 20, 'c': 30}

In [29]:
df[(df["K"] > 0) & (df["M"] < 0)]

Unnamed: 0,K,L,M,N
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057


# Reset index

In [30]:
df.reset_index()

Unnamed: 0,index,K,L,M,N
0,a,2.70685,0.628133,0.907969,0.503826
1,b,0.651118,-0.319318,-0.848077,0.605965
2,c,-2.018168,0.740122,0.528813,-0.589001
3,d,0.188695,-0.758872,-0.933237,0.955057


In [31]:
index = "NJ CO MA IL".split()

In [32]:
index

['NJ', 'CO', 'MA', 'IL']

In [33]:
df["states"] = index

In [34]:
df

Unnamed: 0,K,L,M,N,states
a,2.70685,0.628133,0.907969,0.503826,NJ
b,0.651118,-0.319318,-0.848077,0.605965,CO
c,-2.018168,0.740122,0.528813,-0.589001,MA
d,0.188695,-0.758872,-0.933237,0.955057,IL


In [35]:
df.set_index("states")

Unnamed: 0_level_0,K,L,M,N
states,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NJ,2.70685,0.628133,0.907969,0.503826
CO,0.651118,-0.319318,-0.848077,0.605965
MA,-2.018168,0.740122,0.528813,-0.589001
IL,0.188695,-0.758872,-0.933237,0.955057


## Missing Data 

In [36]:
# Three columns with different amounts of missing data:
# "A" has one NaN, "B" has two, "C" has none.
dd = dict(
    A=[1, np.nan, 3],
    B=[3, np.nan, np.nan],
    C=[6, 7, 8],
)

In [37]:
na_frame = pd.DataFrame(dd)

In [38]:
na_frame.dropna(axis=1)

Unnamed: 0,C
0,6
1,7
2,8


#### Threshold -> `thresh` is the minimum number of non-NaN values a row/column must contain to be kept (so `thresh=0` keeps everything)

In [39]:
na_frame.dropna(axis=1, thresh=0)

Unnamed: 0,A,B,C
0,1.0,3.0,6
1,,,7
2,3.0,,8


### Replacing/Filling NA

In [40]:
na_frame.fillna(0.0)

Unnamed: 0,A,B,C
0,1.0,3.0,6
1,0.0,0.0,7
2,3.0,0.0,8


#### Fill with mean

In [41]:
na_frame["A"].fillna(na_frame["A"].mean())

0    1.0
1    2.0
2    3.0
Name: A, dtype: float64

In [42]:
na_frame

Unnamed: 0,A,B,C
0,1.0,3.0,6
1,,,7
2,3.0,,8


## Operations

In [43]:
# Small demo table: integer ids, a numeric column where 444 appears twice,
# and a column of string labels.
d = dict(
    col1=[1, 2, 3, 4],
    col2=[444, 555, 666, 444],
    col3=["abc", "def", "ghi", "xyz"],
)

In [48]:
frame = pd.DataFrame(d)
frame

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


#### Finding unique values in a DF

### unique tells you all the unique values in a series

In [51]:
frame["col2"].unique()

array([444, 555, 666])

#### nunique tells you how many unique values there are

In [53]:
frame["col2"].nunique()

3

## value_counts -> table of each unique value and how many times it appears

In [57]:
frame["col2"].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

### Selecting Data

In [60]:
frame[frame["col1"] > 2]

Unnamed: 0,col1,col2,col3
2,3,666,ghi
3,4,444,xyz


# Apply

In [61]:
def times_two(x):
    """Double ``x`` and hand the result back.

    The result is computed as ``x * 2``, so any operand that supports
    multiplication by an int is accepted.
    """
    doubled = x * 2
    return doubled

In [65]:
frame["col2"].apply(lambda x: x * 3)

0    1332
1    1665
2    1998
3    1332
Name: col2, dtype: int64

# removing columns

In [67]:
frame.drop("col3", axis=1)

Unnamed: 0,col1,col2
0,1,444
1,2,555
2,3,666
3,4,444


# Sort values

In [69]:
frame.sort_values("col2", ascending = False)

Unnamed: 0,col1,col2,col3
2,3,666,ghi
1,2,555,def
0,1,444,abc
3,4,444,xyz


In [70]:
frame.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
