In [8]:
import numpy as np
import pandas as pd

**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**

In [9]:
# A Series built from a 1-D array gets a default integer index (0..10 here).
pd.Series(np.arange(11))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
dtype: int64

In [10]:
# A 3x4 array becomes a DataFrame with default integer row and column labels.
pd.DataFrame(np.arange(12).reshape(3,4))

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [11]:
# Same 3x4 data, now with explicit column labels A-D.
pd.DataFrame(np.arange(12).reshape(3,4),columns=list("ABCD"))

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [12]:
# Same 3x4 data, with explicit column labels A-D and row labels e-g.
pd.DataFrame(np.arange(12).reshape(3,4),columns=list("ABCD"),index=list("efg"))

Unnamed: 0,A,B,C,D
e,0,1,2,3
f,4,5,6,7
g,8,9,10,11


In [13]:
# Six consecutive daily timestamps starting at 2013-01-01.
pd.date_range("20130101", periods=6)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

**Creating our own dataset**

In [14]:
# Build a small frame with a different dtype in every column. The scalar
# entries ("A", "B", "F") are broadcast to the four rows supplied by the
# array-like columns.
df1 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
        "G": pd.Series(np.arange(4)),
    }
)

df1

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [15]:
# Each column carries its own dtype.
df1.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [16]:
# First two rows.
df1.head(2)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1


In [17]:
# Last two rows.
df1.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [18]:
# Row labels of the frame.
df1.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [19]:
# Column labels of the frame.
df1.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

**Convert a pandas DataFrame to a NumPy array**

In [20]:
# A NumPy array has a single dtype, so the mixed-dtype columns of df1 end up
# in one object array.
df1.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 0],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 1],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 2],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 3]],
      dtype=object)

**describe() summarizes each column; by default only numeric (integer and float) columns are included — pass `include="all"` to cover the other dtypes as well**

In [21]:
# Per-column summary statistics; with the default arguments only the numeric
# columns (A, C, D, G) appear in the result.
df1.describe()

Unnamed: 0,A,C,D,G
count,4.0,4.0,4.0,4.0
mean,1.0,1.0,3.0,1.5
std,0.0,0.0,0.0,1.290994
min,1.0,1.0,3.0,0.0
25%,1.0,1.0,3.0,0.75
50%,1.0,1.0,3.0,1.5
75%,1.0,1.0,3.0,2.25
max,1.0,1.0,3.0,3.0


In [22]:
# Sort the rows by index label, largest label first.
df1.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,foo,3
2,1.0,2013-01-02,1.0,3,test,foo,2
1,1.0,2013-01-02,1.0,3,train,foo,1
0,1.0,2013-01-02,1.0,3,test,foo,0


In [23]:
# Sort the columns by column label, in reverse alphabetical order (G..A).
df1.sort_index(axis=1,ascending=False)

Unnamed: 0,G,F,E,D,C,B,A
0,0,foo,test,3,1.0,2013-01-02,1.0
1,1,foo,train,3,1.0,2013-01-02,1.0
2,2,foo,test,3,1.0,2013-01-02,1.0
3,3,foo,train,3,1.0,2013-01-02,1.0


**Sorting by the values of a particular column**

In [24]:
# Sort the rows by the values in column G (ascending is the default).
df1.sort_values( by="G")

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [25]:
# Sort the rows by the values in column G, largest first.
df1.sort_values(by ="G",ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,foo,3
2,1.0,2013-01-02,1.0,3,test,foo,2
1,1.0,2013-01-02,1.0,3,train,foo,1
0,1.0,2013-01-02,1.0,3,test,foo,0


# Selection

In [None]:
# Optimized pandas data-access methods: .at, .iat, .loc and .iloc.

In [26]:
# Plain [] with an integer slice selects rows; the end is excluded, so this
# returns rows 1 and 2.
df1[1:3]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2


In [27]:
df1.loc[[2,0,3]] # select rows by index label, returned in the order listed

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [28]:
df1.loc[1:3,["C","D"]] # label-based row slice (both endpoints included: rows 1, 2, 3) plus a list of column labels

Unnamed: 0,C,D
1,1.0,3
2,1.0,3
3,1.0,3


In [29]:
# .iloc selects rows by integer position, returned in the order listed.
df1.iloc[[2,0,3]]

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [30]:
# Show the full frame again as a reference for the positional examples below.
df1

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [31]:
df1.iloc[1:3, 2:5] # positional slices for rows and columns; the end position is excluded (rows 1-2, columns C-E)

Unnamed: 0,C,D,E
1,1.0,3,train
2,1.0,3,test


In [32]:
# Rows at positions 1, 2, 3 and columns at positions 0 and 2 (A and C).
df1.iloc[[1,2,3],[0,2]]

Unnamed: 0,A,C
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0


In [33]:
# Single value at row position 1, column position 1 (column B).
df1.iloc[1,1]

Timestamp('2013-01-02 00:00:00')

In [34]:
# .iat is the scalar-only positional accessor; same value as df1.iloc[1,1].
df1.iat[1,1]

Timestamp('2013-01-02 00:00:00')

# Boolean Indexing

In [35]:
# Boolean Series mask: keep the rows where column A is greater than 0
# (every row qualifies here).
df1[df1["A"]>0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [36]:
# Boolean DataFrame mask: the comparison only covers columns A and C, so the
# frame keeps its shape and every other cell comes back as missing (NaN / NaT).
df1[df1[["A","C"]]>0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,NaT,1.0,,,,
1,1.0,NaT,1.0,,,,
2,1.0,NaT,1.0,,,,
3,1.0,NaT,1.0,,,,


In [37]:
# Six rows of standard-normal noise in four columns. No seed is set in this
# cell, so expect different values on each run unless the generator was seeded
# elsewhere.
dataset = pd.DataFrame(
    np.random.randn(6, 4),
    columns=["A", "B", "C", "D"],
)
dataset

Unnamed: 0,A,B,C,D
0,-0.121731,-0.727969,-1.207433,-0.850415
1,0.410449,-1.786633,-0.778909,0.947975
2,-0.263232,0.296698,-0.56609,-0.479173
3,-1.39256,-0.076779,0.779245,0.237906
4,0.776403,-0.460027,0.301186,-0.196244
5,2.179085,-0.381683,-0.825303,0.136121


In [38]:
# Rebuild the random frame (fresh draws, not the values displayed by the
# previous cell) and attach a string label column "E" to filter on.
dataset = pd.DataFrame(
    np.random.randn(6, 4),
    columns=["A", "B", "C", "D"],
).assign(E=["A", "B", "A", "B", "C", "B"])
dataset

Unnamed: 0,A,B,C,D,E
0,-1.395251,1.368938,-0.090796,0.569034,A
1,-0.007963,-1.944891,-0.904368,-1.257741,B
2,1.012817,-0.824051,0.29056,-0.288347,A
3,1.389583,-0.008331,0.128883,-0.851962,B
4,-0.295355,-1.137465,0.135301,-0.46533,C
5,-0.404792,0.563987,-1.292177,-0.600286,B


In [39]:
dataset[dataset["E"].isin(["A","C"])]  # isin builds a boolean mask: keep rows whose E value is one of the listed labels

Unnamed: 0,A,B,C,D,E
0,-1.395251,1.368938,-0.090796,0.569034,A
2,1.012817,-0.824051,0.29056,-0.288347,A
4,-0.295355,-1.137465,0.135301,-0.46533,C


# **Setting Data**

In [40]:
# Six daily dates, used as the row index of a random 6x4 frame.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,2.173924,0.557179,0.766793,-1.26412
2013-01-02,0.067919,-0.008878,2.551963,1.094445
2013-01-03,2.366117,0.00594,0.065039,-0.42102
2013-01-04,1.580842,0.254888,0.605283,-0.4149
2013-01-05,0.758899,-0.910648,1.057905,0.057695
2013-01-06,-0.077508,-1.092109,-2.760095,-0.008394


In [41]:
# Keep only the first four dates and add a new column "E". The new column has
# no data yet, so it shows up as missing values.
df2 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,2.173924,0.557179,0.766793,-1.26412,
2013-01-02,0.067919,-0.008878,2.551963,1.094445,
2013-01-03,2.366117,0.00594,0.065039,-0.42102,
2013-01-04,1.580842,0.254888,0.605283,-0.4149,


In [42]:
# NOTE(review): this cell repeats the previous cell verbatim and simply
# re-creates df2 (first four dates, extra empty column "E"); it looks safe to
# delete.
df2=df.reindex(index=dates[0:4],columns=list(df.columns)+["E"])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,2.173924,0.557179,0.766793,-1.26412,
2013-01-02,0.067919,-0.008878,2.551963,1.094445,
2013-01-03,2.366117,0.00594,0.065039,-0.42102,
2013-01-04,1.580842,0.254888,0.605283,-0.4149,


In [43]:
# Label-based slice assignment: .loc includes both endpoints, so E is set to 1
# for the first two dates while the remaining rows keep their missing value.
# This modifies df2 in place.
df2.loc[dates[0] : dates[1],"E"] = 1
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,2.173924,0.557179,0.766793,-1.26412,1.0
2013-01-02,0.067919,-0.008878,2.551963,1.094445,1.0
2013-01-03,2.366117,0.00594,0.065039,-0.42102,
2013-01-04,1.580842,0.254888,0.605283,-0.4149,


In [44]:
# Return a copy without the rows that contain a missing value; df2 itself is
# not changed.
df2.dropna()

Unnamed: 0,A,B,C,D,E
2013-01-01,2.173924,0.557179,0.766793,-1.26412,1.0
2013-01-02,0.067919,-0.008878,2.551963,1.094445,1.0


In [45]:
# Return a copy in which every missing value is replaced by 3; df2 itself is
# not changed.
df2.fillna(value=3)

Unnamed: 0,A,B,C,D,E
2013-01-01,2.173924,0.557179,0.766793,-1.26412,1.0
2013-01-02,0.067919,-0.008878,2.551963,1.094445,1.0
2013-01-03,2.366117,0.00594,0.065039,-0.42102,3.0
2013-01-04,1.580842,0.254888,0.605283,-0.4149,3.0


In [46]:
# Boolean frame of the same shape: True wherever a value is missing.
pd.isna(df2)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# Operations

In [48]:
# Small 4x6 integer frame for the mean() examples: values are drawn from 1..9
# (the upper bound 10 is exclusive). Note that this rebinds the name `df`.
df = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 6)),
    columns=["A", "B", "C", "D", "E", "F"],
)
df

Unnamed: 0,A,B,C,D,E,F
0,2,8,6,3,4,1
1,3,6,8,9,5,2
2,9,9,1,4,3,5
3,3,5,3,1,2,8


In [49]:
df.mean() # mean of each column (the default, axis=0)

A    4.25
B    7.00
C    4.50
D    4.25
E    3.50
F    4.00
dtype: float64

In [50]:
df.mean(1) # mean of each row (axis=1): one value per row label

0    4.000000
1    5.500000
2    5.166667
3    3.666667
dtype: float64

In [51]:
df.mean(0) # mean of each column (axis=0): one value per column label, same as df.mean()

A    4.25
B    7.00
C    4.50
D    4.25
E    3.50
F    4.00
dtype: float64

In [52]:
import random  # NOTE(review): not used in this cell; kept in case later cells rely on it

# Ten employees: sequential ids 100..109, a randomly chosen department and a
# random salary in the range 1000..9999.
emp = pd.DataFrame(
    {
        "id": np.arange(100, 110),
        "dept": np.random.choice(["HR", "FIN", "MKT", "IT"], size=10),
        "sal": np.random.randint(1000, 10000, size=10),
    }
)
emp

Unnamed: 0,id,dept,sal
0,100,MKT,7066
1,101,IT,3380
2,102,MKT,1835
3,103,FIN,7695
4,104,FIN,9305
5,105,FIN,2299
6,106,MKT,3554
7,107,HR,7089
8,108,MKT,9455
9,109,MKT,8977
