In [None]:
import numpy as np
import pandas as pd

In [None]:
# NumPy arrays have one dtype for the entire array, while pandas DataFrames
# have one dtype per column. A Series is a single column, so it carries a
# single dtype (printed on the last line of the output).
pd.Series(np.arange(12))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
dtype: int64

In [None]:
# Reshape the 12 values 0..11 into 4 rows x 3 columns. No labels are given,
# so pandas numbers both the rows and the columns starting from 0.
pd.DataFrame(np.arange(12).reshape(4,3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [None]:
# Same 4x3 data, now with explicit column labels 'A', 'B', 'C'.
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'))

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [None]:
# Label both axes: columns 'A'..'C' and row index 'e'..'h'.
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'),index=list('efgh'))

Unnamed: 0,A,B,C
e,0,1,2
f,3,4,5
g,6,7,8
h,9,10,11


In [None]:
   
# Build a small frame in which every column has a different dtype. Scalar
# entries ("A", "B", "F") are broadcast to the length of the array-like
# columns, giving 4 rows.
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.full(4, 3, dtype="int32"),
    "E": pd.Categorical(["test", "train"] * 2),
    "F": "foo",
    "G": pd.Series(np.arange(4)),
})
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
# One dtype per column; the result is itself a Series indexed by column name.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [None]:
# Show the first 2 rows (head() returns 5 rows when called without an argument).
df2.head(2)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1


In [None]:
# Show the last 2 rows (tail() returns 5 rows when called without an argument).
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
# Row labels of the frame.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [None]:
# Column labels of the frame.
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [None]:
# Convert the frame to a NumPy array. The columns have different dtypes, so
# the array falls back to one common dtype for all values (object here).
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 0],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 1],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 2],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 3]],
      dtype=object)

In [None]:
# Summary statistics per column; as the output shows, only the numeric
# columns (A, C, D, G) are summarized by default.
df2.describe()

Unnamed: 0,A,C,D,G
count,4.0,4.0,4.0,4.0
mean,1.0,1.0,3.0,1.5
std,0.0,0.0,0.0,1.290994
min,1.0,1.0,3.0,0.0
25%,1.0,1.0,3.0,0.75
50%,1.0,1.0,3.0,1.5
75%,1.0,1.0,3.0,2.25
max,1.0,1.0,3.0,3.0


In [None]:
# Sort rows by their index labels (axis=0). ascending=False puts the largest
# label first, so with labels 0..3 the rows come out as 3, 2, 1, 0.
df2.sort_index(axis=0, ascending=False)

NameError: ignored

In [None]:
# Sort columns by their labels (axis=1). ascending=False reverses the
# alphabetical order, so the columns come out as G, F, ..., A.
df2.sort_index(axis=1, ascending=False)

Unnamed: 0,G,F,E,D,C,B,A
0,0,foo,test,3,1.0,2013-01-02,1.0
1,1,foo,train,3,1.0,2013-01-02,1.0
2,2,foo,test,3,1.0,2013-01-02,1.0
3,3,foo,train,3,1.0,2013-01-02,1.0


In [None]:
# Sort rows by the values in column "B". Every row holds the same timestamp
# in this frame, which is why the output shows the rows in their original order.
df2.sort_values(by="B")

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


Selection
For production code, prefer the optimized pandas data access methods: `.at`, `.iat`, `.loc` and `.iloc`.


In [None]:
# Plain [] indexing with a slice selects rows by position (here rows 1 and 2).
# Unlike NumPy, [] does not accept a (row, column) pair; to select on both
# axes at once use .loc (labels) or .iloc (positions).
df2[1:3]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2


In [None]:
# Select rows by index label; the result follows the order of the requested
# labels (2, 0, 3), not the order in the frame.
df2.loc[[2,0,3]]

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
# Label-based selection: rows labelled 1 through 3 (a label slice includes
# its end label) and the columns "C" and "D".
df2.loc[1:3,["C","D"]]

Unnamed: 0,C,D
1,1.0,3
2,1.0,3
3,1.0,3


In [None]:
# Position-based selection: rows 1-2 and columns 2-4 (a positional slice
# excludes its end position).
df2.iloc[1:3, 2:5]

Unnamed: 0,C,D,E
1,1.0,3,train
2,1.0,3,test


In [None]:
# Position-based selection with explicit lists: rows 1, 2, 3 and columns 0 and 2.
df2.iloc[[1, 2, 3], [0, 2]]


Unnamed: 0,A,C
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0


In [None]:
# Scalar lookup by position: the value at row 1, column 1.
df2.iloc[1,1]

Timestamp('2013-01-02 00:00:00')

In [None]:
# Faster scalar lookup by position; returns the same value as df2.iloc[1, 1].
df2.iat[1, 1]


Timestamp('2013-01-02 00:00:00')

Boolean Indexing

In [None]:
# Boolean mask built from a single column: keep the rows where A > 0.
df2[df2["A"]>0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
# Boolean mask shaped like a frame: cells where the mask is True keep their
# value; every other cell, including the columns that are not part of the
# mask, comes back as missing (NaN / NaT), as the output shows.
df2[df2[["A","C"]]>0.0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,NaT,1.0,,,,
1,1.0,NaT,1.0,,,,
2,1.0,NaT,1.0,,,,
3,1.0,NaT,1.0,,,,


In [None]:
# Seed NumPy's global generator so the random values drawn here (and by any
# later np.random call) are the same on every Restart & Run All.
np.random.seed(42)

# 6x4 frame of standard-normal draws, plus a string column "E" to filter on.
df = pd.DataFrame(np.random.randn(6,4),columns=list("ABCD"))
df["E"] = list("ABABCB")
df


Unnamed: 0,A,B,C,D,E
0,-0.31022,-0.922255,-0.03211,-0.115059,A
1,-1.023849,-0.690602,0.444947,-2.084597,B
2,0.277101,1.07254,0.775123,1.284252,A
3,-1.275746,-1.150907,-2.706349,0.168237,B
4,-0.431998,-2.618386,-1.131176,0.295557,C
5,0.140618,1.576269,-0.077323,1.248548,B


In [None]:
# Keep the rows whose "E" value is one of the listed values ("A" or "C").
df[df["E"].isin(["A","C"])]


Unnamed: 0,A,B,C,D,E
0,-0.31022,-0.922255,-0.03211,-0.115059,A
2,0.277101,1.07254,0.775123,1.284252,A
4,-0.431998,-2.618386,-1.131176,0.295557,C


Setting data

In [None]:
# `df` is rebuilt with a date index: six consecutive days starting 2013-01-01
# and four columns of standard-normal draws.
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# `df2` is redefined here without the "G" column; scalar entries are
# broadcast to the 4 rows implied by the array-like columns.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

In [None]:
# Keep the first four dates and add a new column label "E"; as the output
# shows, reindex fills the cells of a label that did not exist before with NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.511981,-0.619044,-0.833464,-1.572507,
2013-01-02,0.589373,1.27038,1.365332,0.300444,
2013-01-03,0.297374,0.208122,0.043534,0.047723,
2013-01-04,-0.568492,0.746082,-1.295042,-0.303325,


In [None]:
# Set column "E" to 1 for the rows from dates[0] through dates[1]. Label-based
# slices include both endpoints, so two rows are filled; the rest stay NaN.
# This modifies df1 in place.
df1.loc[dates[0] : dates[1], "E"] = 1
df1


Unnamed: 0,A,B,C,D,E
2013-01-01,-1.511981,-0.619044,-0.833464,-1.572507,1.0
2013-01-02,0.589373,1.27038,1.365332,0.300444,1.0
2013-01-03,0.297374,0.208122,0.043534,0.047723,
2013-01-04,-0.568492,0.746082,-1.295042,-0.303325,


In [None]:
# Drop every row that has at least one missing value. The result is a new
# frame; df1 itself is not modified.
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.511981,-0.619044,-0.833464,-1.572507,1.0
2013-01-02,0.589373,1.27038,1.365332,0.300444,1.0


In [None]:
# Replace missing values with 5. The result is a new frame; df1 itself is
# not modified.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.511981,-0.619044,-0.833464,-1.572507,1.0
2013-01-02,0.589373,1.27038,1.365332,0.300444,1.0
2013-01-03,0.297374,0.208122,0.043534,0.047723,5.0
2013-01-04,-0.568492,0.746082,-1.295042,-0.303325,5.0


In [None]:
# Boolean frame of the same shape as df1: True where a value is missing.
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


In [None]:
# 4x6 frame of random integers drawn from 1..9 (the upper bound 10 is
# exclusive), with columns labelled "A".."F".
df = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 6)),
    columns=["A", "B", "C", "D", "E", "F"],
)
df

Unnamed: 0,A,B,C,D,E,F
0,4,8,1,8,3,5
1,4,1,1,3,6,6
2,2,9,9,2,1,9
3,3,1,3,3,8,1


In [None]:
# Mean of each column; the result is a Series indexed by column label.
df.mean()

A    3.25
B    4.75
C    3.50
D    4.00
E    4.50
F    5.25
dtype: float64

In [None]:
# NOTE: `random` is imported here but not used in this cell.
import random

# Toy employee table with 10 rows: sequential ids 100..109, a department
# picked at random from four choices, and a random salary in [1000, 10000).
emp = pd.DataFrame(
    {
        "id": np.arange(100, 110),
        "dept": np.random.choice(["HR", "FIN", "MKT", "IT"], size=(10,)),
        "sal": np.random.randint(low=1000, high=10000, size=(10,)),
    }
)
emp

Unnamed: 0,id,dept,sal
0,100,HR,8499
1,101,IT,9779
2,102,MKT,1350
3,103,IT,9517
4,104,MKT,5854
5,105,FIN,6971
6,106,IT,5194
7,107,HR,4497
8,108,IT,6455
9,109,HR,5248


In [None]:
# Read sheet "Sheet1" of foo.xlsx, treating the string "NA" as a missing value.
# NOTE(review): no visible cell writes foo.xlsx, so the file must already
# exist in the working directory — confirm, or add a cell that creates it.
pd.read_excel(
    "foo.xlsx",
    sheet_name="Sheet1",
    index_col=None,
    na_values=["NA"],
)

NameError: ignored