In [None]:
import pandas as pd
import numpy as np

**NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**

**Series:** A one-dimensional labeled array. The object supports both integer- and label-based indexing and provides a host of methods for performing operations involving the index.

In [None]:
# One-dimensional Series holding the integers 1..11 with the default 0..10 integer index.
pd.Series(np.arange(1, 12))

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
dtype: int64

**DataFrame:** Two-dimensional, size-mutable, potentially heterogeneous tabular data.

The data structure also contains labeled axes (rows and columns).

In [None]:
# 0..11 reshaped to 3 rows x 4 columns; with no labels given, rows and columns get default integer labels.
pd.DataFrame(np.arange(12).reshape(3,4))

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [None]:
pd.DataFrame(np.arange(12).reshape(4,3),columns= list('ABC'))  # columns= sets the column labels to A, B, C

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [None]:
pd.DataFrame(np.arange(12).reshape(4,3),columns=list('ABC'),index=list('efgh'))  # index= sets the row labels to e, f, g, h

Unnamed: 0,A,B,C
e,0,1,2
f,3,4,5
g,6,7,8
h,9,10,11


**date_range:** Returns a fixed-frequency DatetimeIndex.

In [None]:
pd.date_range("20130101", periods=6)  # 6 consecutive days starting 2013-01-01, returned as a DatetimeIndex (an immutable container for datetimes)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

**pd.Timestamp:** Pandas replacement for Python's datetime.datetime object.

**pd.Categorical:** Categoricals are a pandas data type that corresponds to the categorical variables in statistics. Such variables take on a fixed and limited number of possible values, for example grades, gender, blood group type, etc.

**foo:** A placeholder name, usually used in example code to signify that the object being named, or the choice of name, is not important.

In [None]:
# Build a DataFrame from a dict of columns. The scalar entries ("A", "B", "F")
# are broadcast to every row; the Series / array / Categorical entries supply
# one value per row.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),  # four 3s stored as int32
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
        "G": pd.Series(np.arange(4)),
    }
)
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
df2.dtypes  # dtype of each column; every column carries its own dtype

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [None]:
df2.head(2)  # first 2 rows (head(n) returns the first n rows)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1


In [None]:
df2.tail(2)  # last 2 rows (tail(n) returns the last n rows)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
df2.index  # the row labels of the DataFrame

Int64Index([0, 1, 2, 3], dtype='int64')

In [None]:
df2.columns  # the column labels of the DataFrame

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [None]:
# to_numpy() is expensive here: the columns of df2 have different dtypes, so
# pandas has to cast every value to one common dtype (object) for the array.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 0],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 1],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo', 2],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo', 3]],
      dtype=object)

In [None]:
df2.describe()  # summary statistics (count, mean, std, min, quartiles, max); only the numeric columns A, C, D, G are included

Unnamed: 0,A,C,D,G
count,4.0,4.0,4.0,4.0
mean,1.0,1.0,3.0,1.5
std,0.0,0.0,0.0,1.290994
min,1.0,1.0,3.0,0.0
25%,1.0,1.0,3.0,0.75
50%,1.0,1.0,3.0,1.5
75%,1.0,1.0,3.0,2.25
max,1.0,1.0,3.0,3.0


In [None]:


df2.to_numpy()  # parentheses are required: a bare `df2.to_numpy` only displays the bound method instead of returning the array

<bound method DataFrame.to_numpy of      A          B    C  D      E    F  G
0  1.0 2013-01-02  1.0  3   test  foo  0
1  1.0 2013-01-02  1.0  3  train  foo  1
2  1.0 2013-01-02  1.0  3   test  foo  2
3  1.0 2013-01-02  1.0  3  train  foo  3>

In [None]:
df2.sort_index(axis=0, ascending=False)  # sort by the row labels (axis=0), in descending order

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,foo,3
2,1.0,2013-01-02,1.0,3,test,foo,2
1,1.0,2013-01-02,1.0,3,train,foo,1
0,1.0,2013-01-02,1.0,3,test,foo,0


In [None]:
df2.sort_index(axis=1,ascending=False)  # sort by the column labels (axis=1), in descending order

Unnamed: 0,G,F,E,D,C,B,A
0,0,foo,test,3,1.0,2013-01-02,1.0
1,1,foo,train,3,1.0,2013-01-02,1.0
2,2,foo,test,3,1.0,2013-01-02,1.0
3,3,foo,train,3,1.0,2013-01-02,1.0


**Selection**

Pandas provides optimized data access methods: `.at`, `.iat`, `.loc` and `.iloc`.

In [None]:
df2[1:3]  # slicing with [] selects rows: here rows 1 and 2 (the end of the slice is excluded)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2


**.loc** is primarily label based, but may also be used with a boolean array.

In [None]:
df2.loc[[2,0,3]]  # select rows by label; the result keeps the order given in the list

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2
0,1.0,2013-01-02,1.0,3,test,foo,0
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
df2.loc[1:3,["C","D"]]  # label slice 1:3 includes both endpoints (rows 1, 2, 3); columns are picked by label

Unnamed: 0,C,D
1,1.0,3
2,1.0,3
3,1.0,3


**.iloc** is primarily integer position based (from 0 to length-1 of the axis), but may also be used with a boolean array. 

In [None]:
df2.iloc[1:3, 2:5]  # position slices exclude the end: row positions 1-2 and column positions 2-4 (C, D, E)

Unnamed: 0,C,D,E
1,1.0,3,train
2,1.0,3,test


In [None]:
df2.iloc[[1, 2, 3], [0, 2]]  # rows at positions 1, 2, 3 and columns at positions 0 and 2 (A and C)

Unnamed: 0,A,C
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0


In [None]:
df2.iloc[1,1]     # single value at row position 1, column position 1

Timestamp('2013-01-02 00:00:00')

**.iat** accesses a single value for a row/column pair by integer position.

In [None]:
df2.iat[1, 1]  # single value at row position 1, column position 1

Timestamp('2013-01-02 00:00:00')

**.at**
Access a single value for a row/column label pair.

In [None]:
df2.at[2, "C"]  # single value at row label 2, column label "C"

1.0

**Boolean Indexing**

In [None]:
df2[df2["A"]>0]  # boolean Series mask: keep the rows where column A is greater than 0

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,0
1,1.0,2013-01-02,1.0,3,train,foo,1
2,1.0,2013-01-02,1.0,3,test,foo,2
3,1.0,2013-01-02,1.0,3,train,foo,3


In [None]:
# Boolean DataFrame mask: the result keeps df2's shape; values are kept where the
# condition holds, and every other cell (including the columns not in the mask) becomes NaN/NaT.
df2[df2[["A","C"]]>0.0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,NaT,1.0,,,,
1,1.0,NaT,1.0,,,,
2,1.0,NaT,1.0,,,,
3,1.0,NaT,1.0,,,,


In [None]:
# 6x4 frame of random normal draws in columns A-D, extended with a string
# label column "E" in the same expression.
df = pd.DataFrame(
    np.random.randn(6, 4),
    columns=list("ABCD"),
).assign(E=list("ABABCB"))
df

Unnamed: 0,A,B,C,D,E
0,0.175241,-1.495411,-0.529236,0.366047,A
1,0.420534,1.497993,-2.05745,-0.901799,B
2,-0.116838,0.288017,0.361347,-1.459866,A
3,0.054767,-0.661701,0.75508,1.063681,B
4,0.546828,-0.131664,0.231393,0.585213,C
5,0.811476,-1.406404,0.753434,0.992653,B


**.isin** Checks whether each element of the Series/DataFrame is contained in the given values.

In [None]:
df[df["E"].isin(["A","C"])]        # keep only the rows whose E value is "A" or "C"

Unnamed: 0,A,B,C,D,E
0,0.175241,-1.495411,-0.529236,0.366047,A
2,-0.116838,0.288017,0.361347,-1.459866,A
4,0.546828,-0.131664,0.231393,0.585213,C


**Setting data**

In [None]:
# Six consecutive days starting 2013-01-01, used as the row labels of `df`.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)

# Mixed-dtype frame: scalar entries are broadcast to every row.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),  # four 3s stored as int32
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [None]:
# Re-label `df` onto the first four dates and add one extra column label "E";
# labels that do not exist in `df` (here the whole E column) come back as NaN.
df1 = df.reindex(
    index=dates[:4],
    columns=[*df.columns, "E"],
)
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.549379,-0.259074,0.229716,-1.49323,
2013-01-02,-1.895075,-1.319006,-1.285316,1.086855,
2013-01-03,-0.863804,1.115733,-0.583203,0.681543,
2013-01-04,1.633227,0.973444,0.18321,0.831522,


In [None]:
# Label-based slice: both endpoints (dates[0] and dates[1]) are included,
# so E is set to 1 on the first two rows; df1 is modified in place.
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.549379,-0.259074,0.229716,-1.49323,1.0
2013-01-02,-1.895075,-1.319006,-1.285316,1.086855,1.0
2013-01-03,-0.863804,1.115733,-0.583203,0.681543,
2013-01-04,1.633227,0.973444,0.18321,0.831522,


**.dropna** Remove missing values.

In [None]:
df1.dropna(how="any")  # drop every row that contains at least one missing value; returns a new frame, df1 itself is unchanged

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.549379,-0.259074,0.229716,-1.49323,1.0
2013-01-02,-1.895075,-1.319006,-1.285316,1.086855,1.0


**.fillna** Fill NA/NaN values with a specified value.

In [None]:
df1.fillna(value=5)  # replace each missing value with 5 (shown as 5.0 in the float column E)

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.549379,-0.259074,0.229716,-1.49323,1.0
2013-01-02,-1.895075,-1.319006,-1.285316,1.086855,1.0
2013-01-03,-0.863804,1.115733,-0.583203,0.681543,5.0
2013-01-04,1.633227,0.973444,0.18321,0.831522,5.0


**.isna** Detect missing values for an array-like object

In [None]:
pd.isna(df1)  # boolean frame of the same shape: True where a value is missing

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


**Operations**

In [None]:
# 4x6 frame of random integers drawn with low=1, high=10.
# NOTE(review): this rebinds `df`, replacing the date-indexed frame created earlier.
df = pd.DataFrame(np.random.randint(low=1, high=10, size=(4,6)),columns=list("ABCDEF"))
df

Unnamed: 0,A,B,C,D,E,F
0,3,4,7,1,3,5
1,8,8,2,1,3,8
2,9,1,1,6,8,1
3,7,5,3,7,9,5


In [None]:
df.mean()   # mean of each column (one value per column label)

A    6.75
B    4.50
C    3.25
D    3.75
E    5.75
F    4.75
dtype: float64

In [None]:
df.mean(axis=1)  # mean of each row: axis=1 averages across the columns

In [None]:
import random  # NOTE(review): not used in the visible cells; consider moving it to the top import cell

# Build the 10-row employee table in one constructor call.
# np.random.choice draws `size` values (with replacement) from the given list
# and returns a NumPy array.
emp = pd.DataFrame(
    {
        "id": np.arange(100, 110),
        "dept": np.random.choice(["HR", "FIN", "MKT", "IT"], size=(10,)),
        "sal": np.random.randint(low=1000, high=10000, size=(10,)),
    }
)
emp

Unnamed: 0,id,dept,sal
0,100,IT,2430
1,101,IT,2678
2,102,FIN,6132
3,103,MKT,9981
4,104,IT,9981
5,105,MKT,7488
6,106,HR,2179
7,107,IT,9322
8,108,HR,5385
9,109,MKT,1487
