### Python data analysis libraries
- Pandas - Library for data analysis tools to do data manipulation and analysis.
- Matplotlib - Library for data visualization
- NamPy - Fundamental library for scientific computing

In [1]:
# Importing pandas and numpy libraries
import pandas as pd
import numpy as np

#### Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,built on top of the Python programming language.
Important pandas data structures:
- List
- DataFrame

#### Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,built on top of the Python programming language.

Important pandas data structures:

- List        (Create using pandas Series() method)
- DataFrame   (Create using pandas DataFrame() method)

In [2]:
# Create a series from a list. Pandas will create a default integer index.

myList = [11, 12, 13, 14, 15]
s1 = pd.Series(myList)
s1

0    11
1    12
2    13
3    14
4    15
dtype: int64

#### Create a series from a list. Give the list an order of your choice

In [3]:
myList = [21, 22, 23, 24, 25]
order = [1,2,3,4,5]
s2 = pd.Series(myList, index = order)
s2

1    21
2    22
3    23
4    24
5    25
dtype: int64

#### Create a random numpy array and index the random elements with non-integer index

In [4]:
nparr = np.random.randn(5)         # create a random numpy array
index = ['a','b','c','d','e']      # create a non-integer index
s3 = pd.Series(nparr, index=index) # create a panda series using random numpy array
s3

a   -0.015086
b   -0.135006
c   -1.315928
d   -2.260937
e    1.972498
dtype: float64

#### Create a Series by passing a list of values, letting pandas to create a default integer index.

In [5]:
s4 = pd.Series([9, 2, 3, np.nan, 16, 28])
s4

0     9.0
1     2.0
2     3.0
3     NaN
4    16.0
5    28.0
dtype: float64

#### Modifying the index of series

In [6]:
print(s4)
s4.index = ['a','b','c','d','e','f']   # modifying the index 0,1,2,3,4,5 to 'a','b','c','d','e','f'
s4

0     9.0
1     2.0
2     3.0
3     NaN
4    16.0
5    28.0
dtype: float64


a     9.0
b     2.0
c     3.0
d     NaN
e    16.0
f    28.0
dtype: float64

#### Create a DataFrame by passing a NumPy array, with timestamp index and labeled columns

In [7]:
dates = pd.date_range('20210101', periods = 12)
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df = pd.DataFrame(np.random.randn(12,4), index=dates, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2021-01-01,-1.758891,-1.628457,-0.160035,0.352271
2021-01-02,-0.474657,-0.072683,0.002751,-1.353994
2021-01-03,-0.530687,0.344695,-0.056747,-0.828139
2021-01-04,0.113745,-0.968571,0.641456,0.774153
2021-01-05,-0.875608,-1.534964,-0.386943,1.832037
2021-01-06,-0.259117,-1.562746,-1.351716,-1.417949
2021-01-07,-0.808753,0.200855,-0.271415,-0.201864
2021-01-08,1.132561,-1.12355,-1.012721,-0.590054
2021-01-09,0.361329,-1.042833,0.853928,-0.589723
2021-01-10,0.109946,0.494841,-0.730551,0.601187


#### Create a DataFrame by passing a dictionary of objects that can be converted to series-like

In [9]:
df2 = pd.DataFrame(\
                   {"A": 1.0,\
                    "B": pd.Timestamp('20210707'),\
                    "C": pd.Series(1, index=list(range(5)), dtype = 'float32'),\
                    "D": np.array([3] * 5, dtype = 'int32'),\
                    "E": pd.Categorical(["Test", "Train", "Test", "Test", "Train"]),\
                    "F": "Hello",
                   }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-07-07,1.0,3,Test,Hello
1,1.0,2021-07-07,1.0,3,Train,Hello
2,1.0,2021-07-07,1.0,3,Test,Hello
3,1.0,2021-07-07,1.0,3,Test,Hello
4,1.0,2021-07-07,1.0,3,Train,Hello


#### Note that the data types of each columns are different

In [10]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing data

#### Following are example on how to view the top and bottom rows of the frame

In [11]:
df.head(6)

Unnamed: 0,A,B,C,D
2021-01-01,-1.758891,-1.628457,-0.160035,0.352271
2021-01-02,-0.474657,-0.072683,0.002751,-1.353994
2021-01-03,-0.530687,0.344695,-0.056747,-0.828139
2021-01-04,0.113745,-0.968571,0.641456,0.774153
2021-01-05,-0.875608,-1.534964,-0.386943,1.832037
2021-01-06,-0.259117,-1.562746,-1.351716,-1.417949


In [12]:
df.tail(3)

Unnamed: 0,A,B,C,D
2021-01-10,0.109946,0.494841,-0.730551,0.601187
2021-01-11,-0.894102,-0.108319,-0.460409,1.232336
2021-01-12,0.836325,-0.05864,2.466474,-1.520109


#### Display the index and column labels

In [13]:
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

#### Describing the statistic summary of the data in the dataframe using the pandas describe() method

In [15]:
df.describe()

Unnamed: 0,A,B,C,D
count,12.0,12.0,12.0,12.0
mean,-0.253992,-0.588364,-0.038827,-0.142487
std,0.81193,0.798212,1.002574,1.099311
min,-1.758891,-1.628457,-1.351716,-1.520109
25%,-0.825467,-1.226404,-0.527944,-0.959603
50%,-0.366887,-0.538445,-0.215725,-0.395793
75%,0.175641,0.006234,0.162427,0.644429
max,1.132561,0.494841,2.466474,1.832037


#### We can tranpose the datafrom using the T method.

In [16]:
df = pd.DataFrame(np.random.randn(12,4), index=dates, columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-03,-0.278951,1.526653,-1.872899,1.492121
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-10,-0.194615,0.163406,-0.013623,0.467213


##### The transpose format of the df

In [17]:
df.T

Unnamed: 0,2021-01-01,2021-01-02,2021-01-03,2021-01-04,2021-01-05,2021-01-06,2021-01-07,2021-01-08,2021-01-09,2021-01-10,2021-01-11,2021-01-12
A,1.678806,-0.544359,-0.278951,-0.567785,-1.72224,0.49141,-0.135242,0.293885,0.694346,-0.194615,1.472092,0.448518
B,0.511775,0.323428,1.526653,-0.217683,0.682953,-1.57806,-0.423931,0.402565,0.779812,0.163406,-0.137168,1.317838
C,0.232081,0.406318,-1.872899,-1.82983,-0.632491,-0.546541,-0.94735,0.35757,0.110558,-0.013623,0.964988,-0.941819
D,0.961997,0.311305,1.492121,0.811737,0.943979,-0.721281,-1.150247,3.001092,1.459886,0.467213,-1.188975,0.857074


#### Sorting by an axis

In [18]:
df.sort_index(axis=1, ascending=False)   # Sort by columns' header (label)

Unnamed: 0,D,C,B,A
2021-01-01,0.961997,0.232081,0.511775,1.678806
2021-01-02,0.311305,0.406318,0.323428,-0.544359
2021-01-03,1.492121,-1.872899,1.526653,-0.278951
2021-01-04,0.811737,-1.82983,-0.217683,-0.567785
2021-01-05,0.943979,-0.632491,0.682953,-1.72224
2021-01-06,-0.721281,-0.546541,-1.57806,0.49141
2021-01-07,-1.150247,-0.94735,-0.423931,-0.135242
2021-01-08,3.001092,0.35757,0.402565,0.293885
2021-01-09,1.459886,0.110558,0.779812,0.694346
2021-01-10,0.467213,-0.013623,0.163406,-0.194615


In [19]:
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-03,-0.278951,1.526653,-1.872899,1.492121
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-10,-0.194615,0.163406,-0.013623,0.467213


In [20]:
df.sort_index(axis=1)     # by default header is in ascending order

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-03,-0.278951,1.526653,-1.872899,1.492121
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-10,-0.194615,0.163406,-0.013623,0.467213


#### Sorting by the values of the dataframe

In [21]:
df.sort_values(by='B', ascending = False)

Unnamed: 0,A,B,C,D
2021-01-03,-0.278951,1.526653,-1.872899,1.492121
2021-01-12,0.448518,1.317838,-0.941819,0.857074
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-10,-0.194615,0.163406,-0.013623,0.467213
2021-01-11,1.472092,-0.137168,0.964988,-1.188975
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737


In [22]:
df.sort_values(by='B')   # By default, the sorting order is ascending

Unnamed: 0,A,B,C,D
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-11,1.472092,-0.137168,0.964988,-1.188975
2021-01-10,-0.194615,0.163406,-0.013623,0.467213
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-09,0.694346,0.779812,0.110558,1.459886


#### Selecting (projecting) a single column, which yields a Series, equivalent to df.A

In [23]:
df['A']

2021-01-01    1.678806
2021-01-02   -0.544359
2021-01-03   -0.278951
2021-01-04   -0.567785
2021-01-05   -1.722240
2021-01-06    0.491410
2021-01-07   -0.135242
2021-01-08    0.293885
2021-01-09    0.694346
2021-01-10   -0.194615
2021-01-11    1.472092
2021-01-12    0.448518
Freq: D, Name: A, dtype: float64

In [24]:
df['C']

2021-01-01    0.232081
2021-01-02    0.406318
2021-01-03   -1.872899
2021-01-04   -1.829830
2021-01-05   -0.632491
2021-01-06   -0.546541
2021-01-07   -0.947350
2021-01-08    0.357570
2021-01-09    0.110558
2021-01-10   -0.013623
2021-01-11    0.964988
2021-01-12   -0.941819
Freq: D, Name: C, dtype: float64

In [25]:
df.C

2021-01-01    0.232081
2021-01-02    0.406318
2021-01-03   -1.872899
2021-01-04   -1.829830
2021-01-05   -0.632491
2021-01-06   -0.546541
2021-01-07   -0.947350
2021-01-08    0.357570
2021-01-09    0.110558
2021-01-10   -0.013623
2021-01-11    0.964988
2021-01-12   -0.941819
Freq: D, Name: C, dtype: float64

In [26]:
df.loc[:,['B','D','A']]

Unnamed: 0,B,D,A
2021-01-01,0.511775,0.961997,1.678806
2021-01-02,0.323428,0.311305,-0.544359
2021-01-03,1.526653,1.492121,-0.278951
2021-01-04,-0.217683,0.811737,-0.567785
2021-01-05,0.682953,0.943979,-1.72224
2021-01-06,-1.57806,-0.721281,0.49141
2021-01-07,-0.423931,-1.150247,-0.135242
2021-01-08,0.402565,3.001092,0.293885
2021-01-09,0.779812,1.459886,0.694346
2021-01-10,0.163406,0.467213,-0.194615


#### Selecting via [], which slices the rows.

In [27]:
df[0:3]

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-03,-0.278951,1.526653,-1.872899,1.492121


In [28]:
df["20210104":"20210106"]

Unnamed: 0,A,B,C,D
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281


In [29]:
df[3:]    # display from fourth rows to the end

Unnamed: 0,A,B,C,D
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-10,-0.194615,0.163406,-0.013623,0.467213
2021-01-11,1.472092,-0.137168,0.964988,-1.188975
2021-01-12,0.448518,1.317838,-0.941819,0.857074


In [30]:
df[2:8]

Unnamed: 0,A,B,C,D
2021-01-03,-0.278951,1.526653,-1.872899,1.492121
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-08,0.293885,0.402565,0.35757,3.001092


#### Getting a cross section using a label

In [31]:
df[0:1]

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997


In [32]:
df.loc[dates[0]]

A    1.678806
B    0.511775
C    0.232081
D    0.961997
Name: 2021-01-01 00:00:00, dtype: float64

In [33]:
df.loc[dates[7]]

A    0.293885
B    0.402565
C    0.357570
D    3.001092
Name: 2021-01-08 00:00:00, dtype: float64

#### Selecting on a multi-axis by label

In [34]:
df.loc[:,["A","C"]]

Unnamed: 0,A,C
2021-01-01,1.678806,0.232081
2021-01-02,-0.544359,0.406318
2021-01-03,-0.278951,-1.872899
2021-01-04,-0.567785,-1.82983
2021-01-05,-1.72224,-0.632491
2021-01-06,0.49141,-0.546541
2021-01-07,-0.135242,-0.94735
2021-01-08,0.293885,0.35757
2021-01-09,0.694346,0.110558
2021-01-10,-0.194615,-0.013623


#### Showing label slicing, both endpoints are included

In [35]:
df.loc["20210102":"20210104", ["A", "B"]]

Unnamed: 0,A,B
2021-01-02,-0.544359,0.323428
2021-01-03,-0.278951,1.526653
2021-01-04,-0.567785,-0.217683


In [36]:
df.loc["20210110":,["A", "B", "C", "D"]]

Unnamed: 0,A,B,C,D
2021-01-10,-0.194615,0.163406,-0.013623,0.467213
2021-01-11,1.472092,-0.137168,0.964988,-1.188975
2021-01-12,0.448518,1.317838,-0.941819,0.857074


#### Reduction in the dimensions of the returned object

In [37]:
df.loc["20210102",["A","B"]]

A   -0.544359
B    0.323428
Name: 2021-01-02 00:00:00, dtype: float64

#### For getting a scalar value:
#### Scalars are single values representing one unit of data, such as an integer or boolean, instead of data structure like tuple, which composed of many scalars values.

In [38]:
df.loc[dates[1], "A"]

-0.5443586217730633

#### Alternative method for getting fast access to a scala (equivalent to the previous method)

In [39]:
df.at[dates[1], "A"]

-0.5443586217730633

### Selection by postion

#### Select via the position of the passed integers.

In [40]:
df.iloc[9]

A   -0.194615
B    0.163406
C   -0.013623
D    0.467213
Name: 2021-01-10 00:00:00, dtype: float64

In [41]:
#### Integer slices is similar to Numpy/Python

In [42]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2021-01-04,-0.567785,-0.217683
2021-01-05,-1.72224,0.682953


#### Integer position locations is similar to the NumPy/Python

In [65]:
print(df)
df.iloc[[1,2,4],[0,2]]
df1=df.iloc[[1,2,4],[0,2]]
print(df1)

                   A         B         C  D    F
2021-01-01  0.000000  0.000000  0.232081  5  NaN
2021-01-02 -0.544359  0.323428  0.406318  5  1.0
2021-01-03 -0.278951  1.526653 -1.872899  5  2.0
2021-01-04 -0.567785 -0.217683 -1.829830  5  3.0
2021-01-05 -1.722240  0.682953 -0.632491  5  4.0
2021-01-06  0.491410 -1.578060 -0.546541  5  5.0
2021-01-07 -0.135242 -0.423931 -0.947350  5  6.0
2021-01-08  0.293885  0.402565  0.357570  5  NaN
2021-01-09  0.694346  0.779812  0.110558  5  NaN
2021-01-10 -0.194615  0.163406 -0.013623  5  NaN
2021-01-11  1.472092 -0.137168  0.964988  5  NaN
2021-01-12  0.448518  1.317838 -0.941819  5  NaN
                   A         C
2021-01-02 -0.544359  0.406318
2021-01-03 -0.278951 -1.872899
2021-01-05 -1.722240 -0.632491


#### Slicing rows explicity

In [44]:
print(df)
df.iloc[1:3,:]

                   A         B         C         D
2021-01-01  1.678806  0.511775  0.232081  0.961997
2021-01-02 -0.544359  0.323428  0.406318  0.311305
2021-01-03 -0.278951  1.526653 -1.872899  1.492121
2021-01-04 -0.567785 -0.217683 -1.829830  0.811737
2021-01-05 -1.722240  0.682953 -0.632491  0.943979
2021-01-06  0.491410 -1.578060 -0.546541 -0.721281
2021-01-07 -0.135242 -0.423931 -0.947350 -1.150247
2021-01-08  0.293885  0.402565  0.357570  3.001092
2021-01-09  0.694346  0.779812  0.110558  1.459886
2021-01-10 -0.194615  0.163406 -0.013623  0.467213
2021-01-11  1.472092 -0.137168  0.964988 -1.188975
2021-01-12  0.448518  1.317838 -0.941819  0.857074


Unnamed: 0,A,B,C,D
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-03,-0.278951,1.526653,-1.872899,1.492121


#### Slicing columns explicitly

In [45]:
print(df)
df.iloc[:,1:3]
df

                   A         B         C         D
2021-01-01  1.678806  0.511775  0.232081  0.961997
2021-01-02 -0.544359  0.323428  0.406318  0.311305
2021-01-03 -0.278951  1.526653 -1.872899  1.492121
2021-01-04 -0.567785 -0.217683 -1.829830  0.811737
2021-01-05 -1.722240  0.682953 -0.632491  0.943979
2021-01-06  0.491410 -1.578060 -0.546541 -0.721281
2021-01-07 -0.135242 -0.423931 -0.947350 -1.150247
2021-01-08  0.293885  0.402565  0.357570  3.001092
2021-01-09  0.694346  0.779812  0.110558  1.459886
2021-01-10 -0.194615  0.163406 -0.013623  0.467213
2021-01-11  1.472092 -0.137168  0.964988 -1.188975
2021-01-12  0.448518  1.317838 -0.941819  0.857074


Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-03,-0.278951,1.526653,-1.872899,1.492121
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737
2021-01-05,-1.72224,0.682953,-0.632491,0.943979
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-10,-0.194615,0.163406,-0.013623,0.467213


#### getting values from dataframe explicitly

In [46]:
df.iloc[1,1]

0.3234284939574062

#### faster if using iat.

In [47]:
df.iat[1,1]   # faster access to scala value equivalent as previous method

0.3234284939574062

### Boolen indexing
Using a single column's values to select data

In [48]:
df[df['C'] > 0]

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,-0.544359,0.323428,0.406318,0.311305
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-11,1.472092,-0.137168,0.964988,-1.188975


#### Selecting values from a DataFrame where a boolean condition is met.

In [49]:
df[df > 0]

Unnamed: 0,A,B,C,D
2021-01-01,1.678806,0.511775,0.232081,0.961997
2021-01-02,,0.323428,0.406318,0.311305
2021-01-03,,1.526653,,1.492121
2021-01-04,,,,0.811737
2021-01-05,,0.682953,,0.943979
2021-01-06,0.49141,,,
2021-01-07,,,,
2021-01-08,0.293885,0.402565,0.35757,3.001092
2021-01-09,0.694346,0.779812,0.110558,1.459886
2021-01-10,,0.163406,,0.467213


#### Using the isin() method for filtering

In [50]:
df2 = df.copy()
df2['E'] = ['Q1','Q1','Q1','Q2','Q2','Q2','Q3','Q3','Q3','Q4','Q4','Q4']
print(df2)
#
df2[df2['E'].isin(['Q1', 'Q3'])]

                   A         B         C         D   E
2021-01-01  1.678806  0.511775  0.232081  0.961997  Q1
2021-01-02 -0.544359  0.323428  0.406318  0.311305  Q1
2021-01-03 -0.278951  1.526653 -1.872899  1.492121  Q1
2021-01-04 -0.567785 -0.217683 -1.829830  0.811737  Q2
2021-01-05 -1.722240  0.682953 -0.632491  0.943979  Q2
2021-01-06  0.491410 -1.578060 -0.546541 -0.721281  Q2
2021-01-07 -0.135242 -0.423931 -0.947350 -1.150247  Q3
2021-01-08  0.293885  0.402565  0.357570  3.001092  Q3
2021-01-09  0.694346  0.779812  0.110558  1.459886  Q3
2021-01-10 -0.194615  0.163406 -0.013623  0.467213  Q4
2021-01-11  1.472092 -0.137168  0.964988 -1.188975  Q4
2021-01-12  0.448518  1.317838 -0.941819  0.857074  Q4


Unnamed: 0,A,B,C,D,E
2021-01-01,1.678806,0.511775,0.232081,0.961997,Q1
2021-01-02,-0.544359,0.323428,0.406318,0.311305,Q1
2021-01-03,-0.278951,1.526653,-1.872899,1.492121,Q1
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247,Q3
2021-01-08,0.293885,0.402565,0.35757,3.001092,Q3
2021-01-09,0.694346,0.779812,0.110558,1.459886,Q3


### Setting
Setting a new column automatically align the data by the indexes

In [51]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20210102", periods = 6))
s1
#
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2021-01-01,1.678806,0.511775,0.232081,0.961997,
2021-01-02,-0.544359,0.323428,0.406318,0.311305,1.0
2021-01-03,-0.278951,1.526653,-1.872899,1.492121,2.0
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737,3.0
2021-01-05,-1.72224,0.682953,-0.632491,0.943979,4.0
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281,5.0
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247,6.0
2021-01-08,0.293885,0.402565,0.35757,3.001092,
2021-01-09,0.694346,0.779812,0.110558,1.459886,
2021-01-10,-0.194615,0.163406,-0.013623,0.467213,


In [52]:
# Setting values by label:
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2021-01-01,0.0,0.511775,0.232081,0.961997,
2021-01-02,-0.544359,0.323428,0.406318,0.311305,1.0
2021-01-03,-0.278951,1.526653,-1.872899,1.492121,2.0
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737,3.0
2021-01-05,-1.72224,0.682953,-0.632491,0.943979,4.0
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281,5.0
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247,6.0
2021-01-08,0.293885,0.402565,0.35757,3.001092,
2021-01-09,0.694346,0.779812,0.110558,1.459886,
2021-01-10,-0.194615,0.163406,-0.013623,0.467213,


In [53]:
# Setting values by position:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2021-01-01,0.0,0.0,0.232081,0.961997,
2021-01-02,-0.544359,0.323428,0.406318,0.311305,1.0
2021-01-03,-0.278951,1.526653,-1.872899,1.492121,2.0
2021-01-04,-0.567785,-0.217683,-1.82983,0.811737,3.0
2021-01-05,-1.72224,0.682953,-0.632491,0.943979,4.0
2021-01-06,0.49141,-1.57806,-0.546541,-0.721281,5.0
2021-01-07,-0.135242,-0.423931,-0.94735,-1.150247,6.0
2021-01-08,0.293885,0.402565,0.35757,3.001092,
2021-01-09,0.694346,0.779812,0.110558,1.459886,
2021-01-10,-0.194615,0.163406,-0.013623,0.467213,


In [54]:
# Setting by assigning with a Numpy array
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2021-01-01,0.0,0.0,0.232081,5,
2021-01-02,-0.544359,0.323428,0.406318,5,1.0
2021-01-03,-0.278951,1.526653,-1.872899,5,2.0
2021-01-04,-0.567785,-0.217683,-1.82983,5,3.0
2021-01-05,-1.72224,0.682953,-0.632491,5,4.0
2021-01-06,0.49141,-1.57806,-0.546541,5,5.0
2021-01-07,-0.135242,-0.423931,-0.94735,5,6.0
2021-01-08,0.293885,0.402565,0.35757,5,
2021-01-09,0.694346,0.779812,0.110558,5,
2021-01-10,-0.194615,0.163406,-0.013623,5,


In [55]:
# A where operation with setting
df2 = df.copy()
print(df2)
print()
df2[df2 > 0] = -df2
df2

                   A         B         C  D    F
2021-01-01  0.000000  0.000000  0.232081  5  NaN
2021-01-02 -0.544359  0.323428  0.406318  5  1.0
2021-01-03 -0.278951  1.526653 -1.872899  5  2.0
2021-01-04 -0.567785 -0.217683 -1.829830  5  3.0
2021-01-05 -1.722240  0.682953 -0.632491  5  4.0
2021-01-06  0.491410 -1.578060 -0.546541  5  5.0
2021-01-07 -0.135242 -0.423931 -0.947350  5  6.0
2021-01-08  0.293885  0.402565  0.357570  5  NaN
2021-01-09  0.694346  0.779812  0.110558  5  NaN
2021-01-10 -0.194615  0.163406 -0.013623  5  NaN
2021-01-11  1.472092 -0.137168  0.964988  5  NaN
2021-01-12  0.448518  1.317838 -0.941819  5  NaN



Unnamed: 0,A,B,C,D,F
2021-01-01,0.0,0.0,-0.232081,-5,
2021-01-02,-0.544359,-0.323428,-0.406318,-5,-1.0
2021-01-03,-0.278951,-1.526653,-1.872899,-5,-2.0
2021-01-04,-0.567785,-0.217683,-1.82983,-5,-3.0
2021-01-05,-1.72224,-0.682953,-0.632491,-5,-4.0
2021-01-06,-0.49141,-1.57806,-0.546541,-5,-5.0
2021-01-07,-0.135242,-0.423931,-0.94735,-5,-6.0
2021-01-08,-0.293885,-0.402565,-0.35757,-5,
2021-01-09,-0.694346,-0.779812,-0.110558,-5,
2021-01-10,-0.194615,-0.163406,-0.013623,-5,


### Assignment related operations

#### Add a new column to a dataframe

In [56]:
df2 = pd.DataFrame(\
                   {"A": 1.0,\
                    "B": pd.Timestamp('20210707'),\
                    "C": pd.Series(1, index=list(range(5)), dtype = 'float32'),\
                    "D": np.array([3] * 5, dtype = 'int32'),\
                    "E": pd.Categorical(["Test", "Train", "Test", "Test", "Train"]),\
                    "F": "Hello",
                   }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-07-07,1.0,3,Test,Hello
1,1.0,2021-07-07,1.0,3,Train,Hello
2,1.0,2021-07-07,1.0,3,Test,Hello
3,1.0,2021-07-07,1.0,3,Test,Hello
4,1.0,2021-07-07,1.0,3,Train,Hello


In [57]:
# Create values for the new column
attack = ["Normal", "DOS attack", "DOS attack", "Normal", "Buffer-overflow attack"]
df2["G"] = attack  # add the new column to the dataframe
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2021-07-07,1.0,3,Test,Hello,Normal
1,1.0,2021-07-07,1.0,3,Train,Hello,DOS attack
2,1.0,2021-07-07,1.0,3,Test,Hello,DOS attack
3,1.0,2021-07-07,1.0,3,Test,Hello,Normal
4,1.0,2021-07-07,1.0,3,Train,Hello,Buffer-overflow attack


### Compute a Z-Score
Z-Score transforms the data into a distribution with a mean of 0 and a standard deviation of 1. It is computed by subtracting each value by the mean of the corresponding features and then dividing by the standard deviation.

In [58]:
# Creating a numpy random numbers
nparr = np.random.randn(5)         # create a random numpy array
s = pd.Series(nparr)               # create a panda series using random numpy array
print(s)
s = (s - np.mean(s)) / np.std(s)
print(s)
print('mean: ', np.mean(s))
print('std: ', np.std(s))

0   -2.353993
1   -0.558280
2    1.795839
3   -0.948879
4    2.016260
dtype: float64
0   -1.398761
1   -0.327269
2    1.077421
3   -0.560337
4    1.208945
dtype: float64
mean:  -4.4408920985006264e-17
std:  1.0


#### Using stats method of scipy

In [59]:
from scipy import stats
s = pd.Series(nparr)
print(s)
s = stats.zscore(s)
print('mean: ', np.mean(s))
print('std: ', np.std(s))

0   -2.353993
1   -0.558280
2    1.795839
3   -0.948879
4    2.016260
dtype: float64
mean:  -4.4408920985006264e-17
std:  1.0


### Discretise
To convert (a set of values) into an equivalent discrete space, often for the purposes of easier processing.

In [60]:
# Using pandas dataframe access
df2['G'] = ["Normal", "DOS attack", "DOS attack", "Normal", "Buffer-overflow attack"]
print(df2)
print()
print('Before discretization: ')
print(df2['G'])
df2.loc[df2['G'] != 'Normal', 'G'] = -1       # if content is other than 'Normal' set to -1
df2.loc[df2['G'] == 'Normal', 'G'] = 1      # if content is 'Normal' set to 1
print()
print('After discretization: ')
print(df2['G'])

     A          B    C  D      E      F                       G
0  1.0 2021-07-07  1.0  3   Test  Hello                  Normal
1  1.0 2021-07-07  1.0  3  Train  Hello              DOS attack
2  1.0 2021-07-07  1.0  3   Test  Hello              DOS attack
3  1.0 2021-07-07  1.0  3   Test  Hello                  Normal
4  1.0 2021-07-07  1.0  3  Train  Hello  Buffer-overflow attack

Before discretization: 
0                    Normal
1                DOS attack
2                DOS attack
3                    Normal
4    Buffer-overflow attack
Name: G, dtype: object

After discretization: 
0     1
1    -1
2    -1
3     1
4    -1
Name: G, dtype: object


In [61]:
#### Using Python replace method
df2['G'] = ["Normal", "DOS attack", "DOS attack", "Normal", "Buffer-overflow attack"]
print(df2)
print()
print('Before discretization: ')
print(df2['G'])
temp = {}
for val in df2['G'].unique():
    if val == 'Normal':
        temp[val] = 1
    else:
        temp[val] = -1
df2['G'].replace(temp, inplace = True)    # replace column 'G' with temp
print()
print('After discretization: ')
print(df2['G'])

     A          B    C  D      E      F                       G
0  1.0 2021-07-07  1.0  3   Test  Hello                  Normal
1  1.0 2021-07-07  1.0  3  Train  Hello              DOS attack
2  1.0 2021-07-07  1.0  3   Test  Hello              DOS attack
3  1.0 2021-07-07  1.0  3   Test  Hello                  Normal
4  1.0 2021-07-07  1.0  3  Train  Hello  Buffer-overflow attack

Before discretization: 
0                    Normal
1                DOS attack
2                DOS attack
3                    Normal
4    Buffer-overflow attack
Name: G, dtype: object

After discretization: 
0    1
1   -1
2   -1
3    1
4   -1
Name: G, dtype: int64


### One-hot-encoding
one-hot-encoding is a process in the data processing that is applied to categorical data. to convert it into a binary vector representation for use in machine learning algorithm.

In [62]:
df2['E'] = ['Test','Train','Test','Test','Train']
print(df2)
print()
ohe = pd.get_dummies(df2['E'], prefix = 'Category')
print(ohe)
# add the one-hot-encoding (ohe) to the dataframe (df2) using pandas concate method
df2 =  pd.concat([df2, ohe], axis = 1)
df2

     A          B    C  D      E      F  G
0  1.0 2021-07-07  1.0  3   Test  Hello  1
1  1.0 2021-07-07  1.0  3  Train  Hello -1
2  1.0 2021-07-07  1.0  3   Test  Hello -1
3  1.0 2021-07-07  1.0  3   Test  Hello  1
4  1.0 2021-07-07  1.0  3  Train  Hello -1

   Category_Test  Category_Train
0              1               0
1              0               1
2              1               0
3              1               0
4              0               1


Unnamed: 0,A,B,C,D,E,F,G,Category_Test,Category_Train
0,1.0,2021-07-07,1.0,3,Test,Hello,1,1,0
1,1.0,2021-07-07,1.0,3,Train,Hello,-1,0,1
2,1.0,2021-07-07,1.0,3,Test,Hello,-1,1,0
3,1.0,2021-07-07,1.0,3,Test,Hello,1,1,0
4,1.0,2021-07-07,1.0,3,Train,Hello,-1,0,1


In [63]:
print(df2)
print()
ohe = pd.get_dummies(df2['E'], prefix = 'Category')
print(ohe)
print('ohe[:,1]', ohe['Category_Test'])
print('ohe[:,0]', ohe['Category_Train'])
# using DataFrame.insert() to add a column in a specific position
df2.insert(5, "Category_Test", ohe['Category_Test'], True)
df2.insert(6, "Category_Train", ohe['Category_Train'], True)
df2

     A          B    C  D      E      F  G  Category_Test  Category_Train
0  1.0 2021-07-07  1.0  3   Test  Hello  1              1               0
1  1.0 2021-07-07  1.0  3  Train  Hello -1              0               1
2  1.0 2021-07-07  1.0  3   Test  Hello -1              1               0
3  1.0 2021-07-07  1.0  3   Test  Hello  1              1               0
4  1.0 2021-07-07  1.0  3  Train  Hello -1              0               1

   Category_Test  Category_Train
0              1               0
1              0               1
2              1               0
3              1               0
4              0               1
ohe[:,1] 0    1
1    0
2    1
3    1
4    0
Name: Category_Test, dtype: uint8
ohe[:,0] 0    0
1    1
2    0
3    0
4    1
Name: Category_Train, dtype: uint8


Unnamed: 0,A,B,C,D,E,Category_Test,Category_Train,F,G,Category_Test.1,Category_Train.1
0,1.0,2021-07-07,1.0,3,Test,1,0,Hello,1,1,0
1,1.0,2021-07-07,1.0,3,Train,0,1,Hello,-1,0,1
2,1.0,2021-07-07,1.0,3,Test,1,0,Hello,-1,1,0
3,1.0,2021-07-07,1.0,3,Test,1,0,Hello,1,1,0
4,1.0,2021-07-07,1.0,3,Train,0,1,Hello,-1,0,1


### Binning
#### Binning, also called discretization, is a technique for reducing the cardinality of continuous and discrete data. Binning groups related values together in bins to reduce the number of distinct values. Binning can improve model quality by strengthening the relationship between attributes.

In [64]:
# The size of bins are determined such that equal
# data sample are divided into equal sized bins.
# This is done using Quantile. For example, if 
# there are 5 bins, the dataset is divided into 
# five quatiles. Each quatile containing 20% of  
# total area of distribution.
#
# For example, 
df3 = pd.DataFrame(\
                   {"A": [181, 239, 235, 219, 217, 123, 260, 100, 300, 168, 149, 186],\
                    "B": pd.Timestamp('20210707'),\
                    "C": pd.Series(1, index=list(range(12)), dtype = 'float32'),\
                    "D": np.array(np.random.randn(12)),\
                    "E": pd.Categorical(["Test", "Train", "Test", "Test", "Train", "Train",\
                                         "Train", "Test", "Train", "Train", "Test", "Test"]),\
                    "F": "Hello",
                   }
)
print(df3)
# Defining a function  binning for reusability
def binning(data, numOfBin, col):
    data[col] = pd.cut(data[col], numOfBin, labels=range(1,numOfBin+1))
    return data
#
binning(df3, 5, 'A')
print()
print(df3)
#
# Binning directly to a specific column
numOfBin = 4
df3['D'] = pd.cut(df3['D'], numOfBin,labels = range(1,numOfBin+1))
print()
print(df3)

      A          B    C         D      E      F
0   181 2021-07-07  1.0 -0.103666   Test  Hello
1   239 2021-07-07  1.0 -0.197135  Train  Hello
2   235 2021-07-07  1.0  1.166115   Test  Hello
3   219 2021-07-07  1.0  2.018895   Test  Hello
4   217 2021-07-07  1.0 -1.503303  Train  Hello
5   123 2021-07-07  1.0  0.098324  Train  Hello
6   260 2021-07-07  1.0 -0.888992  Train  Hello
7   100 2021-07-07  1.0  1.853150   Test  Hello
8   300 2021-07-07  1.0  2.175005  Train  Hello
9   168 2021-07-07  1.0 -1.141212  Train  Hello
10  149 2021-07-07  1.0  2.238630   Test  Hello
11  186 2021-07-07  1.0 -0.321155   Test  Hello

    A          B    C         D      E      F
0   3 2021-07-07  1.0 -0.103666   Test  Hello
1   4 2021-07-07  1.0 -0.197135  Train  Hello
2   4 2021-07-07  1.0  1.166115   Test  Hello
3   3 2021-07-07  1.0  2.018895   Test  Hello
4   3 2021-07-07  1.0 -1.503303  Train  Hello
5   1 2021-07-07  1.0  0.098324  Train  Hello
6   4 2021-07-07  1.0 -0.888992  Train  Hello
7   1 2