In [1]:
# Import the pandas and the numpy
import pandas as pd
import numpy as np

### Data Structures
- Series - a one-dimensional labelled array
- DataFrame - a two-dimensional labelled data structure (rows and columns)

In [2]:
# Series: a one-dimensional labelled array built from a plain list.
# np.nan stands in for a missing value at position 3.
s = pd.Series(data=[1, 2, 3, np.nan, 5, 6])

In [3]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
dtype: float64

In [5]:
# Creating DataFrames
# Build a six-day DatetimeIndex with pd.date_range() and use it as the row
# index of a 6x4 DataFrame of standard-normal random values (columns A-D).
# The generator is seeded so that Restart & Run All reproduces the same numbers.
rng = np.random.default_rng(42)
dates = pd.date_range('20230101', periods=6)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))

In [6]:
# Display the DataFrame (six dated rows, columns A-D)
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.165206,0.631793,0.429803,1.243914
2023-01-02,-1.929444,-1.288864,0.809636,1.122319
2023-01-03,1.180071,0.27164,-1.006207,0.110209
2023-01-04,0.627476,0.158809,0.220677,-0.060632
2023-01-05,0.324789,0.16043,0.295137,-0.616793
2023-01-06,1.331774,-0.872504,-0.129454,1.337055


## Viewing Data

In [7]:
# View the first rows of the data with .head()
# With no argument, the first 5 rows are shown
df.head()

Unnamed: 0,A,B,C,D
2023-01-01,-0.165206,0.631793,0.429803,1.243914
2023-01-02,-1.929444,-1.288864,0.809636,1.122319
2023-01-03,1.180071,0.27164,-1.006207,0.110209
2023-01-04,0.627476,0.158809,0.220677,-0.060632
2023-01-05,0.324789,0.16043,0.295137,-0.616793


In [8]:
# Pass a number to .head() to choose how many rows to show (here the first 3)
df.head(3)

Unnamed: 0,A,B,C,D
2023-01-01,-0.165206,0.631793,0.429803,1.243914
2023-01-02,-1.929444,-1.288864,0.809636,1.122319
2023-01-03,1.180071,0.27164,-1.006207,0.110209


In [9]:
# Display the last rows of the data with .tail()
# With no argument, the last 5 rows are shown

df.tail()

Unnamed: 0,A,B,C,D
2023-01-02,-1.929444,-1.288864,0.809636,1.122319
2023-01-03,1.180071,0.27164,-1.006207,0.110209
2023-01-04,0.627476,0.158809,0.220677,-0.060632
2023-01-05,0.324789,0.16043,0.295137,-0.616793
2023-01-06,1.331774,-0.872504,-0.129454,1.337055


In [10]:
# Pass a number to .tail() to choose how many rows to show (here the last 2)
df.tail(2)

Unnamed: 0,A,B,C,D
2023-01-05,0.324789,0.16043,0.295137,-0.616793
2023-01-06,1.331774,-0.872504,-0.129454,1.337055


### Displaying Index, Columns and Values

In [11]:
# Display the row index (the dates passed as index= when df was created)
df.index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Display the column labels of the DataFrame
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
# Display the values as a NumPy array.
# .to_numpy() is the accessor the pandas documentation recommends over .values.
df.to_numpy()

array([[-0.1652061 ,  0.63179338,  0.42980333,  1.24391387],
       [-1.92944363, -1.28886357,  0.80963569,  1.12231944],
       [ 1.18007061,  0.27163953, -1.00620724,  0.11020886],
       [ 0.62747584,  0.1588094 ,  0.22067706, -0.06063172],
       [ 0.32478864,  0.16043014,  0.29513716, -0.61679348],
       [ 1.33177382, -0.87250352, -0.12945365,  1.33705535]])

### Describe
- `describe()` gives summary statistics (count, mean, std, min, quartiles, max) for each column


In [14]:
# Summary statistics per column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.228243,-0.156449,0.103265,0.522679
std,1.191879,0.748262,0.623101,0.818748
min,-1.929444,-1.288864,-1.006207,-0.616793
25%,-0.042707,-0.614675,-0.041921,-0.017922
50%,0.476132,0.15962,0.257907,0.616264
75%,1.041922,0.243837,0.396137,1.213515
max,1.331774,0.631793,0.809636,1.337055


### Getting information about your DataFrame

In [15]:
# Print a concise summary: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2023-01-01 to 2023-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


## Selecting Data

### Selecting a single column

In [16]:
df['A']  # Select column A; a single column is returned as a Series

2023-01-01   -0.165206
2023-01-02   -1.929444
2023-01-03    1.180071
2023-01-04    0.627476
2023-01-05    0.324789
2023-01-06    1.331774
Freq: D, Name: A, dtype: float64

In [17]:
# Select column B (same single-bracket syntax as for column A)
df['B']

2023-01-01    0.631793
2023-01-02   -1.288864
2023-01-03    0.271640
2023-01-04    0.158809
2023-01-05    0.160430
2023-01-06   -0.872504
Freq: D, Name: B, dtype: float64

### Selecting Multiple Columns

In [18]:
# Select columns B and C by passing a list of labels; the result is a DataFrame
df[['B', 'C']]

Unnamed: 0,B,C
2023-01-01,0.631793,0.429803
2023-01-02,-1.288864,0.809636
2023-01-03,0.27164,-1.006207
2023-01-04,0.158809,0.220677
2023-01-05,0.16043,0.295137
2023-01-06,-0.872504,-0.129454


### Selecting Rows by Label

In [20]:
# Select the row labelled 2023-01-03 with .loc (label-based selection)
df.loc['20230103']

A    1.180071
B    0.271640
C   -1.006207
D    0.110209
Name: 2023-01-03 00:00:00, dtype: float64

### Selecting by Position

In [21]:
# Select the 4th row with .iloc (positions start at 0, so the 4th row is position 3)
df.iloc[3]

A    0.627476
B    0.158809
C    0.220677
D   -0.060632
Name: 2023-01-04 00:00:00, dtype: float64

In [22]:
# Select the 2nd-3rd rows and the 1st-2nd columns by position
# (positions start at 0 and a slice excludes its end position)
df.iloc[1:3, 0:2]

Unnamed: 0,A,B
2023-01-02,-1.929444,-1.288864
2023-01-03,1.180071,0.27164


# Data Manipulation

In [23]:
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.165206,0.631793,0.429803,1.243914
2023-01-02,-1.929444,-1.288864,0.809636,1.122319
2023-01-03,1.180071,0.27164,-1.006207,0.110209
2023-01-04,0.627476,0.158809,0.220677,-0.060632
2023-01-05,0.324789,0.16043,0.295137,-0.616793
2023-01-06,1.331774,-0.872504,-0.129454,1.337055


### Assigning new columns to our data

In [24]:
# Add a new column E holding the element-wise sum of columns A and B
df['E'] = df['A'] + df['B']

In [25]:
df

Unnamed: 0,A,B,C,D,E
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927


In [26]:
# Add a new column F from a list of values (one value per row, in row order)
df['F'] = [1,2,3,4,5,6]

In [27]:
df

Unnamed: 0,A,B,C,D,E,F
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6


### Dropping columns

In [28]:
# Drop column C; drop() returns a new DataFrame and leaves df itself unchanged
df.drop(columns=['C'])

Unnamed: 0,A,B,D,E,F
2023-01-01,-0.165206,0.631793,1.243914,0.466587,1
2023-01-02,-1.929444,-1.288864,1.122319,-3.218307,2
2023-01-03,1.180071,0.27164,0.110209,1.45171,3
2023-01-04,0.627476,0.158809,-0.060632,0.786285,4
2023-01-05,0.324789,0.16043,-0.616793,0.485219,5
2023-01-06,1.331774,-0.872504,1.337055,0.45927,6


In [30]:
# NOTE(review): repeats the previous cell; df was not modified, so the result is the same
df.drop(columns=['C'])

Unnamed: 0,A,B,D,E,F
2023-01-01,-0.165206,0.631793,1.243914,0.466587,1
2023-01-02,-1.929444,-1.288864,1.122319,-3.218307,2
2023-01-03,1.180071,0.27164,0.110209,1.45171,3
2023-01-04,0.627476,0.158809,-0.060632,0.786285,4
2023-01-05,0.324789,0.16043,-0.616793,0.485219,5
2023-01-06,1.331774,-0.872504,1.337055,0.45927,6


In [31]:
# df still contains column C because the drop() result was not assigned back
df

Unnamed: 0,A,B,C,D,E,F
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6


In [32]:
# Filter rows with a boolean mask:
# keep only the rows whose value in column A is greater than 0
df.loc[df['A'] > 0]

Unnamed: 0,A,B,C,D,E,F
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6


## Missing Data

In [33]:
# Detect missing values: True marks a missing entry
df.isnull()

Unnamed: 0,A,B,C,D,E,F
2023-01-01,False,False,False,False,False,False
2023-01-02,False,False,False,False,False,False
2023-01-03,False,False,False,False,False,False
2023-01-04,False,False,False,False,False,False
2023-01-05,False,False,False,False,False,False
2023-01-06,False,False,False,False,False,False


In [34]:
# isna() gives the same result as isnull() on this frame
df.isna()

Unnamed: 0,A,B,C,D,E,F
2023-01-01,False,False,False,False,False,False
2023-01-02,False,False,False,False,False,False
2023-01-03,False,False,False,False,False,False
2023-01-04,False,False,False,False,False,False
2023-01-05,False,False,False,False,False,False
2023-01-06,False,False,False,False,False,False


In [35]:
# Add a column G containing missing values (np.nan) to demonstrate the missing-data tools
df['G'] = [4, np.nan, 5, np.nan, np.nan, 9]

In [36]:
df

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1,4.0
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2,
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3,5.0
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4,
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5,
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6,9.0


In [37]:
# The missing entries in column G now show up as True
df.isnull()

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,False,False,False,False,False,False,False
2023-01-02,False,False,False,False,False,False,True
2023-01-03,False,False,False,False,False,False,False
2023-01-04,False,False,False,False,False,False,True
2023-01-05,False,False,False,False,False,False,True
2023-01-06,False,False,False,False,False,False,False


In [38]:
# Same check with isna()
df.isna()

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,False,False,False,False,False,False,False
2023-01-02,False,False,False,False,False,False,True
2023-01-03,False,False,False,False,False,False,False
2023-01-04,False,False,False,False,False,False,True
2023-01-05,False,False,False,False,False,False,True
2023-01-06,False,False,False,False,False,False,False


In [39]:
# Fill missing values with fillna()
# Here every missing value is replaced with 10; the result is a new DataFrame
df.fillna(value=10)

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1,4.0
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2,10.0
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3,5.0
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4,10.0
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5,10.0
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6,9.0


In [40]:
# df itself still has the missing values; the fillna() result was not assigned back
df

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1,4.0
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2,
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3,5.0
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4,
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5,
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6,9.0


In [41]:
# Drop every row that contains a missing value (returns a new DataFrame)
df.dropna()

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1,4.0
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3,5.0
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6,9.0


In [42]:
# df is unchanged: the dropna() result was not assigned back
df

Unnamed: 0,A,B,C,D,E,F,G
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1,4.0
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2,
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3,5.0
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4,
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5,
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6,9.0


In [44]:
# Remove the helper column G again and keep the result by assigning it back to df.
# errors='ignore' keeps the cell safe to re-run: without it, a second run would
# raise a KeyError because G has already been dropped.
df = df.drop(columns=['G'], errors='ignore')

In [45]:
df

Unnamed: 0,A,B,C,D,E,F
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1
2023-01-02,-1.929444,-1.288864,0.809636,1.122319,-3.218307,2
2023-01-03,1.180071,0.27164,-1.006207,0.110209,1.45171,3
2023-01-04,0.627476,0.158809,0.220677,-0.060632,0.786285,4
2023-01-05,0.324789,0.16043,0.295137,-0.616793,0.485219,5
2023-01-06,1.331774,-0.872504,-0.129454,1.337055,0.45927,6


# Operations

In [46]:
# Find the mean of each column
df.mean()

A    0.228243
B   -0.156449
C    0.103265
D    0.522679
E    0.071794
F    3.500000
dtype: float64

In [47]:
# Applying functions: apply() calls the given function on each column.
# np.cumsum yields the running total down each column
# (pandas also offers a built-in df.cumsum() for this particular case).
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,E,F
2023-01-01,-0.165206,0.631793,0.429803,1.243914,0.466587,1
2023-01-02,-2.09465,-0.65707,1.239439,2.366233,-2.75172,3
2023-01-03,-0.914579,-0.385431,0.233232,2.476442,-1.30001,6
2023-01-04,-0.287103,-0.226621,0.453909,2.41581,-0.513725,10
2023-01-05,0.037685,-0.066191,0.749046,1.799017,-0.028506,15
2023-01-06,1.369459,-0.938695,0.619592,3.136072,0.430765,21


In [48]:
# Range of each column: apply() passes each column to the lambda as x,
# which returns that column's maximum minus its minimum
df.apply(lambda x: x.max() - x.min())

A    3.261217
B    1.920657
C    1.815843
D    1.953849
E    4.670017
F    5.000000
dtype: float64