In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame(
    [
        [1, 1.0, 0, 1],
        [0, 0.5, None, 2],
        [1, 0.2, None, 3],
        [1, 3.3, 0, 4],
        [0, 5.7, 0, "cat"],
        [1, 0.0, None, 6],
        [0, 1.9, 0, 7],
        [1, 2.4, 0, "dog"],
        [None, None, None, 9],
    ]
)

In [3]:
df

Unnamed: 0,0,1,2,3
0,1.0,1.0,0.0,1
1,0.0,0.5,,2
2,1.0,0.2,,3
3,1.0,3.3,0.0,4
4,0.0,5.7,0.0,cat
5,1.0,0.0,,6
6,0.0,1.9,0.0,7
7,1.0,2.4,0.0,dog
8,,,,9


In [4]:
df.dropna()

Unnamed: 0,0,1,2,3
0,1.0,1.0,0.0,1
3,1.0,3.3,0.0,4
4,0.0,5.7,0.0,cat
6,0.0,1.9,0.0,7
7,1.0,2.4,0.0,dog


## Feature Engineering

In [5]:
df = pd.DataFrame(
    [
        ["cat", 1.0, "3-2021"],
        ["cat", 0.5, "1-2021"],
        ["dog", 0.2, "5-2021"],
        ["bird", 3.3, "3-2021"],
        ["dog", 5.7, "1-2021"],
        ["dog", 0.0, "2-2021"],
        ["cat", 1.9, "4-2021"],
        ["bird", 2.4, "4-2021"],
        ["bird", 2.4, "5-2021"]
    ],
    columns=["animal", "value", "date"]
)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   animal  9 non-null      object 
 1   value   9 non-null      float64
 2   date    9 non-null      object 
dtypes: float64(1), object(2)
memory usage: 348.0+ bytes


## Changing Data Types

In [9]:
df.loc[:, "animal"] = df['animal'].astype('category')

In [12]:
df.loc[:, "animal"]

0     cat
1     cat
2     dog
3    bird
4     dog
5     dog
6     cat
7    bird
8    bird
Name: animal, dtype: object

In [13]:
df.loc[:, "animal"] = df['animal'].astype('category')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   animal  9 non-null      object 
 1   value   9 non-null      float64
 2   date    9 non-null      object 
dtypes: float64(1), object(2)
memory usage: 348.0+ bytes


## Normalizing Data

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()

In [18]:
scaler.fit(df[['value']])

In [19]:
scaler.transform(df[['value']])

array([[-0.54744332],
       [-0.84071653],
       [-1.01668045],
       [ 0.80161343],
       [ 2.20932483],
       [-1.13398974],
       [-0.01955155],
       [ 0.27372166],
       [ 0.27372166]])

## Parsing Date types

In [20]:
pd.to_datetime(df.loc[:, 'date'])

  pd.to_datetime(df.loc[:, 'date'])


0   2021-03-01
1   2021-01-01
2   2021-05-01
3   2021-03-01
4   2021-01-01
5   2021-02-01
6   2021-04-01
7   2021-04-01
8   2021-05-01
Name: date, dtype: datetime64[ns]

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   animal  9 non-null      object 
 1   value   9 non-null      float64
 2   date    9 non-null      object 
dtypes: float64(1), object(2)
memory usage: 348.0+ bytes


## One-Hot Encoding

In [22]:
pd.get_dummies(df.animal, prefix='animal')

Unnamed: 0,animal_bird,animal_cat,animal_dog
0,False,True,False
1,False,True,False
2,False,False,True
3,True,False,False
4,False,False,True
5,False,False,True
6,False,True,False
7,True,False,False
8,True,False,False
