# Drop missing rows

In [5]:
import pandas as pd
# Sample dataset with missing values
data = {
    'Feature1': [1, 2, 3, None, 5, 6],
    'Feature2': [7, None, 9, 10, 11, 12],
    'Feature3': ['A', 'B', None, 'C', 'D', 'E'],
}
df = pd.DataFrame(data)
print("Initial dataset:")
print(df)
# Method 1: Removal of Rows with Missing Values
# This method removes rows containing any missing values from the dataset.
df_removed = df.dropna()
print("Dataset after removing rows with missing values:")
print(df_removed)
# Method 2: Listwise Deletion (Removal of Rows with Any Missing Values in Specific Columns)
# In this method, we remove rows with missing values in specific columns.
# Specify the columns where missing values should be considered for removal
columns_to_check = ['Feature1', 'Feature2']
df_listwise_deleted = df.dropna(subset=columns_to_check)
print("\nDataset after listwise deletion based on specific columns:")
print(df_listwise_deleted)


Initial dataset:
   Feature1  Feature2 Feature3
0       1.0       7.0        A
1       2.0       NaN        B
2       3.0       9.0     None
3       NaN      10.0        C
4       5.0      11.0        D
5       6.0      12.0        E
Dataset after removing rows with missing values:
   Feature1  Feature2 Feature3
0       1.0       7.0        A
4       5.0      11.0        D
5       6.0      12.0        E

Dataset after listwise deletion based on specific columns:
   Feature1  Feature2 Feature3
0       1.0       7.0        A
2       3.0       9.0     None
4       5.0      11.0        D
5       6.0      12.0        E


# Fill with mean

In [24]:
import pandas as pd
import numpy as np

# Create a sample dataset with missing values
data = {
    'Feature1': [1, 2, None, 4, 5, 6],
    'Feature2': [7, None, 9, 10, None, 12],
    'Feature3': ['A', 'B', 'C', None, 'D', 'E'],
}

df = pd.DataFrame(data)
print(df)
# Method 1: Mean Imputation
mean_imputed_df = df.copy()
mean_imputed_df['Feature1'].fillna(df['Feature1'].mean(), inplace=True)
mean_imputed_df['Feature2'].fillna(df['Feature2'].mean(), inplace=True)

print("Dataset after mean imputation:")
print(mean_imputed_df)

# Method 2: Median Imputation
median_imputed_df = df.copy()
median_imputed_df['Feature1'].fillna(df['Feature1'].median(), inplace=True)
median_imputed_df['Feature2'].fillna(df['Feature2'].median(), inplace=True)

print("\nDataset after median imputation:")
print(median_imputed_df)

# Method 3: Mode Imputation (for categorical data)
mode_imputed_df = df.copy()
mode_imputed_df['Feature3'].fillna(df['Feature3'].mode(), inplace=True)

print("\nDataset after mode imputation:")
print(mode_imputed_df)


   Feature1  Feature2 Feature3
0       1.0       7.0        A
1       2.0       NaN        B
2       NaN       9.0        C
3       4.0      10.0     None
4       5.0       NaN        D
5       6.0      12.0        E
Dataset after mean imputation:
   Feature1  Feature2 Feature3
0       1.0       7.0        A
1       2.0       9.5        B
2       3.6       9.0        C
3       4.0      10.0     None
4       5.0       9.5        D
5       6.0      12.0        E

Dataset after median imputation:
   Feature1  Feature2 Feature3
0       1.0       7.0        A
1       2.0       9.5        B
2       4.0       9.0        C
3       4.0      10.0     None
4       5.0       9.5        D
5       6.0      12.0        E

Dataset after mode imputation:
   Feature1  Feature2 Feature3
0       1.0       7.0        A
1       2.0       NaN        B
2       NaN       9.0        C
3       4.0      10.0        D
4       5.0       NaN        D
5       6.0      12.0        E


# Forward fill and backward fill in time-series data

In [25]:
import pandas as pd

# Sample time series data with missing values
date_time = pd.Series(['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04', '2023-09-05', '2023-09-06'])

# Convert 'Date' column to datetime type
df['date'] = pd.to_datetime(date_time)
# Backward fill missing values in 'Value' column
df['Feature2'].fillna(method='bfill', inplace=True)

print(df)


   Feature1  Feature2 Feature3       date
0       1.0       7.0        A 2023-09-01
1       2.0       9.0        B 2023-09-02
2       NaN       9.0        C 2023-09-03
3       4.0      10.0     None 2023-09-04
4       5.0      12.0        D 2023-09-05
5       6.0      12.0        E 2023-09-06


In [19]:
df.to_dict()

{'Feature1': {0: 1.0, 1: 2.0, 2: nan, 3: 4.0, 4: 5.0, 5: 6.0},
 'Feature2': {0: 7.0, 1: 9.0, 2: 9.0, 3: 10.0, 4: 12.0, 5: 12.0},
 'Feature3': {0: 'A', 1: 'B', 2: 'C', 3: None, 4: 'D', 5: 'E'},
 'date': {0: Timestamp('2023-09-01 00:00:00'),
  1: Timestamp('2023-09-02 00:00:00'),
  2: Timestamp('2023-09-03 00:00:00'),
  3: Timestamp('2023-09-04 00:00:00'),
  4: Timestamp('2023-09-05 00:00:00'),
  5: Timestamp('2023-09-06 00:00:00')}}

# IMpute missing values in categorical data

In [26]:
import pandas as pd

# Sample DataFrame with a categorical column containing missing values
# data = {'Category': ['A', 'B', 'A', 'C', None, 'B', 'A', None, 'C', 'A']}
# df = pd.DataFrame(data)

# Impute missing values in the 'Category' column with the most frequent value
# most_frequent_category = df['Feature'].mode()[0]
# df['Category'].fillna(most_frequent_category, inplace=True)
df['Feature3'].fillna('C', inplace=True)

print(df)


   Feature1  Feature2 Feature3       date
0       1.0       7.0        A 2023-09-01
1       2.0       9.0        B 2023-09-02
2       NaN       9.0        C 2023-09-03
3       4.0      10.0        C 2023-09-04
4       5.0      12.0        D 2023-09-05
5       6.0      12.0        E 2023-09-06


In [44]:
import pandas as pd
import numpy as np
from datetime import datetime

# Create a sample dataset
data = {
    'date': [datetime(2023, 9, 1), datetime(2023, 9, 2), datetime(2023, 9, 3), datetime(2023, 9, 4), None, datetime(2023, 9, 6), datetime(2023, 9, 7)],
    'Feature1': [1, 2, None, 4, None, 6, None],
    'Categorical_feature': ['A', 'B', 'C', 'A', None, 'C', None]
}

df = pd.DataFrame(data)

# 1. Remove rows with all missing values
df_cleaned = df.dropna(how='all')

print(df)
print(df_cleaned)
# 2. Fill missing values in 'Feature1' column with the mean
# mean_feature1 = df_cleaned['Feature1'].mean()
# df_cleaned['Feature1'].fillna(mean_feature1, inplace=True)
# print('mean', df_cleaned)

# 3. Forward fill missing values in 'date' column
df_cleaned['Feature1'].fillna(method='ffill', inplace=True)
print('ffill', df_cleaned)

# 4. Fill missing values in 'Categorical_feature' with a specific value (e.g., 'Unknown')
df_cleaned['Categorical_feature'].fillna('Unknown', inplace=True)
print('categorical', df_cleaned)

# Display the cleaned DataFrame
print(df_cleaned)


        date  Feature1 Categorical_feature
0 2023-09-01       1.0                   A
1 2023-09-02       2.0                   B
2 2023-09-03       NaN                   C
3 2023-09-04       4.0                   A
4        NaT       NaN                None
5 2023-09-06       6.0                   C
6 2023-09-07       NaN                None
        date  Feature1 Categorical_feature
0 2023-09-01       1.0                   A
1 2023-09-02       2.0                   B
2 2023-09-03       NaN                   C
3 2023-09-04       4.0                   A
5 2023-09-06       6.0                   C
6 2023-09-07       NaN                None
ffill         date  Feature1 Categorical_feature
0 2023-09-01       1.0                   A
1 2023-09-02       2.0                   B
2 2023-09-03       2.0                   C
3 2023-09-04       4.0                   A
5 2023-09-06       6.0                   C
6 2023-09-07       6.0                None
categorical         date  Feature1 Categorical_f

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Feature1'].fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Categorical_feature'].fillna('Unknown', inplace=True)


In [37]:
df

Unnamed: 0,date,Feature1,Categorical_feature
0,2023-09-01,1.0,A
1,2023-09-02,2.0,B
2,2023-09-03,,C
3,NaT,4.0,A
4,NaT,,
5,2023-09-06,6.0,C
6,2023-09-07,,
