### Data Cleaning

##### Drop Duplicate Rows

In [4]:
import pandas as pd

In [68]:
# Build a small demo frame. Rows 1/2 and rows 4/5 are exact copies of each
# other, which gives the de-duplication examples something to remove.
data = {
    'col1': [1, 2, 2, 3, 4, 4, 5],
    'col2': ['A', 'B', 'B', 'C', 'E', 'E', 'F'],
    'col3': ['X', 'Y', 'Y', 'Z', 'S', 'S', 'V'],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
   col1 col2 col3
0     1    A    X
1     2    B    Y
2     2    B    Y
3     3    C    Z
4     4    E    S
5     4    E    S
6     5    F    V


In [70]:
# Remove rows that repeat an earlier row in every column.
# The first occurrence of each row is retained, and a new DataFrame is
# returned; df itself is not modified by this call.
df_cleaned = df.drop_duplicates()
print("\nDataFrame after dropping duplicates (all columns):", df_cleaned, sep="\n")


DataFrame after dropping duplicates (all columns):
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [14]:
# De-duplicate using 'col1' alone as the key: a row is dropped when its
# 'col1' value already appeared in an earlier row, whatever the other
# columns contain.
df_col1_unique = df.drop_duplicates(subset=['col1'])
print("\nDataFrame after dropping duplicates (based on 'col1'):", df_col1_unique, sep="\n")


DataFrame after dropping duplicates (based on 'col1'):
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [72]:
# De-duplicate on the ('col1', 'col2') pair: a row is dropped only when both
# values match an earlier row; 'col3' is not part of the comparison.
df_col1_col2_unique = df.drop_duplicates(subset=['col1', 'col2'])
print("\nDataFrame after dropping duplicates (based on 'col1' and 'col2'):", df_col1_col2_unique, sep="\n")


DataFrame after dropping duplicates (based on 'col1' and 'col2'):
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [74]:
# keep='last' retains the final occurrence of each duplicated row, so the
# surviving index labels come from the later copies rather than the first.
df_keep_last = df.drop_duplicates(keep='last')
print("\nDataFrame after dropping duplicates (keep last):", df_keep_last, sep="\n")


DataFrame after dropping duplicates (keep last):
   col1 col2 col3
0     1    A    X
2     2    B    Y
3     3    C    Z
5     4    E    S
6     5    F    V


In [76]:
# keep=False discards every row that has a duplicate anywhere in the frame,
# so only rows that occur exactly once remain.
# This call returns a new DataFrame and leaves df unchanged; it is
# inplace=True (not keep=False) that modifies the source and returns None.
df_drop_all = df.drop_duplicates(keep=False)
print("\nDataFrame after dropping duplicates (drop all occurrences):", df_drop_all, sep="\n")


DataFrame after dropping duplicates (drop all occurrences):
   col1 col2 col3
0     1    A    X
3     3    C    Z
6     5    F    V


In [78]:
# Demonstrate inplace=True: df itself is modified and the call returns None,
# so there is no result to assign.
# NOTE: df stays deduplicated for every cell executed after this one.
print("\nOriginal DataFrame before inplace drop:", df, sep="\n")
df.drop_duplicates(inplace=True)
print("\nDataFrame after inplace drop:", df, sep="\n")


Original DataFrame before inplace drop:
   col1 col2 col3
0     1    A    X
1     2    B    Y
2     2    B    Y
3     3    C    Z
4     4    E    S
5     4    E    S
6     5    F    V

DataFrame after inplace drop:
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [80]:
# Bare expression so the notebook renders df; it confirms the in-place drop
# above changed df itself (the duplicate rows at index 2 and 5 are gone).
df

Unnamed: 0,col1,col2,col3
0,1,A,X
1,2,B,Y
3,3,C,Z
4,4,E,S
6,5,F,V


In [84]:
# NOTE: df was already deduplicated in place by an earlier cell, so every flag
# printed here is False and the filtered frame below is empty. To see real
# matches, run these checks on a frame that still contains the duplicates.
print("\nBoolean Series indicating duplicate rows:")
print(df.duplicated())
print('-'*30)
# Select the rows flagged as duplicates in the current df.
print("\nActual duplicate rows:")
print(df[df.duplicated(keep=False)]) # keep=False flags every member of a duplicate group, not only the repeats


Boolean Series indicating duplicate rows:
0    False
1    False
3    False
4    False
6    False
dtype: bool
------------------------------

Actual duplicate rows:
Empty DataFrame
Columns: [col1, col2, col3]
Index: []
