### Data Cleaning

##### Drop Duplicate Rows

In [147]:
import pandas as pd

In [68]:
# Sample DataFrame
data = {
    'col1': [1, 2, 2, 3, 4, 4, 5],
    'col2': ['A', 'B', 'B', 'C', 'E', 'E', 'F'],
    'col3': ['X', 'Y', 'Y', 'Z', 'S', 'S', 'V']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

Original DataFrame:
   col1 col2 col3
0     1    A    X
1     2    B    Y
2     2    B    Y
3     3    C    Z
4     4    E    S
5     4    E    S
6     5    F    V


In [70]:
# Drop duplicate rows (considering all columns)
df_cleaned = df.drop_duplicates()
print("\nDataFrame after dropping duplicates (all columns):")
print(df_cleaned)


DataFrame after dropping duplicates (all columns):
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [86]:
# Drop duplicates based on 'col1' 
df_col1_unique = df.drop_duplicates(subset=['col1'])
print("\nDataFrame after dropping duplicates (based on 'col1'):")
print(df_col1_unique)


DataFrame after dropping duplicates (based on 'col1'):
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [72]:
# Drop duplicates based on 'col1' and 'col2'
df_col1_col2_unique = df.drop_duplicates(subset=['col1', 'col2'])
print("\nDataFrame after dropping duplicates (based on 'col1' and 'col2'):")
print(df_col1_col2_unique)


DataFrame after dropping duplicates (based on 'col1' and 'col2'):
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [74]:
# Keep the last occurrence of duplicates
df_keep_last = df.drop_duplicates(keep='last')
print("\nDataFrame after dropping duplicates (keep last):")
print(df_keep_last)


DataFrame after dropping duplicates (keep last):
   col1 col2 col3
0     1    A    X
2     2    B    Y
3     3    C    Z
5     4    E    S
6     5    F    V


In [76]:
# Drop all occurrences of duplicates
df_drop_all = df.drop_duplicates(keep=False)   # the source DataFrame is changed and None is returned
print("\nDataFrame after dropping duplicates (drop all occurrences):")
print(df_drop_all)


DataFrame after dropping duplicates (drop all occurrences):
   col1 col2 col3
0     1    A    X
3     3    C    Z
6     5    F    V


In [78]:
# Modify DataFrame in place
print("\nOriginal DataFrame before inplace drop:")
print(df)
df.drop_duplicates(inplace=True)
print("\nDataFrame after inplace drop:")
print(df)


Original DataFrame before inplace drop:
   col1 col2 col3
0     1    A    X
1     2    B    Y
2     2    B    Y
3     3    C    Z
4     4    E    S
5     4    E    S
6     5    F    V

DataFrame after inplace drop:
   col1 col2 col3
0     1    A    X
1     2    B    Y
3     3    C    Z
4     4    E    S
6     5    F    V


In [80]:
df

Unnamed: 0,col1,col2,col3
0,1,A,X
1,2,B,Y
3,3,C,Z
4,4,E,S
6,5,F,V


In [84]:
print("\nBoolean Series indicating duplicate rows:")
print(df.duplicated())
print('-'*30)
# To see the actual duplicate rows (before dropping them)
print("\nActual duplicate rows:")
print(df[df.duplicated(keep=False)]) # keep=False shows all duplicates


Boolean Series indicating duplicate rows:
0    False
1    False
3    False
4    False
6    False
dtype: bool
------------------------------

Actual duplicate rows:
Empty DataFrame
Columns: [col1, col2, col3]
Index: []


##### Drop Missing Data

In [90]:
data = {'A': [1, 2, np.nan, 4, 5],
        'B': [6, np.nan, 8, 9, 10],
        'C': [12, 24, 45, np.nan, 15],
        'D': [16, 17, 56, 19, np.nan],
        'E': [21, 22, 21, 24, 25],
        'F': [np.nan, np.nan, np.nan, np.nan, np.nan]}  

NameError: name 'np' is not defined

In [94]:
import numpy as np

In [96]:
data = {'A': [1, 2, np.nan, 4, 5],
        'B': [6, np.nan, 8, 9, 10],
        'C': [12, 24, 45, np.nan, 15],
        'D': [16, 17, 56, 19, np.nan],
        'E': [21, 22, 21, 24, 25],
        'F': [np.nan, np.nan, np.nan, np.nan, np.nan]}  

In [100]:
df_missing_data = pd.DataFrame(data)

In [106]:
print("Original DataFrame:")
print(df_missing_data)

Original DataFrame:
     A     B     C     D   E   F
0  1.0   6.0  12.0  16.0  21 NaN
1  2.0   NaN  24.0  17.0  22 NaN
2  NaN   8.0  45.0  56.0  21 NaN
3  4.0   9.0   NaN  19.0  24 NaN
4  5.0  10.0  15.0   NaN  25 NaN


In [114]:
# 1. Dropping rows with nan
df_rows_dropped_any = df_missing_data.dropna() 
df_rows_dropped_any

Unnamed: 0,A,B,C,D,E,F


In [116]:
print("\nDataFrame after dropping rows with ANY NaN:")
print(df_rows_dropped_any)


DataFrame after dropping rows with ANY NaN:
Empty DataFrame
Columns: [A, B, C, D, E, F]
Index: []


##### Modify Columns

In [121]:
# Creating dataframe 
data = {
    'Name': ['Ashu', 'Sanju', 'Charles', 'Dhavan', 'Eve'],
    'Age': [25, 30, 35, 28, 22],
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    'Salary': [50000, 60000, 75000, 55000, 48000]
}
df3 = pd.DataFrame(data)
print("Original DataFrame:")
print(df3)
print("-" * 30)

Original DataFrame:
      Name  Age      City  Salary
0     Ashu   25  New York   50000
1    Sanju   30    London   60000
2  Charles   35     Paris   75000
3   Dhavan   28     Tokyo   55000
4      Eve   22    Sydney   48000
------------------------------


In [123]:
# 1. Renaming Columns
df3_renamed_single = df3.rename(columns={'Age': 'YearsOld'})
df3_renamed_single

Unnamed: 0,Name,YearsOld,City,Salary
0,Ashu,25,New York,50000
1,Sanju,30,London,60000
2,Charles,35,Paris,75000
3,Dhavan,28,Tokyo,55000
4,Eve,22,Sydney,48000


In [131]:
df3_renamed_multiple = df3.rename(columns={'Name': 'Fullname', 'city': 'Location'})
df3_renamed_multiple

Unnamed: 0,Fullname,Age,City,Salary
0,Ashu,25,New York,50000
1,Sanju,30,London,60000
2,Charles,35,Paris,75000
3,Dhavan,28,Tokyo,55000
4,Eve,22,Sydney,48000


In [133]:
# Rename in-place 
df3.rename(columns={'Salary': 'Income'}, inplace=True)
print("Original DataFrame after in-place rename of 'Salary':")
print(df3)
print("-" * 30)

Original DataFrame after in-place rename of 'Salary':
      Name  Age      City  Income
0     Ashu   25  New York   50000
1    Sanju   30    London   60000
2  Charles   35     Paris   75000
3   Dhavan   28     Tokyo   55000
4      Eve   22    Sydney   48000
------------------------------


In [145]:
# 2. Assigning a new list to df.columns (Recommended for renaming all columns)
# Create DataFrame 
data2 = {
    'test1': [1, 2, 3],
    'test2': [4, 5, 6]
}
data2

df4 = pd.Dataframe(data2) 
df4

AttributeError: module 'pandas' has no attribute 'Dataframe'

In [149]:
import pandas as pd

In [155]:
# Create DataFrame 
data_2 = {
    'test1': [1, 2, 3],
    'test2': [4, 5, 6]
}
data_2


{'test1': [1, 2, 3], 'test2': [4, 5, 6]}

In [165]:
data_all = {
     'test1': [1, 2, 3],
    'test2': [4, 5, 6]
}
df4 = pd.DataFrame(data_all)
df4

Unnamed: 0,test1,test2
0,1,4
1,2,5
2,3,6


In [169]:
df4.columns=['Column1', 'Column2']
print("DataFrame after renaming all columns:")
df4

DataFrame after renaming all columns:


Unnamed: 0,Column1,Column2
0,1,4
1,2,5
2,3,6


In [None]:
# 3. Adding Columns
