# -------------- **Pandas Data Cleaning** --------------

                             "30 January 2024"   - Akanksha              

In [2]:
import pandas as pd

# Toy dataset: a numeric column 'A' and a text column 'B', each containing
# exactly one missing entry (None).
data = dict(
    A=[10, 20, None, 30, 40],
    B=[None, 'chennai', 'coimbatore', 'london', 'america'],
)

df = pd.DataFrame(data)


In [4]:
# Print the whole DataFrame so the missing entries in each column are visible.
print(df)

      A           B
0  10.0        None
1  20.0     chennai
2   NaN  coimbatore
3  30.0      london
4  40.0     america


## Handling Missing Values:

### Dropping Rows or Columns with Missing Values

Dropping rows with missing values:


In [6]:
# Keep only the rows in which no column has a missing value.
clean_df = df.dropna(axis=0, how='any')
print(clean_df)

      A        B
1  20.0  chennai
3  30.0   london
4  40.0  america


Dropping columns with missing values:



In [7]:
# Keep only the columns that contain no missing value in any row.
clean_df = df.dropna(axis='columns', how='any')
print(clean_df)

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


Filling missing values of 'A' with the mean of the column:



In [8]:
# Fill the gaps in 'A' with the column mean and assign the result back.
# Calling fillna(..., inplace=True) on df['A'] is chained in-place mutation of
# a temporary object: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled the parent DataFrame is left unchanged.
df['A'] = df['A'].fillna(df['A'].mean())
print(df)

      A           B
0  10.0        None
1  20.0     chennai
2  25.0  coimbatore
3  30.0      london
4  40.0     america


## Removing Duplicates:

### Removing Duplicate Rows

Removing duplicate rows:


In [9]:
# Drop rows that repeat an earlier row across all columns; the first
# occurrence of each row is the one that is kept.
x1 = df.drop_duplicates(subset=None, keep='first')
print(x1)

      A           B
0  10.0        None
1  20.0     chennai
2  25.0  coimbatore
3  30.0      london
4  40.0     america


## Data Type Conversion:

### Converting Data Types

Converting 'A' column data types to integers:




In [10]:
# Cast column 'A' to the integer dtype and store it back under the same name.
df['A'] = df['A'].astype(dtype=int)
print(df)

    A           B
0  10        None
1  20     chennai
2  25  coimbatore
3  30      london
4  40     america


## String Cleaning:

### Stripping and Lowercasing Strings

Stripping leading and trailing whitespace from the 'B' column, then converting it to lowercase:




In [11]:
# Trim whitespace from both ends of every string in column 'B'.
df['B'] = df['B'].str.strip(to_strip=None)
print(df)

    A           B
0  10        None
1  20     chennai
2  25  coimbatore
3  30      london
4  40     america


In [12]:
# Lowercase every string in column 'B' and write the result back.
df['B'] = df.loc[:, 'B'].str.lower()
print(df)

    A           B
0  10        None
1  20     chennai
2  25  coimbatore
3  30      london
4  40     america


## Removing Irrelevant Columns:

### Removing Columns and Replacing Values

Removing the 'C' column (note: the example DataFrame defined above only has columns 'A' and 'B'):






In [None]:
# Drop column 'C' if it is present. The sample frame built at the top of the
# notebook only has columns 'A' and 'B', so a plain drop raises KeyError;
# errors='ignore' makes the call a no-op when the column is absent.
# The result is assigned back instead of using inplace=True so the cell can be
# re-run safely.
df = df.drop(columns=['C'], errors='ignore')
print(df)

Replacing 'cherry' with 'orange' in the 'B' column (no entry in 'B' equals 'cherry', so the output below is unchanged):


In [14]:
# Swap every entry of column 'B' that is exactly 'cherry' for 'orange'.
df['B'] = df['B'].replace(to_replace='cherry', value='orange')
print(df)

    A           B
0  10        None
1  20     chennai
2  25  coimbatore
3  30      london
4  40     america


# Data Transformation

## apply():

### Applying a Function to a Column

Doubling the values in 'A' column using the `apply` method:




In [15]:
def double_value(x):
    """Return ``x`` multiplied by two."""
    doubled = x * 2
    return doubled

# Run double_value on each entry of 'A' and store the results in a new
# 'A_doubled' column.
df['A_doubled'] = df['A'].apply(func=double_value)
print(df)

    A           B  A_doubled
0  10        None         20
1  20     chennai         40
2  25  coimbatore         50
3  30      london         60
4  40     america         80


## map():

### Mapping Categories to Numerical Values

Mapping categories in 'Category' column to numerical values:





In [17]:
# A fresh single-column frame of category labels.
data = dict(Category=['A', 'B', 'A', 'C', 'B'])
df = pd.DataFrame(data)

# Lookup table from category label to its numeric code.
category_mapping = dict(A=1, B=2, C=3)

# Series.map looks each label up in the dict and returns the matching code.
df['Category_Num'] = df['Category'].map(arg=category_mapping)
print(df)

  Category  Category_Num
0        A             1
1        B             2
2        A             1
3        C             3
4        B             2


## applymap():

### Applying Element-wise Function to DataFrame

Squaring all elements in the DataFrame (note: since pandas 2.1, `DataFrame.applymap` is deprecated in favor of `DataFrame.map`):


In [19]:
# Small all-numeric frame used for the element-wise demonstration.
data = {'A': [1, 2, 3],
        'B': [4, 5, 6]}
df = pd.DataFrame(data)


def square(x):
    """Return ``x`` raised to the power of two."""
    return x ** 2


# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated. Use the new name when it exists and fall back to
# applymap on older pandas versions; both apply `square` to every element.
df_squared = df.map(square) if hasattr(df, 'map') else df.applymap(square)
print(df_squared)

   A   B
0  1  16
1  4  25
2  9  36
