# REPORT ON HOW TO CLEAN DATA USING PANDAS FUNCTIONS

# In Python, there are several functions and methods that can be used to clean and preprocess data. Here are some examples:

In [1]:
import pandas as pd

In [2]:
import numpy as np 

In [3]:
# We will now import a sample raw dataset and apply data cleaning methods to it:

In [5]:
# Load the raw attrition dataset from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="D:\\Ed-byte\\python\\Attrition.csv")

# 1. Handling missing values:

In [7]:
# Let us replace missing values with the mean of the column

In [9]:
# Assign the filled column back to the DataFrame rather than calling
# fillna(inplace=True) on the df['Age'] selection: the chained in-place form
# acts on an intermediate object, emits a FutureWarning on recent pandas
# releases, and does not update df at all once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [10]:
# Let us now drop rows with missing values

In [11]:
# Remove, in place, every row (axis=0) that still has a missing value in
# any column (how='any').
df.dropna(axis=0, how='any', inplace=True)

# 2. Removing the duplicates:

In [12]:
# We will now drop the duplicate rows based on all columns:

In [13]:
# subset=None compares all columns; keep='first' retains the first
# occurrence of each duplicated row and drops the later ones.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [14]:
# We will now drop the duplicate rows based on specific columns:

In [15]:
# Rows are treated as duplicates when both listed columns match; only the
# first occurrence of each (DailyRate, Department) pair is kept.
df.drop_duplicates(
    subset=['DailyRate', 'Department'],
    keep='first',
    inplace=True,
)

# 3. Normalizing data:

In [19]:
# We will now normalize the data in the following way:

In [20]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Create a min-max scaler with its default settings (the scikit-learn default
# feature_range is (0, 1)).
scaler = MinMaxScaler()
# Fit the scaler on the two selected columns and overwrite them in df with
# the scaled values returned by fit_transform.
df[['DailyRate', 'DistanceFromHome']] = scaler.fit_transform(df[['DailyRate', 'DistanceFromHome']])

# 4. Converting data types:

In [22]:
# Let us now convert a column to numeric type:

In [24]:
# Convert the column to a numeric dtype and store the result back in df.
df['DailyRate'] = pd.to_numeric(
    df['DailyRate'],
    errors='coerce',  # values that cannot be parsed become NaN instead of raising
)

In [28]:
# Note: We can also convert a column into datetime type by using the following syntax:
# df['date_column'] = pd.to_datetime(df['date_column'])

# 5. Renaming columns:

In [29]:
# We will now rename multiple columns at the same time using the following syntax:

In [30]:
# Rename the two columns via an old-name -> new-name mapping applied along
# the column axis; rename returns a new DataFrame, which is bound back to df.
df = df.rename(
    {'BusinessTravel': 'BusinessTrips', 'EducationField': 'Stream'},
    axis='columns',
)