<a href="https://colab.research.google.com/github/Ananya22-ux/OASIS-INFOBYTE/blob/main/Cleaning_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


Load Sample Data
For this walkthrough we build a small sample dataset `df` in memory; in practice, replace this cell with code that loads your own data from a CSV file or any other source.

In [None]:
# Sample data for the walkthrough (swap this cell for your real data loading code).
# Age has two missing entries and Income has one, so the cleaning steps below have work to do.
data = dict(
    ID=list(range(1, 11)),
    Age=[25, 30, np.nan, 35, 40, 45, 50, np.nan, 60, 70],
    Income=[50000, 60000, 75000, 90000, 80000, np.nan, 95000, 110000, 105000, 120000],
    Score=[85, 88, 82, 92, 89, 90, 78, 85, 95, 88],
)

# One row per ID, one column per key of `data`
df = pd.DataFrame(data)


Handling Missing Data

In [None]:
# Display initial info and missing values
print("Initial dataframe info:")
# NOTE: df.info() prints its report itself and returns None, which is why a
# trailing "None" line appears in the output of this print call.
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())

# Handle missing values by imputing with mean or median
# Impute Age with median, Income with mean
median_age = df['Age'].median()
mean_income = df['Income'].mean()

# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on df['col']: that form operates on a column
# selection (chained assignment), triggers a FutureWarning on recent pandas,
# and leaves df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(median_age)
df['Income'] = df['Income'].fillna(mean_income)

print("\nAfter handling missing values:")
print(df.isnull().sum())


Initial dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      10 non-null     int64  
 1   Age     8 non-null      float64
 2   Income  9 non-null      float64
 3   Score   10 non-null     int64  
dtypes: float64(2), int64(2)
memory usage: 448.0 bytes
None

Missing values:
ID        0
Age       2
Income    1
Score     0
dtype: int64

After handling missing values:
ID        0
Age       0
Income    0
Score     0
dtype: int64



Duplicate Removal

In [None]:
# Flag every row that exactly repeats an earlier row, then show the flagged rows
is_duplicate = df.duplicated()
duplicate_rows = df[is_duplicate]
print("\nDuplicate rows:", duplicate_rows, sep="\n")

# Discard the repeated rows from df itself and show what remains
df.drop_duplicates(inplace=True)
print("\nAfter removing duplicates:", df, sep="\n")



Duplicate rows:
Empty DataFrame
Columns: [ID, Age, Income, Score]
Index: []

After removing duplicates:
   ID   Age         Income  Score
0   1  25.0   50000.000000     85
1   2  30.0   60000.000000     88
2   3  42.5   75000.000000     82
3   4  35.0   90000.000000     92
4   5  40.0   80000.000000     89
5   6  45.0   87222.222222     90
6   7  50.0   95000.000000     78
7   8  42.5  110000.000000     85
8   9  60.0  105000.000000     95
9  10  70.0  120000.000000     88


Standardization

In [None]:
# Columns to rescale; ID is not listed, so it is left unchanged
numeric_cols = ['Age', 'Income', 'Score']

# Fit a StandardScaler on those columns and overwrite them with the scaled values
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

print("\nAfter standardization:", df, sep="\n")



After standardization:
   ID       Age    Income     Score
0   1 -1.489337 -1.793857 -0.471621
1   2 -1.097407 -1.311925  0.171499
2   3 -0.117579 -0.589028 -1.114741
3   4 -0.705476  0.133870  1.028992
4   5 -0.313545 -0.348062  0.385872
5   6  0.078386  0.000000  0.600245
6   7  0.470317  0.374836 -1.972234
7   8 -0.117579  1.097733 -0.471621
8   9  1.254179  0.856767  1.672111
9  10  2.038041  1.579665  0.171499


Outlier Detection (Optional)

In [None]:
# Detect and handle outliers (example using IQR method)
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Values equal to a fence are kept. Rows whose value is NaN fail both
    comparisons and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : str
        Name of the numeric column to test for outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. Larger values
        keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` within the fences, with their original index labels.
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    within_fences = (values >= lower_bound) & (values <= upper_bound)
    return df[within_fences]

# Apply the IQR filter to the 'Income' column and rebind df to the filtered frame
df = remove_outliers(df, 'Income')

print("\nAfter removing outliers:", df, sep="\n")



After removing outliers:
   ID       Age    Income     Score
0   1 -1.489337 -1.793857 -0.471621
1   2 -1.097407 -1.311925  0.171499
2   3 -0.117579 -0.589028 -1.114741
3   4 -0.705476  0.133870  1.028992
4   5 -0.313545 -0.348062  0.385872
5   6  0.078386  0.000000  0.600245
6   7  0.470317  0.374836 -1.972234
7   8 -0.117579  1.097733 -0.471621
8   9  1.254179  0.856767  1.672111
9  10  2.038041  1.579665  0.171499
