# Data Cleaning & Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    "Age": [10, 20, np.nan, 24, np.nan, 18],
    "Math_Score": [88, np.nan, 80, 90, 88, np.nan],
    "Department": ["CSE", "CSE", np.nan, "ECE", "ME", "ECE"]
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

Original Data:
    Age  Math_Score Department
0  10.0        88.0        CSE
1  20.0         NaN        CSE
2   NaN        80.0        NaN
3  24.0        90.0        ECE
4   NaN        88.0         ME
5  18.0         NaN        ECE


## 1. Mean / Median / Mode Imputation

In [4]:
df["Math_Score"].fillna(df["Math_Score"].mean(), inplace=True)

df["Age"].fillna(df["Age"].median(), inplace=True)

df["Department"].fillna(df["Department"].mode()[0], inplace=True)

display(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Math_Score"].fillna(df["Math_Score"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Unnamed: 0,Age,Math_Score,Department
0,10.0,88.0,CSE
1,20.0,86.5,CSE
2,19.0,80.0,CSE
3,24.0,90.0,ECE
4,19.0,88.0,ME
5,18.0,86.5,ECE


## 2. Forward / Backward Fill

In [5]:
# Forward fill
df_ffill = df.copy()
df_ffill["Age"] = df_ffill["Age"].ffill()

# Backward fill 
df_bfill = df.copy()
df_bfill["Age"] = df_bfill["Age"].bfill()

print("Forward Fill Example:\n")
display(df_ffill)

print("\nBackward Fill Example:")
display(df_bfill)


Forward Fill Example:



Unnamed: 0,Age,Math_Score,Department
0,10.0,88.0,CSE
1,20.0,86.5,CSE
2,19.0,80.0,CSE
3,24.0,90.0,ECE
4,19.0,88.0,ME
5,18.0,86.5,ECE



Backward Fill Example:


Unnamed: 0,Age,Math_Score,Department
0,10.0,88.0,CSE
1,20.0,86.5,CSE
2,19.0,80.0,CSE
3,24.0,90.0,ECE
4,19.0,88.0,ME
5,18.0,86.5,ECE


## 3. Regression / KNN Imputation (Advanced)

In [6]:
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Example using KNN Imputer
knn_df = pd.DataFrame({
    "Age": [20, 21, np.nan, 23, 24, np.nan],
    "Math_Score": [78, np.nan, 85, 90, 92, 88]
})

imputer = KNNImputer(n_neighbors=2)
knn_imputed = imputer.fit_transform(knn_df)
knn_df = pd.DataFrame(knn_imputed, columns=knn_df.columns)

print("\nKNN Imputation Example:\n")
display(knn_df)





KNN Imputation Example:



Unnamed: 0,Age,Math_Score
0,20.0,78.0
1,21.0,84.0
2,21.5,85.0
3,23.0,90.0
4,24.0,92.0
5,23.5,88.0


## Using Regression for Age ~ Math_Score

In [7]:
reg_df = pd.DataFrame({
    "Age": [ 21, np.nan, 23, np.nan, 22],
    "Math_Score": [ 80, 85, 90, 88, 92]
})

train = reg_df[reg_df["Age"].notna()]
test = reg_df[reg_df["Age"].isna()]

lr = LinearRegression()
lr.fit(train[["Math_Score"]], train["Age"])

reg_df.loc[reg_df["Age"].isna(), "Age"] = lr.predict(test[["Math_Score"]])

print("\nRegression Imputation Example:\n")
display(reg_df)


Regression Imputation Example:



Unnamed: 0,Age,Math_Score
0,21.0,80
1,21.717742,85
2,23.0,90
3,22.080645,88
4,22.0,92


## Outlier Handling Approaches

In [9]:
import pandas as pd
import numpy as np

data = {
    "Salary": [ 27000, 30000, 28000, 26000, 25500, 27000, 950000, 31000, 29500]
}
df = pd.DataFrame(data)
print("Original Data:\n")
display(df)


Original Data:



Unnamed: 0,Salary
0,27000
1,30000
2,28000
3,26000
4,25500
5,27000
6,950000
7,31000
8,29500


## 1. Detecting Outliers – Z-score Method  Z=(X-M)/sigma 
## Outlier if Z > 3 or Z < -3

In [10]:
from scipy import stats

df["Z_score"] = stats.zscore(df["Salary"])

outliers_z = df[df["Z_score"].abs() > 3]
print("\nOutliers detected by Z-score:\n")
display(outliers_z)



Outliers detected by Z-score:



Unnamed: 0,Salary,Z_score


## Detecting Outliers – IQR Method IQR=Q3−Q1 Lower Bound=Q1−1.5×IQR Upper Bound=Q3+1.5×IQR A data point  x  is considered an outlier if: x<Q1−1.5×IQRorx>Q3+1.5×IQR

In [12]:
Q1 = df["Salary"].quantile(0.25)
Q3 = df["Salary"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers_iqr = df[(df["Salary"] < lower_bound) | (df["Salary"] > upper_bound)]
print("\nOutliers detected by IQR:\n")
display(outliers_iqr)



Outliers detected by IQR:



Unnamed: 0,Salary,Z_score
6,950000,2.828376


##  Handling Outliers Options:

## Remove them

## Cap values within thresholds

## Transform (log/sqrt)

## Option 1: Remove outliers


In [13]:
df_removed = df[(df["Salary"] >= lower_bound) & (df["Salary"] <= upper_bound)]

## Option 2: Cap values

In [14]:
df_capped = df.copy()
df_capped["Salary"] = np.where(df_capped["Salary"] > upper_bound, upper_bound,
                               np.where(df_capped["Salary"] < lower_bound, lower_bound, df_capped["Salary"]))



## Option 3: Log Transform

In [15]:
df_log = df.copy()
df_log["Salary"] = np.log(df_log["Salary"])

print("\nAfter Removing Outliers:\n")
display(df_removed)

print("\nAfter Capping Outliers:\n")
display(df_capped)

print("\nAfter Log Transformation:\n")
display(df_log)


After Removing Outliers:



Unnamed: 0,Salary,Z_score
0,27000,-0.356998
1,30000,-0.346645
2,28000,-0.353547
3,26000,-0.360449
4,25500,-0.362175
5,27000,-0.356998
7,31000,-0.343194
8,29500,-0.34837



After Capping Outliers:



Unnamed: 0,Salary,Z_score
0,27000.0,-0.356998
1,30000.0,-0.346645
2,28000.0,-0.353547
3,26000.0,-0.360449
4,25500.0,-0.362175
5,27000.0,-0.356998
6,34500.0,2.828376
7,31000.0,-0.343194
8,29500.0,-0.34837



After Log Transformation:



Unnamed: 0,Salary,Z_score
0,10.203592,-0.356998
1,10.308953,-0.346645
2,10.23996,-0.353547
3,10.165852,-0.360449
4,10.146434,-0.362175
5,10.203592,-0.356998
6,13.764217,2.828376
7,10.341742,-0.343194
8,10.292146,-0.34837


In [16]:
import pandas as pd
import numpy as np

# Sample data
data = {'Age': [25, 30, 22, -5, 27, 120, 28, 26, 24, 23]}
df = pd.DataFrame(data)

# Calculate Q1, Q3, and IQR
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)]
display("Outliers:", outliers)




'Outliers:'

Unnamed: 0,Age
3,-5
5,120


## 1. Remove outliers

In [17]:
df_removed = df[~((df['Age'] < lower_bound) | (df['Age'] > upper_bound))]
display("After removing outliers:", df_removed)

'After removing outliers:'

Unnamed: 0,Age
0,25
1,30
2,22
4,27
6,28
7,26
8,24
9,23


## 2. Capping (Winsorization)

In [18]:
df_capped = df.copy()
df_capped['Age'] = np.where(df_capped['Age'] < lower_bound, lower_bound, df_capped['Age'])
df_capped['Age'] = np.where(df_capped['Age'] > upper_bound, upper_bound, df_capped['Age'])
display("After capping outliers:", df_capped)


'After capping outliers:'

Unnamed: 0,Age
0,25.0
1,30.0
2,22.0
3,16.5
4,27.0
5,34.5
6,28.0
7,26.0
8,24.0
9,23.0


## 3. Transformation (log, only for positive values)

In [19]:
df_transformed = df[df['Age'] > 0].copy()
df_transformed['Age_log'] = np.log(df_transformed['Age'])
display("After log transformation:", df_transformed)

'After log transformation:'

Unnamed: 0,Age,Age_log
0,25,3.218876
1,30,3.401197
2,22,3.091042
4,27,3.295837
5,120,4.787492
6,28,3.332205
7,26,3.258097
8,24,3.178054
9,23,3.135494


## 4. Imputation (replace outliers with median)

In [20]:
median_age = df['Age'].median()
df_imputed = df.copy()
df_imputed['Age'] = np.where((df['Age'] < lower_bound) | (df['Age'] > upper_bound), median_age, df_imputed['Age'])
display("After imputing outliers:", df_imputed)

'After imputing outliers:'

Unnamed: 0,Age
0,25.0
1,30.0
2,22.0
3,25.5
4,27.0
5,25.5
6,28.0
7,26.0
8,24.0
9,23.0


## 5. Keep them (just identify and leave them)

In [21]:
display("Keeping outliers:", outliers)

'Keeping outliers:'

Unnamed: 0,Age
3,-5
5,120


## Data Scrubbing and Cleaning

In [22]:

data = {
    'StudentID': [101, 102, 103, 102],
    'Name': ['Alice', 'Bob', 'Charlie', 'Bob'],
    'Department': ['ECE', 'EcE', 'CSE', 'EcE'],
    'Age': [20, -5, 22, 25],
    'Score': [85, 110, 90, 88],
    'JoinDate': ['2025-01-15', '15/02/2025', '2025-03-01', '03-04-2025']
}

df = pd.DataFrame(data)
print("Original Data:\n", df)


Original Data:
    StudentID     Name Department  Age  Score    JoinDate
0        101    Alice        ECE   20     85  2025-01-15
1        102      Bob        EcE   -5    110  15/02/2025
2        103  Charlie        CSE   22     90  2025-03-01
3        102      Bob        EcE   25     88  03-04-2025


## 1. Remove duplicates

In [23]:
df = df.drop_duplicates()
display("After removing duplicates:", df)
print("\n")

'After removing duplicates:'

Unnamed: 0,StudentID,Name,Department,Age,Score,JoinDate
0,101,Alice,ECE,20,85,2025-01-15
1,102,Bob,EcE,-5,110,15/02/2025
2,103,Charlie,CSE,22,90,2025-03-01
3,102,Bob,EcE,25,88,03-04-2025






## 2. Correct typos / standardize text

In [24]:
df['Department'] = df['Department'].str.upper()
print("\n")
display("After standardizing Department:", df)





'After standardizing Department:'

Unnamed: 0,StudentID,Name,Department,Age,Score,JoinDate
0,101,Alice,ECE,20,85,2025-01-15
1,102,Bob,ECE,-5,110,15/02/2025
2,103,Charlie,CSE,22,90,2025-03-01
3,102,Bob,ECE,25,88,03-04-2025


## 3. Validate ranges

In [25]:

df['Age'] = df['Age'].apply(lambda x: np.nan if x < 0 else x)
df['Score'] = df['Score'].apply(lambda x: 100 if x > 100 else x)
print("\n")
display("After validating Age and Score:", df)





'After validating Age and Score:'

Unnamed: 0,StudentID,Name,Department,Age,Score,JoinDate
0,101,Alice,ECE,20.0,85,2025-01-15
1,102,Bob,ECE,,100,15/02/2025
2,103,Charlie,CSE,22.0,90,2025-03-01
3,102,Bob,ECE,25.0,88,03-04-2025


## 4. Standardize date format

In [26]:
df['JoinDate'] = pd.to_datetime(df['JoinDate'], dayfirst=True, errors='coerce')
print("\n")
display("After formatting JoinDate:", df)





  df['JoinDate'] = pd.to_datetime(df['JoinDate'], dayfirst=True, errors='coerce')


'After formatting JoinDate:'

Unnamed: 0,StudentID,Name,Department,Age,Score,JoinDate
0,101,Alice,ECE,20.0,85,2025-01-15
1,102,Bob,ECE,,100,NaT
2,103,Charlie,CSE,22.0,90,2025-03-01
3,102,Bob,ECE,25.0,88,NaT


## 5. Handle invalid categories (example)

In [27]:

df['Department'] = df['Department'].replace({'123': np.nan})
print("\n")
display("After handling invalid Department:", df)





'After handling invalid Department:'

Unnamed: 0,StudentID,Name,Department,Age,Score,JoinDate
0,101,Alice,ECE,20.0,85,2025-01-15
1,102,Bob,ECE,,100,NaT
2,103,Charlie,CSE,22.0,90,2025-03-01
3,102,Bob,ECE,25.0,88,NaT


## Data Transformation and Normalization

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder

# Sample dataset
data = {
    'Age': [15, 22, 35, 45, 65],
    'Income': [20000, 40000, 50000, 80000, 120000],
    'Department': ['ECE', 'CSE', 'ME', 'ECE', 'CSE']
}
df = pd.DataFrame(data)
display("Original Data:", df)

'Original Data:'

Unnamed: 0,Age,Income,Department
0,15,20000,ECE
1,22,40000,CSE
2,35,50000,ME
3,45,80000,ECE
4,65,120000,CSE


## 1. Min-Max Normalization

In [29]:
scaler = MinMaxScaler()
df[['Age_MinMax', 'Income_MinMax']] = scaler.fit_transform(df[['Age', 'Income']])
display("\nAfter Min-Max Normalization:\n", df)


'\nAfter Min-Max Normalization:\n'

Unnamed: 0,Age,Income,Department,Age_MinMax,Income_MinMax
0,15,20000,ECE,0.0,0.0
1,22,40000,CSE,0.14,0.2
2,35,50000,ME,0.4,0.3
3,45,80000,ECE,0.6,0.6
4,65,120000,CSE,1.0,1.0


## 2. Z-score Standardization

In [30]:
scaler = StandardScaler()
df[['Age_Z', 'Income_Z']] = scaler.fit_transform(df[['Age', 'Income']])
display("\nAfter Z-score Standardization:\n", df)

'\nAfter Z-score Standardization:\n'

Unnamed: 0,Age,Income,Department,Age_MinMax,Income_MinMax,Age_Z,Income_Z
0,15,20000,ECE,0.0,0.0,-1.211847,-1.204433
1,22,40000,CSE,0.14,0.2,-0.815449,-0.630893
2,35,50000,ME,0.4,0.3,-0.07928,-0.344124
3,45,80000,ECE,0.6,0.6,0.487004,0.516185
4,65,120000,CSE,1.0,1.0,1.619571,1.663264


## 3. Encoding Categorical Data

In [31]:
le = LabelEncoder()
df['Dept_Label'] = le.fit_transform(df['Department'])

df = pd.get_dummies(df, columns=['Department'], prefix='Dept')
display("\nAfter Encoding Categorical Data:\n", df)

'\nAfter Encoding Categorical Data:\n'

Unnamed: 0,Age,Income,Age_MinMax,Income_MinMax,Age_Z,Income_Z,Dept_Label,Dept_CSE,Dept_ECE,Dept_ME
0,15,20000,0.0,0.0,-1.211847,-1.204433,1,False,True,False
1,22,40000,0.14,0.2,-0.815449,-0.630893,0,True,False,False
2,35,50000,0.4,0.3,-0.07928,-0.344124,2,False,False,True
3,45,80000,0.6,0.6,0.487004,0.516185,1,False,True,False
4,65,120000,1.0,1.0,1.619571,1.663264,0,True,False,False


## 4. Log Transformation (on Income)

In [32]:

df['Income_Log'] = np.log(df['Income'] + 1)  # Add 1 to avoid log(0)
display("\nAfter Log Transformation:\n", df)

'\nAfter Log Transformation:\n'

Unnamed: 0,Age,Income,Age_MinMax,Income_MinMax,Age_Z,Income_Z,Dept_Label,Dept_CSE,Dept_ECE,Dept_ME,Income_Log
0,15,20000,0.0,0.0,-1.211847,-1.204433,1,False,True,False,9.903538
1,22,40000,0.14,0.2,-0.815449,-0.630893,0,True,False,False,10.59666
2,35,50000,0.4,0.3,-0.07928,-0.344124,2,False,False,True,10.819798
3,45,80000,0.6,0.6,0.487004,0.516185,1,False,True,False,11.289794
4,65,120000,1.0,1.0,1.619571,1.663264,0,True,False,False,11.695255


## 5. Binning (Age groups)

In [33]:
bins = [0, 18, 35, 60, 100]
labels = ['0-18', '19-35', '36-60', '60+']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)
display("\nAfter Binning Age:\n", df)

'\nAfter Binning Age:\n'

Unnamed: 0,Age,Income,Age_MinMax,Income_MinMax,Age_Z,Income_Z,Dept_Label,Dept_CSE,Dept_ECE,Dept_ME,Income_Log,Age_Group
0,15,20000,0.0,0.0,-1.211847,-1.204433,1,False,True,False,9.903538,0-18
1,22,40000,0.14,0.2,-0.815449,-0.630893,0,True,False,False,10.59666,19-35
2,35,50000,0.4,0.3,-0.07928,-0.344124,2,False,False,True,10.819798,19-35
3,45,80000,0.6,0.6,0.487004,0.516185,1,False,True,False,11.289794,36-60
4,65,120000,1.0,1.0,1.619571,1.663264,0,True,False,False,11.695255,60+


## Practice SET-1

In [34]:
import pandas as pd
import numpy as np

# Seed for reproducibility
np.random.seed(42)

# Generate sample data
data = {
    'StudentID': range(1, 21),
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ian', 'Jane',
             'Kyle', 'Liam', 'Mia', 'Nora', 'Owen', 'Paula', 'Quinn', 'Ryan', 'Sophia', 'Tom'],

    # Department with typos / inconsistencies
    'Department': ['ECE', 'EcE', 'CSE', 'ME', 'ece', 'CSE', 'ME', 'EE', 'EE', 'ECE',
                   'ME', 'CSE', 'ECE', 'CSE', 'ME', '123', 'EE', 'ECE', 'CSE', 'ME'],

    # Age with some negative and extreme values
    'Age': [20, 22, 19, 21, -5, 120, 23, 24, 22, 21, 25, 30, 28, 27, 26, 200, 24, 23, 22, 21],

    # Scores with some invalid values
    'Score': [85, 90, 88, 92, 110, 95, 80, 78, 102, 87, 91, 105, 99, 77, 83, 150, 89, 84, 86, 90],

    # Income with skewed distribution
    'Income': [20000, 25000, 22000, 24000, 500000, 27000, 26000, 23000, 80000, 25000,
               24000, 26000, 27000, 28000, 30000, 1000000, 25000, 26000, 24000, 23000],

    # JoinDate in mixed formats
    'JoinDate': ['2025-01-15', '15/02/2025', '2025/03/01', '03-04-2025', '2025-05-10',
                 '10/06/2025', '2025/07/15', '15-08-2025', '2025-09-01', '01/10/2025',
                 '2025-11-05', '05/12/2025', '2025-12-20', '20/01/2025', '2025-02-14',
                 '14/03/2025', '2025-04-18', '18/05/2025', '2025-06-22', '22/07/2025']
}

# Create DataFrame
df = pd.DataFrame(data)

display("Sample Raw Data:\n", df.head(5))

'Sample Raw Data:\n'

Unnamed: 0,StudentID,Name,Department,Age,Score,Income,JoinDate
0,1,Alice,ECE,20,85,20000,2025-01-15
1,2,Bob,EcE,22,90,25000,15/02/2025
2,3,Charlie,CSE,19,88,22000,2025/03/01
3,4,David,ME,21,92,24000,03-04-2025
4,5,Eva,ece,-5,110,500000,2025-05-10


## Practice SET-2

In [35]:
import pandas as pd
import numpy as np

# Seed for reproducibility
np.random.seed(123)

# Generate sample dataset
data = {
    'EmployeeID': range(1, 21),

    # Name with typos and duplicates
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ian', 'Jane',
             'Alice', 'Liam', 'Mia', 'Nora', 'Owen', 'Paula', 'Quinn', 'Ryan', 'Sophia', 'Tom'],

    # Department with inconsistent naming
    'Department': ['HR', 'hr', 'Finance', 'FIN', 'IT', 'IT', 'Hr', 'Finance', 'IT', 'IT',
                   'HR', 'FIN', 'Finance', 'HR', 'IT', 'Unknown', 'Finance', 'HR', 'IT', 'HR'],

    # Salary with extreme outliers
    'Salary': [50000, 52000, 48000, 51000, 49000, 53000, 1000000, 52000, 51000, 50000,
               49500, 51000, 50500, 53000, 49000, 0, 52000, 51000, 50000, 49500],

    # Years of Experience with invalid / extreme values
    'Experience': [2, 5, 3, -1, 4, 6, 20, 5, 3, 4, 2, 5, 100, 6, 3, 2, 5, 3, 4, 2],

    # Joining Date with mixed formats and some missing
    'JoiningDate': ['2020-01-15', '15/02/2019', '2021/03/01', '03-04-2018', '2019-05-10',
                    '10/06/2017', '2015-07-15', '15-08-2020', '2018-09-01', '01/10/2016',
                    None, '05/12/2015', '2020-12-20', '20/01/2019', '2018-02-14',
                    '14/03/2017', '2019-04-18', '18/05/2020', '2021-06-22', '22/07/2018'],

    # Performance score with skewed distribution and missing values
    'Performance': [3, 4, 5, 2, 4, 3, 10, 4, 3, 2, 4, np.nan, 3, 4, 2, 1, 5, 4, 3, 2],

    # Education Level (categorical)
    'Education': ['Bachelors', 'Masters', 'Bachelors', 'PhD', 'Bachelors', 'Masters', 'PhD', 'Masters',
                  'Bachelors', 'Bachelors', 'Masters', 'Bachelors', 'PhD', 'Masters', 'Bachelors',
                  'Masters', 'Bachelors', 'PhD', 'Masters', 'Bachelors']
}

df = pd.DataFrame(data)
display("Sample Raw Data:\n", df.head(5))


'Sample Raw Data:\n'

Unnamed: 0,EmployeeID,Name,Department,Salary,Experience,JoiningDate,Performance,Education
0,1,Alice,HR,50000,2,2020-01-15,3.0,Bachelors
1,2,Bob,hr,52000,5,15/02/2019,4.0,Masters
2,3,Charlie,Finance,48000,3,2021/03/01,5.0,Bachelors
3,4,David,FIN,51000,-1,03-04-2018,2.0,PhD
4,5,Eva,IT,49000,4,2019-05-10,4.0,Bachelors


## Practice SET-3

In [36]:
import pandas as pd
import numpy as np

# Seed for reproducibility
np.random.seed(789)

# Generate sample dataset
data_new = {
    'CustomerID': range(1, 21),

    # Customer names with typos and duplicates
    'CustomerName': ['Liam', 'Olivia', 'Noah', 'Emma', 'Ava', 'Sophia', 'Isabella', 'Mia', 'Amelia', 'Harper',
                     'Liam', 'Evelyn', 'Abigail', 'Ella', 'Elizabeth', 'Sofia', 'Madison', 'Scarlett', 'Victoria', 'Grace'],

    # Country with inconsistent naming
    'Country': ['USA', 'usa', 'UK', 'U.K.', 'India', 'india', 'Canada', 'CAN', 'Germany', 'GER',
                'USA', 'UK', 'India', 'CAN', 'Germany', 'Unknown', 'UK', 'USA', 'India', 'GER'],

    # Age with invalid/outlier values
    'Age': [25, 30, 28, -5, 35, 200, 40, 29, 31, 33, 25, 32, 27, 150, 36, 0, 38, 30, 34, 26],

    # Account Balance with extreme outliers
    'AccountBalance': [1500, 2000, 2500, 3000, 5000, 1000000, 3500, 2200, 2100, 1800,
                       1550, 2400, 2700, 500000, 3200, 0, 2800, 2300, 2600, 1900],

    # Signup Date with mixed formats and missing values
    'SignupDate': ['2021-01-15', '15/02/2020', '2021/03/01', '03-04-2019', '2020-05-10',
                   '10/06/2018', '2017-07-15', '15-08-2021', '2018-09-01', None,
                   '01/10/2016', '05/12/2015', '2020-12-20', '20/01/2019', '2018-02-14',
                   '14/03/2017', '2019-04-18', '18/05/2020', '2021-06-22', '22/07/2018'],

    # Customer Type with typos/inconsistencies
    'CustomerType': ['Regular', 'regular', 'VIP', 'vip', 'Occasional', 'occasional', 'Regular', 'VIP',
                     'Occasional', 'Regular', 'Regular', 'VIP', 'VIP', 'Occasional', 'Regular',
                     'VIP', 'Occasional', 'Regular', 'VIP', 'Occasional'],

    # Number of Purchases with unrealistic/missing values
    'Purchases': [5, 10, 7, 3, 8, 100, 6, 9, 4, 5, 7, np.nan, 8, 120, 6, 0, 10, 7, 8, 5],

    # Feedback Score (1-5) with missing and outliers
    'FeedbackScore': [4, 5, 3, 2, 6, 5, 4, 3, 2, 5, 4, np.nan, 3, 4, 2, 1, 5, 4, 3, 2]
}

df_new = pd.DataFrame(data_new)

# Display first 5 rows
display("Sample Data for Data Cleaning Assignment:\n", df_new.head(5))

'Sample Data for Data Cleaning Assignment:\n'

Unnamed: 0,CustomerID,CustomerName,Country,Age,AccountBalance,SignupDate,CustomerType,Purchases,FeedbackScore
0,1,Liam,USA,25,1500,2021-01-15,Regular,5.0,4.0
1,2,Olivia,usa,30,2000,15/02/2020,regular,10.0,5.0
2,3,Noah,UK,28,2500,2021/03/01,VIP,7.0,3.0
3,4,Emma,U.K.,-5,3000,03-04-2019,vip,3.0,2.0
4,5,Ava,India,35,5000,2020-05-10,Occasional,8.0,6.0


## Practice SET-4

In [37]:
import pandas as pd
import numpy as np

# Seed for reproducibility
np.random.seed(101)

# Generate sample dataset
hospital_data = {
    'PatientID': range(1, 21),

    # Patient names with typos/duplicates
    'PatientName': ['John Doe', 'Jane Smith', 'Jake Lee', 'Jill Brown', 'James White', 'Julia Black',
                    'Jim Green', 'Joan Blue', 'Jack Grey', 'Jerry Pink', 'John Doe', 'Janet Silver',
                    'Abigail Gold', 'Ella Violet', 'Elizabeth Orange', 'Sofia Red', 'Madison Yellow',
                    'Scarlett Indigo', 'Victoria Cyan', 'Grace Magenta'],

    # Gender with inconsistent values
    'Gender': ['M', 'F', 'Male', 'Female', 'M', 'F', 'M', 'F', 'Male', 'Female',
               'M', 'F', 'Male', 'F', 'M', 'F', 'Male', 'Female', 'M', 'F'],

    # Age with outliers and invalid values
    'Age': [25, 30, 28, -1, 35, 200, 40, 29, 31, 33, 25, 32, 27, 150, 36, 0, 38, 30, 34, 26],

    # Admission Date with mixed formats and missing values
    'AdmissionDate': ['2023-01-10', '10/02/2022', '2021/03/05', '05-04-2019', '2020-05-12',
                      '12/06/2018', '2017-07-20', None, '2018-09-15', '15/10/2016',
                      '01-11-2020', '05/12/2015', '2020-12-25', '25/01/2019', '2018-02-20',
                      '20/03/2017', '2019-04-25', '25/05/2020', '2021-06-28', '28/07/2018'],

    # Department with inconsistent naming
    'Department': ['Cardiology', 'cardiology', 'Neurology', 'NEURO', 'Orthopedics', 'orthopedics',
                   'Oncology', 'ONC', 'Pediatrics', 'PED', 'Cardiology', 'Neurology', 'Oncology',
                   'Orthopedics', 'Cardiology', 'Pediatrics', 'Neurology', 'Cardiology', 'Oncology', 'PED'],

    # Room Number with missing/duplicate values
    'RoomNumber': [101, 102, 103, 104, 105, 106, 107, None, 109, 110, 101, 112, 113, 114, 115, 116, None, 118, 119, 120],

    # Bill Amount with extreme values
    'BillAmount': [5000, 7000, 6000, 5500, 8000, 1000000, 6500, 7200, 6100, 5800,
                   5000, 6900, 6400, 500000, 6700, 0, 7100, 6000, 6800, 5900],

    # Discharge Status with inconsistent categories
    'DischargeStatus': ['Recovered', 'recovered', 'Deceased', 'deceased', 'Transferred', 'transferred',
                        'Recovered', 'Recovered', 'Deceased', 'recovered', 'Recovered', 'Deceased',
                        'Transferred', 'recovered', 'Recovered', 'Deceased', 'Transferred', 'recovered',
                        'Recovered', 'Deceased']
}

df_hospital = pd.DataFrame(hospital_data)

# Display first 5 rows
display("Sample Hospital Management Data for Cleaning Assignment:\n", df_hospital.head(5))

'Sample Hospital Management Data for Cleaning Assignment:\n'

Unnamed: 0,PatientID,PatientName,Gender,Age,AdmissionDate,Department,RoomNumber,BillAmount,DischargeStatus
0,1,John Doe,M,25,2023-01-10,Cardiology,101.0,5000,Recovered
1,2,Jane Smith,F,30,10/02/2022,cardiology,102.0,7000,recovered
2,3,Jake Lee,Male,28,2021/03/05,Neurology,103.0,6000,Deceased
3,4,Jill Brown,Female,-1,05-04-2019,NEURO,104.0,5500,deceased
4,5,James White,M,35,2020-05-12,Orthopedics,105.0,8000,Transferred
