# Data Loading

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
data = pd.read_csv('messy_data set.csv')
data

Unnamed: 0,ID,Name,Age,Email,JoinDate,Salary
0,1,Alice,25,alice@example.com,01-01-2020,50000
1,2,Bob,30,bob@example.com,not a date,60000
2,2,Bob,30,bob@example.com,01-02-2020,60000
3,4,Charlie,,charlie@,01-03-2020,70000
4,5,David,Twenty-two,,01-04-2020,
5,6,Eve,45,eve@example,01-05-2020,90000
6,7,Frank,50,frank@@example.com,01-06-2020,100000
7,8,Grace,55,grace@example.com,01-07-2020,one lakh
8,nine,Heidi,60,heidi@example.com,2020.08.01,120000
9,10,,65,10@example.com,01-09-2020,130000


# Identifying Missing Value

In [3]:
missing_data = data.isnull().sum()
print('Missing_data:')
missing_data

Missing_data:


ID          0
Name        1
Age         1
Email       1
JoinDate    0
Salary      1
dtype: int64

# Checking DataTypes

In [4]:
data.dtypes

ID          object
Name        object
Age         object
Email       object
JoinDate    object
Salary      object
dtype: object

# Assigning DataTypes

In [8]:
data = data.replace('Twenty-two',22)

In [10]:
data['Age'] = data['Age'].astype(float)

In [11]:
data = data.replace('one lakh',100000)

In [12]:
data['Salary'] = data['Salary'].astype(float)

In [13]:
data

Unnamed: 0,ID,Name,Age,Email,JoinDate,Salary
0,1,Alice,25.0,alice@example.com,01-01-2020,50000.0
1,2,Bob,30.0,bob@example.com,not a date,60000.0
2,2,Bob,30.0,bob@example.com,01-02-2020,60000.0
3,4,Charlie,,charlie@,01-03-2020,70000.0
4,5,David,22.0,,01-04-2020,
5,6,Eve,45.0,eve@example,01-05-2020,90000.0
6,7,Frank,50.0,frank@@example.com,01-06-2020,100000.0
7,8,Grace,55.0,grace@example.com,01-07-2020,100000.0
8,nine,Heidi,60.0,heidi@example.com,2020.08.01,120000.0
9,10,,65.0,10@example.com,01-09-2020,130000.0


In [14]:
data = data.replace('not a date',"NaN")

In [15]:
data = data.replace('2020.08.01',"NaN")

# Imputing Missing Value

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
imputer = SimpleImputer(strategy = 'mean')
imputer.fit(data[['Age']])
data[['Age']] = imputer.transform(data[['Age']])

In [18]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(data[['JoinDate']])
data[['JoinDate']] = imputer.transform(data[['JoinDate']]) 

In [19]:
imputer = SimpleImputer(strategy = 'most_frequent')
imputer.fit(data[['Email']])
data [['Email']] = imputer.transform(data[['Email']])

In [20]:
imputer = SimpleImputer(strategy = 'most_frequent')
imputer.fit(data[['Name']])
data [['Name']] = imputer.transform(data[['Name']])

In [21]:
data = data.replace('nine',9)

In [22]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[['Salary']])
data[['Salary']] = imputer.transform(data[['Salary']])

In [23]:
data

Unnamed: 0,ID,Name,Age,Email,JoinDate,Salary
0,1,Alice,25.0,alice@example.com,01-01-2020,50000.0
1,2,Bob,30.0,bob@example.com,,60000.0
2,2,Bob,30.0,bob@example.com,01-02-2020,60000.0
3,4,Charlie,42.444444,charlie@,01-03-2020,70000.0
4,5,David,22.0,bob@example.com,01-04-2020,86666.666667
5,6,Eve,45.0,eve@example,01-05-2020,90000.0
6,7,Frank,50.0,frank@@example.com,01-06-2020,100000.0
7,8,Grace,55.0,grace@example.com,01-07-2020,100000.0
8,9,Heidi,60.0,heidi@example.com,,120000.0
9,10,Bob,65.0,10@example.com,01-09-2020,130000.0


In [24]:
data.dtypes

ID           object
Name         object
Age         float64
Email        object
JoinDate     object
Salary      float64
dtype: object

In [25]:
# Assigning Datatypes
data['JoinDate'] = pd.to_datetime(data['JoinDate'])

In [27]:
data.dtypes

ID                  object
Name                object
Age                float64
Email               object
JoinDate    datetime64[ns]
Salary             float64
dtype: object

In [28]:
# Applying Filter_warning
import warnings
warnings.filterwarnings('ignore')

In [29]:
data['JoinDate'] = data['JoinDate'].fillna(method="ffill")

In [30]:
data

Unnamed: 0,ID,Name,Age,Email,JoinDate,Salary
0,1,Alice,25.0,alice@example.com,2020-01-01,50000.0
1,2,Bob,30.0,bob@example.com,2020-01-01,60000.0
2,2,Bob,30.0,bob@example.com,2020-01-02,60000.0
3,4,Charlie,42.444444,charlie@,2020-01-03,70000.0
4,5,David,22.0,bob@example.com,2020-01-04,86666.666667
5,6,Eve,45.0,eve@example,2020-01-05,90000.0
6,7,Frank,50.0,frank@@example.com,2020-01-06,100000.0
7,8,Grace,55.0,grace@example.com,2020-01-07,100000.0
8,9,Heidi,60.0,heidi@example.com,2020-01-07,120000.0
9,10,Bob,65.0,10@example.com,2020-01-09,130000.0


# Removing Inconsistent Value

In [31]:
unique_value = data['ID'].unique()
unique_value

array(['1', '2', '4', '5', '6', '7', '8', 9, '10'], dtype=object)

In [32]:
unique_value = data['Email'].unique()
unique_value

array(['alice@example.com', 'bob@example.com', 'charlie@', 'eve@example',
       'frank@@example.com', 'grace@example.com', 'heidi@example.com',
       '10@example.com'], dtype=object)

# Dealing With Duplicate Value

In [33]:
duplicates = data.duplicated()
data[duplicates]

Unnamed: 0,ID,Name,Age,Email,JoinDate,Salary


In [34]:
duplicates = data.duplicated()
duplicates

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool