### Importing Pandas Library

In [1]:
import pandas as pd 

### Read the CSV file into a pandas DataFrame

In [2]:
# Load the raw records from sample.csv into the working frame `data`.
data = pd.read_csv(filepath_or_buffer='sample.csv')

### Explore the dataset 

In [3]:
# Preview the first rows (at most five) of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,123-456-7890
1,2,Jane Smith,32.0,janesmith@example.com,987-654-3210
2,3,Mark Johnson,,markjohnson@example.com,
3,4,Lisa Brown,40.0,,345-678-9012


In [4]:
# Summarize each column's dtype and non-null count to see where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      4 non-null      int64  
 1   name    4 non-null      object 
 2   age     3 non-null      float64
 3   email   3 non-null      object 
 4   phone   3 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 288.0+ bytes


### Handle Missing Values

In [5]:
# Fill missing values: age gets the placeholder 0, email and phone get an empty string.
# Each result is assigned back to its column. The previous form,
# data[col].fillna(..., inplace=True), mutates an intermediate Series: it raises a
# chained-assignment FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is enabled.
data['age'] = data['age'].fillna(0)  # Replace missing age values with 0
data['email'] = data['email'].fillna('')  # Replace missing email values with an empty string
data['phone'] = data['phone'].fillna('')  # Replace missing phone values with an empty string

In [6]:
# Re-inspect the first rows (at most five) after filling the missing values.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,123-456-7890
1,2,Jane Smith,32.0,janesmith@example.com,987-654-3210
2,3,Mark Johnson,0.0,markjohnson@example.com,
3,4,Lisa Brown,40.0,,345-678-9012


### Standardize formats

In [7]:
# Normalize text formats so that equal values compare equal.
data['email'] = data['email'].str.lower()  # Convert email addresses to lowercase
# regex=False: '-' is a literal character here. Stating it explicitly keeps the
# result independent of the pandas version's default for `regex`.
data['phone'] = data['phone'].str.replace('-', '', regex=False)  # Remove dashes from phone numbers


In [8]:
# Check the first rows (at most five) after lowercasing emails and stripping dashes.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,1234567890.0
1,2,Jane Smith,32.0,janesmith@example.com,9876543210.0
2,3,Mark Johnson,0.0,markjohnson@example.com,
3,4,Lisa Brown,40.0,,3456789012.0


In [9]:
# Re-check the dtypes and non-null counts of every column after the fills above.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      4 non-null      int64  
 1   name    4 non-null      object 
 2   age     4 non-null      float64
 3   email   4 non-null      object 
 4   phone   4 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 288.0+ bytes


### Remove duplicates 

In [10]:
# Remove duplicate rows based on the email column, keeping the first occurrence.
# Rows without an email (NaN or the empty-string placeholder) are all kept:
# they mean "email unknown", and a plain drop_duplicates(subset=['email'])
# would treat every such row after the first as a duplicate and delete it.
has_no_email = data['email'].isna() | (data['email'] == '')
is_repeat = data.duplicated(subset=['email'], keep='first')
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
data = data.loc[has_no_email | ~is_repeat].copy()

In [11]:
# Look at the first rows (at most five) after removing duplicate emails.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,1234567890.0
1,2,Jane Smith,32.0,janesmith@example.com,9876543210.0
2,3,Mark Johnson,0.0,markjohnson@example.com,
3,4,Lisa Brown,40.0,,3456789012.0


### Save the cleaned data to a new CSV file

In [12]:
# Write the current state of the frame to disk; index=False leaves the row index out of the file.
data.to_csv(path_or_buf='cleaned_sample_data.csv', index=False)

### Further cleaning: replace placeholder ages (0) with the mean age

In [14]:
# Mean of the non-zero ages only; rows whose age equals 0 are left out of the average.
mean_age = data.loc[data['age'] != 0, 'age'].mean()

In [15]:
# Overwrite every age that equals 0 with mean_age.
is_zero_age = data['age'].eq(0)
data.loc[is_zero_age, 'age'] = mean_age

In [16]:
# Inspect the first rows (at most five) after replacing zero ages with the mean.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,1234567890.0
1,2,Jane Smith,32.0,janesmith@example.com,9876543210.0
2,3,Mark Johnson,32.333333,markjohnson@example.com,
3,4,Lisa Brown,40.0,,3456789012.0


### Replace empty email values with the name (spaces removed) + "@example.com"

In [17]:
# Build a fallback address for every row (name with spaces removed, then
# "@example.com") and write it only where the email is the empty string.
fallback_email = data['name'].str.replace(' ', '') + '@example.com'
data.loc[data['email'].eq(''), 'email'] = fallback_email


In [18]:
# Inspect the first rows (at most five) after filling in the empty emails.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,1234567890.0
1,2,Jane Smith,32.0,janesmith@example.com,9876543210.0
2,3,Mark Johnson,32.333333,markjohnson@example.com,
3,4,Lisa Brown,40.0,LisaBrown@example.com,3456789012.0


### Standardize formats again (lowercase the generated emails)

In [19]:
# Lowercase the email column again: the addresses generated from names keep the names' capital letters.
lowercased_email = data['email'].str.lower()
data['email'] = lowercased_email


In [20]:
# Confirm on the first rows (at most five) that every email is now lowercase.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25.0,johndoe@example.com,1234567890.0
1,2,Jane Smith,32.0,janesmith@example.com,9876543210.0
2,3,Mark Johnson,32.333333,markjohnson@example.com,
3,4,Lisa Brown,40.0,lisabrown@example.com,3456789012.0


### Convert 'age' column to integer

In [21]:
# Store age as whole years. round() runs first because astype(int) on its own
# truncates toward zero, so an imputed mean such as 32.7 would be stored as 32
# instead of 33.
data['age'] = data['age'].round().astype(int)


In [22]:
# Check on the first rows (at most five) that age is now displayed as an integer.
data.head(n=5)

Unnamed: 0,id,name,age,email,phone
0,1,John Doe,25,johndoe@example.com,1234567890.0
1,2,Jane Smith,32,janesmith@example.com,9876543210.0
2,3,Mark Johnson,32,markjohnson@example.com,
3,4,Lisa Brown,40,lisabrown@example.com,3456789012.0


### Save the fully cleaned data to a final CSV file

In [23]:
# Write the final frame to its own file; index=False leaves the row index out of the file.
data.to_csv(path_or_buf='cleaned_sample_data_final.csv', index=False)