# Cleaning Data in Pandas DataFrames

## 1. Import libraries and dependencies

In [88]:
# Import the pandas and pathlib libraries
import pandas as pd
from pathlib import Path

## 2. Create a Path to the File Using Pathlib

In [89]:
# Build a `pathlib.Path` pointing at the source CSV (relative to this notebook)
pathfile = Path("../Resources/people_reordered.csv")

## 3. Read the CSV into a Pandas DataFrame

In [90]:
# Load the CSV at `pathfile` into a DataFrame, then preview the first five rows
ppl_DF = pd.read_csv(filepath_or_buffer=pathfile)
ppl_DF.head(5)

Unnamed: 0.1,Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,3,4.0,,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47


## 4. View Column Data Types

In [91]:
# List the data type of each column via the `dtypes` attribute (a property,
# so no call parentheses). Useful for spotting columns whose type needs
# converting, such as `Person_ID`, which is cast to int later in this notebook.
ppl_DF.dtypes


Unnamed: 0      int64
Person_ID     float64
Last_Name      object
First_Name     object
Gender         object
University     object
Occupation     object
Salary        float64
Email          object
Age             int64
dtype: object

## 5. Drop Extraneous Columns

In [92]:
# Remove the extraneous "Unnamed: 0" column with `drop`
# (it appears to be a row index saved by an earlier export), then preview the result
ppl_DF.drop(columns="Unnamed: 0", inplace=True)
ppl_DF.head(5)

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4.0,,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47


---

## 6. Identify Data Quality Issues

### 1. Identify the Number of Non-Null Values in Each Column

In [93]:
# `count` returns the number of non-null values in each column; a column whose
# count is lower than the others contains missing values.
ppl_DF.count()

Person_ID      996
Last_Name      990
First_Name     993
Gender         998
University     991
Occupation     998
Salary         998
Email          992
Age           1000
dtype: int64

### 2. Identify Frequency Counts of a Specific Column

In [94]:
# Frequency counts of the `First_Name` column: `value_counts` tallies how often
# each distinct name appears, most frequent first (nulls are excluded by default).
ppl_DF["First_Name"].value_counts()


Israel       3
Ailbert      3
Jonell       2
Joey         2
Onofredo     2
            ..
Evaleen      1
Ryley        1
Tomaso       1
Augustine    1
Paten        1
Name: First_Name, Length: 914, dtype: int64

### 3. Identify Null Values

In [95]:
# `isnull` returns a same-shaped DataFrame of booleans: True where a value is missing
ppl_DF.isnull()

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False


### 4. Determine the Number of Nulls

In [96]:
# Booleans sum as 0/1, so summing the `isnull` mask gives the null count per column
ppl_DF.isnull().sum()


Person_ID      4
Last_Name     10
First_Name     7
Gender         2
University     9
Occupation     2
Salary         2
Email          8
Age            0
dtype: int64

### 5. Determining the Percentage of Nulls for each Column

In [97]:
# Percentage of nulls per column: null count divided by the total row count, times 100
ppl_DF.isnull().sum() / len(ppl_DF) * 100

Person_ID     0.4
Last_Name     1.0
First_Name    0.7
Gender        0.2
University    0.9
Occupation    0.2
Salary        0.2
Email         0.8
Age           0.0
dtype: float64

In [98]:
# Shorter form: the mean of a boolean mask is the fraction of True values,
# so scaling it by 100 gives the null percentage per column
ppl_DF.isnull().mean() * 100

Person_ID     0.4
Last_Name     1.0
First_Name    0.7
Gender        0.2
University    0.9
Occupation    0.2
Salary        0.2
Email         0.8
Age           0.0
dtype: float64

### 6. Check for Duplicate Rows

In [99]:
# `duplicated` flags each row that exactly repeats an earlier row (True) and
# leaves first occurrences unflagged (False)
ppl_DF.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999     True
Length: 1000, dtype: bool

### 7. Check for Duplicate `First_Name` and `Last_Name` Combinations

In [100]:
# Pass the columns of interest through `subset` so that a row is flagged when
# its First_Name/Last_Name pair repeats an earlier row, whatever the other columns hold
ppl_DF.duplicated(subset=["First_Name", "Last_Name"])




0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999     True
Length: 1000, dtype: bool

---

## 7. Resolve Data Quality Issues

### 1. Fill First_Name and Last_Name Null Values with Default Value "Unnamed"

In [101]:
# Replace missing names with the placeholder "Unnamed" in both name columns
for name_col in ("First_Name", "Last_Name"):
    ppl_DF[name_col] = ppl_DF[name_col].fillna("Unnamed")
ppl_DF.head(10)


Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4.0,Unnamed,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47
5,6.0,Snozzwell,Antonio,Male,Babcock University,Technical Writer,119916.0,asnozzwell5@mysql.com,49
6,7.0,Neathway,Darya,Female,Molloy College,Administrative Officer,77705.0,dneathway6@seesaa.net,64
7,8.0,Duding,Helaina,Female,St. Paul University,Staff Scientist,57166.0,hduding7@topsy.com,26
8,9.0,Franzolini,Gerhardine,Female,Fundação Educacional de Ituverava,Environmental Specialist,73051.0,gfranzolini8@msn.com,22
9,10.0,Traut,Charo,Female,Cornell University,Programmer III,90631.0,ctraut9@oracle.com,45


### 2. Drop Remaining Records with Nulls from DataFrame

In [102]:
# Remove every remaining record (row) in which any column is still null
ppl_DF.dropna(axis=0, how="any", inplace=True)
ppl_DF.head(5)


Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4.0,Unnamed,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47


### 3. Check Null Counts for Each Column (Again)

In [109]:
# Re-count nulls per column (`isnull` + `sum`) to verify the cleansing steps
# above; every column should now report 0
ppl_DF.isnull().sum()

Person_ID     0
Last_Name     0
First_Name    0
Gender        0
University    0
Occupation    0
Salary        0
Email         0
Age           0
dtype: int64

### 4. Cleanse data by Dropping Duplicates

In [110]:
# Drop rows that repeat an earlier (First_Name, Last_Name) pair, keeping the
# first occurrence. `drop_duplicates` returns a new DataFrame by default, so
# `inplace=True` is needed for the removal to persist in `ppl_DF`; otherwise
# the duplicates are only hidden in this cell's output and remain in the data
# used by later cells.
# NOTE(review): null names were filled with "Unnamed" earlier, so rows that
# carry that placeholder in both name columns are treated as duplicates of
# each other - confirm this is intended.
ppl_DF.drop_duplicates(subset=["First_Name", "Last_Name"], inplace=True)
ppl_DF

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4.0,Unnamed,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47
...,...,...,...,...,...,...,...,...,...
994,995.0,Jeacop,Iggy,Male,University of Greenwich,Sales Associate,82726.0,ijeacoprm@4shared.com,22
995,996.0,Crumpton,Meta,Female,ECAM - Institut Supérieur Industriel,Registered Nurse,57060.0,mcrumptonrn@qq.com,52
996,997.0,Gilford,Gunar,Male,Smolny University,Marketing Manager,76109.0,ggilfordro@yandex.ru,32
997,998.0,Gurling,Lucretia,Female,Institut Teknologi Telkom,Software Engineer III,92115.0,lgurlingrp@de.vu,48


### 5. Convert Columns to Different DataTypes

In [112]:
# Use the `astype` function to convert `Person_ID` from `float` to `int`
# (possible here because rows with a null `Person_ID` were dropped above)
ppl_DF = ppl_DF.astype({"Person_ID": "int"})
ppl_DF

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4,Unnamed,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47
...,...,...,...,...,...,...,...,...,...
995,996,Crumpton,Meta,Female,ECAM - Institut Supérieur Industriel,Registered Nurse,57060.0,mcrumptonrn@qq.com,52
996,997,Gilford,Gunar,Male,Smolny University,Marketing Manager,76109.0,ggilfordro@yandex.ru,32
997,998,Gurling,Lucretia,Female,Institut Teknologi Telkom,Software Engineer III,92115.0,lgurlingrp@de.vu,48
998,999,Yang,Andrew,Male,Rutgers University School of Business,Curriculum Engineer,60000.0,ayang@codedrills.com,53


## 8. Save Cleansed Data to New CSV

In [113]:
# Save the cleansed DataFrame to the Resources folder.
# `index=False` keeps the row index out of the file; writing it would add an
# extra unlabeled column that comes back as "Unnamed: 0" when the CSV is re-read.
ppl_DF.to_csv("../Resources/ppl_clean.csv", index=False)