# Cleaning Data in Python

## Common Data Problems

In [2]:
# pip install pandas

In [26]:
import pandas as pd
ride_sharing = pd.read_csv(r'.\data\ride_sharing_new.csv', index_col=0)
ride_sharing.head(3)

Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender
0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male
1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male
2,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3652,3,1993,Male


In [4]:
# Print the information of ride_sharing
display(ride_sharing.info())

# Print summary statistics of user_type column
display(ride_sharing['user_type'].describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25760 non-null  int64 
 1   duration         25760 non-null  object
 2   station_A_id     25760 non-null  int64 
 3   station_A_name   25760 non-null  object
 4   station_B_id     25760 non-null  int64 
 5   station_B_name   25760 non-null  object
 6   bike_id          25760 non-null  int64 
 7   user_type        25760 non-null  int64 
 8   user_birth_year  25760 non-null  int64 
 9   user_gender      25760 non-null  object
dtypes: int64(6), object(4)
memory usage: 2.0+ MB


None

count    25760.000000
mean         2.008385
std          0.704541
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: user_type, dtype: float64

By looking at the summary statistics - they don't really seem to offer much description on how users are distributed along their purchase type, why do you think that is?

In [5]:
# Print the information of ride_sharing
print(ride_sharing.info())

# Print summary statistics of user_type column
print(ride_sharing['user_type'].describe())

# Convert user_type from integer to category
ride_sharing['user_type_cat'] = ride_sharing['user_type'].astype('category')

# Write an assert statement confirming the change
assert ride_sharing['user_type_cat'].dtype == 'category'

# Print new summary statistics 
print(ride_sharing['user_type_cat'].describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25760 non-null  int64 
 1   duration         25760 non-null  object
 2   station_A_id     25760 non-null  int64 
 3   station_A_name   25760 non-null  object
 4   station_B_id     25760 non-null  int64 
 5   station_B_name   25760 non-null  object
 6   bike_id          25760 non-null  int64 
 7   user_type        25760 non-null  int64 
 8   user_birth_year  25760 non-null  int64 
 9   user_gender      25760 non-null  object
dtypes: int64(6), object(4)
memory usage: 2.0+ MB
None
count    25760.000000
mean         2.008385
std          0.704541
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: user_type, dtype: float64
count     25760
unique        3
top           2
freq      12972
Name: user_type_cat, dty

In [6]:
# Strip duration of minutes
ride_sharing['duration_trim'] = ride_sharing['duration'].str.strip(' minutes')

# Convert duration to integer
ride_sharing['duration_time'] = ride_sharing['duration_trim'].astype('int')

# Write an assert statement making sure of conversion
assert ride_sharing['duration_time'].dtype == 'int'

# Print formed columns and calculate average ride duration 
display(ride_sharing[['duration','duration_trim','duration_time']])
print(ride_sharing['duration_time'].mean())

Unnamed: 0,duration,duration_trim,duration_time
0,12 minutes,12,12
1,24 minutes,24,24
2,8 minutes,8,8
3,4 minutes,4,4
4,11 minutes,11,11
...,...,...,...
25755,11 minutes,11,11
25756,10 minutes,10,10
25757,14 minutes,14,14
25758,14 minutes,14,14


11.389052795031056


## Data Range Constraints

In [24]:
import datetime as dtt
banking = pd.read_csv(r'.\data\banking_dirty.csv', index_col=0)
banking.head(3)

Unnamed: 0,cust_id,birth_date,Age,acct_amount,inv_amount,fund_A,fund_B,fund_C,fund_D,account_opened,last_transaction
0,870A9281,1962-06-09,58,63523.31,51295,30105.0,4138.0,1420.0,15632.0,02-09-18,22-02-19
1,166B05B0,1962-12-16,58,38175.46,15050,4995.0,938.0,6696.0,2421.0,28-02-19,31-10-18
2,BFC13E88,1990-09-12,34,59863.77,24567,10323.0,4590.0,8469.0,1185.0,25-04-18,02-04-18


In [13]:
banking.dtypes

Unnamed: 0            int64
cust_id              object
birth_date           object
Age                   int64
acct_amount         float64
inv_amount            int64
fund_A              float64
fund_B              float64
fund_C              float64
fund_D              float64
account_opened       object
last_transaction     object
dtype: object

In [21]:
# Convert tire_sizes to integer
banking['birth_dt'] = pd.to_datetime(banking['birth_date']).dt.date

today = dtt.date.today()

# Set all values above 27 to 27
banking.loc[banking['birth_dt'] > today, 'birth_dt'] = today


# Print tire size description
display(banking[['birth_date', 'birth_dt']])

Unnamed: 0,birth_date,birth_dt
0,1962-06-09,1962-06-09
1,1962-12-16,1962-12-16
2,1990-09-12,1990-09-12
3,1985-11-03,1985-11-03
4,1990-05-17,1990-05-17
...,...,...
95,1974-08-10,1974-08-10
96,1989-12-12,1989-12-12
97,1984-11-29,1984-11-29
98,1969-12-14,1969-12-14


## Uniqueness constraints

In [27]:
ride_sharing.head()

Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender
0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male
1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male
2,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3652,3,1993,Male
3,4 minutes,16,Steuart St at Market St,28,The Embarcadero at Bryant St,1883,1,1979,Male
4,11 minutes,22,Howard St at Beale St,350,8th St at Brannan St,4626,2,1994,Male


In [22]:
# Find duplicates
duplicates = ride_sharing.duplicated(subset=['ride_id'], keep=False)

# Sort your duplicated rides
duplicated_rides = ride_sharing[duplicates].sort_values(by='ride_id')

# Print relevant columns of duplicated_rides
print(duplicated_rides[['ride_id','duration','user_birth_year']])

KeyError: Index(['ride_id'], dtype='object')

## Membership constraints (Text and categorical data problems)

In [29]:
airlines = pd.read_csv(r'./data/airlines_final.csv', index_col=0)
airlines.head()

Unnamed: 0,id,day,airline,destination,dest_region,dest_size,boarding_area,dept_time,wait_min,cleanliness,safety,satisfaction
0,1351,Tuesday,UNITED INTL,KANSAI,Asia,Hub,Gates 91-102,2018-12-31,115.0,Clean,Neutral,Very satisfied
1,373,Friday,ALASKA,SAN JOSE DEL CABO,Canada/Mexico,Small,Gates 50-59,2018-12-31,135.0,Clean,Very safe,Very satisfied
2,2820,Thursday,DELTA,LOS ANGELES,West US,Hub,Gates 40-48,2018-12-31,70.0,Average,Somewhat safe,Neutral
3,1157,Tuesday,SOUTHWEST,LOS ANGELES,West US,Hub,Gates 20-39,2018-12-31,190.0,Clean,Very safe,Somewhat satsified
4,2992,Wednesday,AMERICAN,MIAMI,East US,Hub,Gates 50-59,2018-12-31,559.0,Somewhat clean,Very safe,Somewhat satsified


In [31]:
categories = pd.DataFrame(
    {
      'cleanliness' : ['Clean', 'Average', 'Somewhat clean', 'Somewhat dirty', 'Dirty'],
      'safety' : ['Neutral', 'Very safe', 'Somewhat safe', 'Very unsafe', 'Somewhat unsafe'],
      'satisfaction' : ['Very satisfied', 'Neutral', 'Somewhat satisfied', 'Somewhat unsatisfied', 'Very unsatisfied']
    }
)

In [33]:
# Print categories DataFrame
display(categories)

# Print unique values of survey columns in airlines
print('Cleanliness: ', airlines['cleanliness'].unique(), "\n")
print('Safety: ', airlines['safety'].unique(), "\n")
print('Satisfaction: ', airlines['satisfaction'].unique(), "\n")

Unnamed: 0,cleanliness,safety,satisfaction
0,Clean,Neutral,Very satisfied
1,Average,Very safe,Neutral
2,Somewhat clean,Somewhat safe,Somewhat satisfied
3,Somewhat dirty,Very unsafe,Somewhat unsatisfied
4,Dirty,Somewhat unsafe,Very unsatisfied


Cleanliness:  ['Clean' 'Average' 'Somewhat clean' 'Somewhat dirty' 'Dirty'] 

Safety:  ['Neutral' 'Very safe' 'Somewhat safe' 'Very unsafe' 'Somewhat unsafe'] 

Satisfaction:  ['Very satisfied' 'Neutral' 'Somewhat satsified' 'Somewhat unsatisfied'
 'Very unsatisfied'] 



In [34]:
# Find the cleanliness category in airlines not in categories
cat_clean = set(airlines['cleanliness'].unique()).difference(categories['cleanliness'])

# Find rows with that category
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)

# Print rows with inconsistent category
print(airlines[cat_clean_rows])

Empty DataFrame
Columns: [id, day, airline, destination, dest_region, dest_size, boarding_area, dept_time, wait_min, cleanliness, safety, satisfaction]
Index: []


In [35]:
# Print unique values of both columns
print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())

['Asia' 'Canada/Mexico' 'West US' 'East US' 'Midwest US' 'EAST US'
 'Middle East' 'Europe' 'eur' 'Central/South America'
 'Australia/New Zealand' 'middle east']
['Hub' 'Small' '    Hub' 'Medium' 'Large' 'Hub     ' '    Small'
 'Medium     ' '    Medium' 'Small     ' '    Large' 'Large     ']


In [37]:

# Lower dest_region column and then replace "eur" with "europe"
airlines['dest_region'] = airlines['dest_region'].str.lower()
airlines['dest_region'] = airlines['dest_region'].replace({'eur':'europe'})
airlines['dest_size'] = airlines['dest_size'].str.strip()

print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())

['asia' 'canada/mexico' 'west us' 'east us' 'midwest us' 'middle east'
 'europe' 'central/south america' 'australia/new zealand']
['Hub' 'Small' 'Medium' 'Large']


In [38]:
airlines['wait_min'].unique()

array([ 115.,  135.,   70.,  190.,  559.,  140.,   63.,  215.,  180.,
        540.,  192.,  107.,  155.,  175.,  100.,  225.,   60.,  145.,
        210.,  160.,  604.,  205.,  510.,  270.,  173.,   92.,  125.,
        120.,  900.,  150.,  330.,  110.,  415.,   76.,  165.,  122.,
         90.,   68.,   15.,   65.,  300.,   77.,  185.,   60.,   81.,
        105.,  170.,  245.,  355.,   85.,  177.,  130.,   50.,   32.,
        103.,  240.,  305.,   75.,  235.,   95.,  112.,  142.,   85.,
        265.,   97.,  131.,  514.,   75.,   98.,  166.,   65.,  139.,
        220.,   53.,  153.,  123.,  195.,  335.,  216.,  247.,   45.,
        250.,  182.,  119.,  295.,   55.,  362.,  179.,  260.,  243.,
         89.,   50.,   65.,   80.,  385.,  420.,  390.,  152.,  276.,
        515.,  200.,   82., 1365.,  224.,  350.,  183.,  607.,   55.,
        158.,  231.,  148.,  230.,  132.,  485.,  199.,  585.,  104.,
         87.,  143.,  213.,  334.,   88.,   80.,  211.,   62.,   95.,
        207.,   70.,

In [40]:
import numpy as np
# Create ranges for categories
label_ranges = [0, 60, 180, np.inf]
label_names = ['short', 'medium', 'long']

# Create wait_type column
airlines['wait_type'] = pd.cut(
    airlines['wait_min'],
    bins = label_ranges,
    labels = label_names
    )

# Create mappings and replace
mappings = {'Monday':'weekday', 'Tuesday':'weekday', 'Wednesday': 'weekday', 
            'Thursday': 'weekday', 'Friday': 'weekday', 
            'Saturday': 'weekend', 'Sunday': 'weekend'}

airlines['day_week'] = airlines['day'].replace(mappings)

print(airlines['wait_type'].unique())
print(airlines['day_week'].unique())

['medium', 'long', 'short']
Categories (3, object): ['short' < 'medium' < 'long']
['weekday' 'weekend']


In [41]:
airlines.head()

Unnamed: 0,id,day,airline,destination,dest_region,dest_size,boarding_area,dept_time,wait_min,cleanliness,safety,satisfaction,wait_type,day_week
0,1351,Tuesday,UNITED INTL,KANSAI,asia,Hub,Gates 91-102,2018-12-31,115.0,Clean,Neutral,Very satisfied,medium,weekday
1,373,Friday,ALASKA,SAN JOSE DEL CABO,canada/mexico,Small,Gates 50-59,2018-12-31,135.0,Clean,Very safe,Very satisfied,medium,weekday
2,2820,Thursday,DELTA,LOS ANGELES,west us,Hub,Gates 40-48,2018-12-31,70.0,Average,Somewhat safe,Neutral,medium,weekday
3,1157,Tuesday,SOUTHWEST,LOS ANGELES,west us,Hub,Gates 20-39,2018-12-31,190.0,Clean,Very safe,Somewhat satsified,long,weekday
4,2992,Wednesday,AMERICAN,MIAMI,east us,Hub,Gates 50-59,2018-12-31,559.0,Somewhat clean,Very safe,Somewhat satsified,long,weekday


In [43]:
airlines['full_name'].str.contains('Ms.|Mr.|Miss|Dr.').any() == True

KeyError: 'full_name'

In [None]:
# Replace "Dr." with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace("Dr.","")

# Replace "Mr." with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace('Mr.','')

# Replace "Miss" with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace('Miss','')

# Replace "Ms." with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace('Ms.','')

# Assert that full_name has no honorifics
assert airlines['full_name'].str.contains('Ms.|Mr.|Miss|Dr.').any() == False

In [50]:
def palindromes(start, num=3):
  lst = []
  while num>0:
    start += 1
    if str(start) == str(start)[::-1]:
      lst.append(start)
      num -= 1
  return lst

df = pd.DataFrame({'number':[103, 367, 818, 3045, 13672, 237847, 8304747, 39965382, 734203527]})
df['palindromes'] = df['number'].apply(lambda x: palindromes(x))

Unnamed: 0,number
0,103
1,367
2,818
3,3045
4,13672
5,237847
6,8304747
7,39965382
8,734203527


In [51]:
df['palindromes'] = df['number'].apply(lambda x: palindromes(x))

In [52]:
df

Unnamed: 0,number,palindromes
0,103,"[111, 121, 131]"
1,367,"[373, 383, 393]"
2,818,"[828, 838, 848]"
3,3045,"[3113, 3223, 3333]"
4,13672,"[13731, 13831, 13931]"
5,237847,"[238832, 239932, 240042]"
6,8304747,"[8305038, 8306038, 8307038]"
7,39965382,"[39966993, 39977993, 39988993]"
8,734203527,"[734212437, 734222437, 734232437]"


## Advanced data problems

In [53]:
banking.head()

Unnamed: 0,cust_id,birth_date,Age,acct_amount,inv_amount,fund_A,fund_B,fund_C,fund_D,account_opened,last_transaction
0,870A9281,1962-06-09,58,63523.31,51295,30105.0,4138.0,1420.0,15632.0,02-09-18,22-02-19
1,166B05B0,1962-12-16,58,38175.46,15050,4995.0,938.0,6696.0,2421.0,28-02-19,31-10-18
2,BFC13E88,1990-09-12,34,59863.77,24567,10323.0,4590.0,8469.0,1185.0,25-04-18,02-04-18
3,F2158F66,1985-11-03,35,84132.1,23712,3908.0,492.0,6482.0,12830.0,07-11-17,08-11-18
4,7A73F334,1990-05-17,30,120512.0,93230,12158.4,51281.0,13434.0,18383.0,14-05-18,19-07-18


In [None]:
# Find values of acct_cur that are equal to 'euro'
acct_eu = banking['acct_cur'] == 'euro'

# Convert acct_amount where it is in euro to dollars
banking.loc[acct_eu, 'acct_amount'] = banking.loc[acct_eu, 'acct_amount'] * 1.1

# Unify acct_cur column by changing 'euro' values to 'dollar'
banking.loc[acct_eu, 'acct_cur'] = 'dollar'

# Assert that only dollar currency remains
assert banking['acct_cur'].unique() == 'dollar'

In [58]:
pd.to_datetime(banking['account_opened']).dt.strftime

  pd.to_datetime(banking['account_opened'])


0    2018-02-09
1    2019-02-28
2    2018-04-25
3    2017-07-11
4    2018-05-14
        ...    
95   2018-05-26
96   2017-04-05
97   2017-08-16
98   2017-09-10
99   2017-01-08
Name: account_opened, Length: 100, dtype: datetime64[ns]

## Comparing strings

In [60]:
pip install thefuzz

Defaulting to user installation because normal site-packages is not writeable
Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.9.6-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.9.6-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 11.1 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.9.6 thefuzz-0.22.1
Note: you may need to restart the kernel to use updated packages.


In [73]:
pip install fuzzywuzzy

Defaulting to user installation because normal site-packages is not writeable
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [67]:
import thefuzz as thf

In [68]:
restaurants = pd.read_csv(r'./data/restaurants_L2_dirty.csv')
restaurants.head(2)

Unnamed: 0.1,Unnamed: 0,name,addr,city,phone,type
0,0,kokomo,6333 w. third st.,la,2139330773,american
1,1,feenix,8358 sunset blvd. west,hollywood,2138486677,american


In [71]:
cities = restaurants['city'].unique()

In [110]:
pip install thefuzz

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [111]:
from thefuzz import process

In [None]:
process.extract

In [109]:
pip install recordlinkage

Defaulting to user installation because normal site-packages is not writeable
Collecting recordlinkage
  Downloading recordlinkage-0.16-py3-none-any.whl.metadata (8.1 kB)
Collecting jellyfish>=1 (from recordlinkage)
  Downloading jellyfish-1.1.0-cp312-none-win_amd64.whl.metadata (2.6 kB)
Collecting scipy>=1 (from recordlinkage)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn>=1 (from recordlinkage)
  Downloading scikit_learn-1.5.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting joblib (from recordlinkage)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1->recordlinkage)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading recordlinkage-0.16-py3-none-any.whl (926 kB)
   ---------------------------------------- 0.0/926.9 kB ? eta -:--:--
   ---------------------------------------- 926.9/926.9 kB 7.1 MB/s eta 0:00:00
Downloading jellyfish-1.1.0-c