# Cleaning string data
Data cleaning is the process of fixing or removing incorrect, corrupted, incorrectly formatted, duplicate, or incomplete data within a dataset.
For strings, this process starts by applying a list of common functions (lower, strip), then handles missing and duplicate entries, and finally applies custom functions as needed.

In [69]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Set max rows and columns displayed in jupyter
# pd.set_option("display.max_rows", 100)
# pd.set_option("display.max_columns", 20)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load a dataset

In [70]:
FILE_RAW = '../projects/proj1/data/Ask A Manager Salary Survey 2021 (Responses) - Form Responses 1.csv'
FILE_FEATHER = '../projects/proj1/data/AskAManagerSalarySurvey2021-Form Responses1.feather'


# FILE_RAW = '../projects/proj1/data/AAMFR.csv'
# FILE_FEATHER = '../projects/proj1/data/AAMFR.feather'

In [71]:
df=pd.read_csv(FILE_RAW)
df.head()
print(df.shape)

Unnamed: 0,Timestamp,How old are you?,Industry,Job title,Additional context on job title,Annual salary,Other monetary comp,Currency,Currency - other,Additional context on income,Country,State,City,Overall years of professional experience,Years of experience in field,Highest level of education completed,Gender,Race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000,0.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600,4000.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000,,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000,3000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000,7000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


(27609, 18)


In [72]:
# what type is country?  if object then it's a string
df.dtypes

Timestamp                                    object
How old are you?                             object
Industry                                     object
Job title                                    object
Additional context on job title              object
Annual salary                                object
Other monetary comp                         float64
Currency                                     object
Currency - other                             object
Additional context on income                 object
Country                                      object
State                                        object
City                                         object
Overall years of professional experience     object
Years of experience in field                 object
Highest level of education completed         object
Gender                                       object
Race                                         object
dtype: object

In [73]:
x=df['Country'].unique()
x.sort()
x

array([' New Zealand', ' U.S.', ' US', ' United States',
       '$2,175.84/year is deducted for benefits',
       'ARGENTINA BUT MY ORG IS IN THAILAND', 'Afghanistan', 'Africa',
       'America', 'Aotearoa New Zealand', 'Argentina', 'Australi',
       'Australia', 'Australia ', 'Australian ', 'Austria',
       'Austria, but I work remotely for a Dutch/British company',
       'Bangladesh', 'Bangladesh ', 'Belgium', 'Belgium ', 'Bermuda',
       'Brasil', 'Brazil', 'Brazil ', 'Britain ', 'Bulgaria', 'CANADA',
       'CANADA ', 'California ', 'Cambodia', 'Can', 'Canad', 'Canada',
       'Canada ', 'Canada and USA', 'Canada, Ottawa, ontario', 'Canadw',
       'Canadá', 'Canda', 'Catalonia', 'Cayman Islands', 'Chile', 'China',
       'Colombia', 'Company in Germany. I work from Pakistan.', 'Congo',
       'Contracts', 'Costa Rica', "Cote d'Ivoire", 'Croatia ', 'Csnada',
       'Cuba', 'Currently finance', 'Cyprus', 'Czech Republic',
       'Czech Republic ', 'Czech republic', 'Czechia', 'D

In [74]:
df['Country'].nunique()


364

In [75]:
df['Country'].value_counts()

Country
United States                                                    8844
USA                                                              7847
US                                                               2572
Canada                                                           1549
United States                                                     652
                                                                 ... 
Can                                                                 1
Sri Lanka                                                           1
Worldwide (based in US but short term trips aroudn the world)       1
Danmark                                                             1
U.S.A                                                               1
Name: count, Length: 364, dtype: int64

## Let's handle missing Country data first to ensure the algorithms below run correctly.  
There are lots of ways to do this; I'm going to do it the simple way and replace NaN with UNKNOWN.

This is not a good idea in general, especially if you have a way to figure out the country from the other data present

In [76]:
#how many missing Countries
df.Country.isnull().sum()

0

In [8]:
# None missing. If there were, you could fill them with a sentinel value for now,
# like UNKNOWN, makes it easy to find and deal with later, also ensures that
# calculations do not fail because of NaNs
df.Country = df.Country.fillna('UNKNOWN')

In [None]:
(df.Country=='UNKNOWN').sum()

## There is a Country column, lets use it to get all the USA entries.  Take a look at the number of unique entries

In [77]:
#a small function to see the number of unique country names
def p_unique(df=None, col='Country'):
    """
    Print how many unique values a column currently holds.

    df:  DataFrame to inspect.  When omitted, the notebook-level `df` is looked
         up at call time, so the report always reflects the current `df` even
         after it has been re-read or re-assigned (a default of `df=df` would
         freeze whatever object existed when this cell was run).
    col: name of the column whose unique entries are counted
    """
    if df is None:
        # resolve the global lazily instead of capturing it in the signature
        df = globals()['df']
    print(f'There are now {df[col].nunique()} unique {col} entries')
p_unique()

There are now 364 unique Country entries


In [78]:
#how many different countries are there
start_with_this_many__unique_countries = df.Country.nunique()
p_unique()

There are now 364 unique Country entries


## How many occurrences for each unique entry?

In [79]:
#lets see what we have
vc=df.Country.value_counts()
print(f'There are {len(vc)} unique entries')
# vc[-150:]
vc[:100]
# vc[50:100]

There are 364 unique entries


Country
United States              8844
USA                        7847
US                         2572
Canada                     1549
United States               652
                           ... 
New zealand                   4
United State of America       4
U. S.                         4
united States                 4
Great Britain                 3
Name: count, Length: 100, dtype: int64

## It looks like there was no filtering on what a user could enter in the Country field. Anyway, let's get to it

## Apply lower and strip to get the easy gains

In [None]:
df.Country.unique()

In [80]:
df.Country = df.Country.map(str.lower).map(str.strip)

p_unique()

There are now 250 unique Country entries


In [81]:
df.Country.value_counts()

Country
united states          9842
usa                    8956
us                     2734
canada                 1656
uk                      683
                       ... 
mainland china            1
uk for u.s. company       1
canad                     1
cambodia                  1
canadá                    1
Name: count, Length: 250, dtype: int64

## If there is punctuation, get rid of it all
Use regular expressions

In [15]:
import re

In [82]:
#the regular expressions package
import re
# Character class listing every ASCII punctuation character to get rid of.
# It is a raw string so Python leaves the backslashes alone, and '-', '[', '\'
# and ']' are escaped so the regex engine reads each of them as a literal
# character (no accidental ',-.' range, and backslash itself is now removed too).
punc = r'''[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_{|}~`]'''
_punc_re = re.compile(punc)  # compile once instead of once per mapped value

#or a function
def fun1(x):
    """
    Strip punctuation from a single string.

    x: the string to clean (one value handed over by Series.map)
    returns: x with every character matched by punc removed
    """
    return _punc_re.sub('', x)

df.Country = df.Country.map(fun1)

#can do the same thing with a lambda
# df.Country = df.Country.map(lambda x: re.sub(punc, '', x))

p_unique()

There are now 236 unique Country entries


In [83]:
#lets see all unique country names that occur at least n times
ds=df.Country.value_counts()
ds
#filter 1 offs
ds[ds>10]

Country
united states        9842
usa                  9014
us                   3349
canada               1656
uk                    695
                     ... 
uk for us company       1
canad                   1
cambodia                1
eritrea                 1
canadá                  1
Name: count, Length: 236, dtype: int64

Country
united states               9842
usa                         9014
us                          3349
canada                      1656
uk                           695
united kingdom               626
united states of america     481
australia                    382
germany                      190
england                      168
ireland                      124
new zealand                  121
france                        66
netherlands                   54
scotland                      45
spain                         44
sweden                        41
switzerland                   38
belgium                       34
the netherlands               31
japan                         29
denmark                       22
america                       21
united state                  19
south africa                  17
unites states                 17
austria                       16
finland                       16
india                         14
singapore                     14
no

In [84]:
#longest index
max([len(val) for val in ds.index])


205

In [85]:
longest_index = max(ds.index, key=len)
print(longest_index)

we dont get raises we get quarterly bonuses but they periodically asses income in the area you work so i got a raise because a 3rd party assessment showed i was paid too little for the area we were located


In [86]:
#sort by index if you want 
ds1=ds.sort_index(ascending=True)
ds1[:50]  #if you do them all then the really long index entry will knacker the output
# ds1

#turn index into column, turns series into dataframe
# can access Country now like any other column
df1=ds.reset_index()
df1
df1.sort_values(by=['Country'])
df1.sort_values(by=['count'],ascending=False)

Country
217584year is deducted for benefits                           1
afghanistan                                                   1
africa                                                        1
america                                                      21
aotearoa new zealand                                          1
argentina                                                     5
argentina but my org is in thailand                           1
australi                                                      1
australia                                                   382
australian                                                    1
austria                                                      16
austria but i work remotely for a dutchbritish company        1
bangladesh                                                    2
belgium                                                      34
bermuda                                                       2
bonus based on meeting yearly go

Unnamed: 0,Country,count
0,united states,9842
1,usa,9014
2,us,3349
3,canada,1656
4,uk,695
...,...,...
231,uk for us company,1
232,canad,1
233,cambodia,1
234,eritrea,1


Unnamed: 0,Country,count
199,217584year is deducted for benefits,1
222,afghanistan,1
142,africa,1
22,america,21
99,aotearoa new zealand,1
...,...,...
158,we dont get raises we get quarterly bonuses bu...,1
190,worldwide based in us but short term trips aro...,1
138,y,1
85,zimbabwe,2


Unnamed: 0,Country,count
0,united states,9842
1,usa,9014
2,us,3349
3,canada,1656
4,uk,695
...,...,...
141,uk but for globally fully remote company,1
142,africa,1
143,morocco,1
144,australian,1


## Looks like a lot of variations of 'united state'

Another easy gain, lets replace all strings with 'united state' in them with 'usa' 

In [21]:
def fun(x):
    """
    Collapse any spelling that contains 'united state' into 'usa'.

    x: a single string value
    returns: 'usa' when the substring 'united state' occurs in x, otherwise x unchanged
    NOTE: both the search text and the replacement are baked in here, which is
    what the closure version further down generalises.
    """
    return 'usa' if 'united state' in x else x

#dont do this yet there is a better more general way below
# df.Country = df.Country.map(fun);

## But you might want to do something similar with other strings, do you write another function?  Or do something a little more general?
Be general, always ... use a python closure.

## Closures
The problem we face is that map takes a function that takes 1 parameter, and we want it to take 3: the string value passed by map (call it x), the string to search for in x (call it str_to_find), and the string to replace x with if we find str_to_find in x (call it str_replacement).

We can't get around the fact that map only passes 1 parameter to the function.  But we can create a function that already knows what str_to_find and str_replacement are.  It's called a closure.

In [87]:
def fun1(str_to_find, str_replacement):
    """
    Build a substring-based find-and-replace function suitable for Series.map.

    str_to_find:     text to look for anywhere inside a value
    str_replacement: what the whole value becomes when str_to_find occurs in it
    returns: findandreplace, a one-argument closure that remembers both settings
    """
    def findandreplace(x):
        # a hit swaps out the entire value, not just the matching part
        return str_replacement if str_to_find in x else x
    # functions are ordinary objects in python, so the inner function can be
    # handed back to the caller with str_to_find and str_replacement still attached;
    # the caller only has to supply the string to search through, x
    return findandreplace

#using the closure
# fn= fun1('us', 'usa')   #Uh Ohh, what about Australia? Or Austria or Belarus or Cyprus or Russia
                        #careful with this sort of thing!
# df.Country = df.Country.map(fn)

In [23]:
#using the closure one at a time

# Apply the substring closures one spelling at a time.  fun1 here is the
# two-argument closure factory (str_to_find, str_replacement); each pass rewrites
# every Country value that contains the given text to 'usa'.
fn= fun1('usa', 'usa')
df.Country = df.Country.map(fn)

# fn= fun1('us', 'usa')#Uh Ohh, what about Australia? Or Austria or Belarus or Cyprus or Russia
                        #careful with this sort of thing!
# df.Country = df.Country.map(fn)   # disabled together with the line above; left
                                    # live it would just re-apply the previous fn

fn= fun1('u s', 'usa')
df.Country = df.Country.map(fn)

fn= fun1('unites states', 'usa')  #its a bit suspicious that 17 people made this mistake
df.Country = df.Country.map(fn)

fn= fun1('united sates', 'usa')
df.Country = df.Country.map(fn)

fn= fun1('unitedstates', 'usa')
df.Country = df.Country.map(fn)

fn= fun1('united stares', 'usa')
df.Country = df.Country.map(fn)
#and so on

In [88]:
p_unique()

There are now 236 unique Country entries


In [89]:
#OR you can simplify the above with a list of str_to_find
#and just iterate over it
vals=['usa',  'u s', 'unites states', 'united sates', 'unitedstates', 'united stares', 'united stat', 'america']
for val in vals:
    fn= fun1(val, 'usa')
    df.Country = df.Country.map(fn)
p_unique()

There are now 199 unique Country entries


## Once you get down to the bottom of the unique values you will probably get a lot of one offs
For instance, let's see what the values are



In [90]:
# vals=df.Country.unique()
# vals.sort()
# vals
df.Country.value_counts()[:50]

Country
usa                       19466
us                         3349
canada                     1656
uk                          695
united kingdom              626
australia                   382
germany                     190
england                     168
ireland                     124
new zealand                 121
france                       66
netherlands                  54
scotland                     45
spain                        44
sweden                       41
switzerland                  38
belgium                      34
the netherlands              31
japan                        29
denmark                      22
south africa                 17
finland                      16
austria                      16
israel                       14
italy                        14
singapore                    14
india                        14
norway                       14
malaysia                     13
brazil                       11
philippines                   8


## Notice there are a lot of 'united states'-ish entries at the bottom; let's see if fuzzywuzzy helps

In [27]:
fn= fun1('usa', 'usa')
df.Country = df.Country.map(fn)

In [None]:
# [val for val in df.Country if 'us' in val]
ds2=pd.Series([val for val in df.Country if 'us' in val])
ds2.value_counts()

In [91]:
def fun2(str_to_find, str_replacement):
    """
    Build an exact-match find-and-replace function suitable for Series.map.

    Unlike the substring version, the returned function only swaps a value
    that is equal to str_to_find as a whole.

    str_to_find:     the complete value to look for
    str_replacement: what that value is replaced with
    returns: findandreplace, a one-argument closure that remembers both settings
    """
    def findandreplace(x):
        # anything that is not an exact match passes through untouched
        return str_replacement if str_to_find == x else x
    # functions are ordinary objects in python, so the inner function can be
    # handed back to the caller with str_to_find and str_replacement still attached
    return findandreplace


In [92]:
for val in ['us','the us','us govt employee overseas country withheld','we dont get raises we get quarterly bonuses but they periodically asses income in the area you work so i got a raise because a 3rd party assessment showed i was paid too little for the area we were located']:
    fn= fun2(val, 'usa')
    df.Country = df.Country.map(fn)

In [93]:
p_unique()

There are now 195 unique Country entries


In [32]:
# this package lives in the conda forge
# !conda install -c conda-forge fuzzywuzzy -y

In [33]:
# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process

In [None]:
fuzzywuzzy.process.extract??

In [None]:
#looking for strings similar to 'unit'; each entry of the returned list is a tuple:
# first is the match, second is the score, third is the index label of that row in the dataframe
matches = fuzzywuzzy.process.extract("unit", df.Country, limit=150)
matches

In [None]:
#get the first match to prove that the third tuple element points back at the dataframe row.
#fuzzywuzzy hands back the *key* of the matching choice, which for a Series is the
#index label, so look it up by label (.loc) rather than by position (.iloc)
df.loc[matches[0][2], 'Country']

In [None]:
#they look pretty good, except for the united kingdom
#lets get all the unique matches in matches
#first all matches
# take the matched string (first tuple element) of every hit;
# building a set straight away keeps only the unique values
l = {mtch[0] for mtch in matches}
print(l)

In [None]:
#values picked out by eye that must NOT be turned into 'usa'
dont_sub_these=['united kingdom england', 'united arab emirates',  'united kingdom', 'united kindom']

#drop them from l in place; entries that are not in l are simply ignored
l.difference_update(dont_sub_these)
print(l)


In [None]:

#now call the above closure
for v in l:
    fn= fun1(v, 'usa')
    df.Country = df.Country.map(fn)

p_unique()

## Add a new column to track the changed columns
<mark> HOMEWORK: A better way would be to rename 'Country' to 'Country_original' and add a new column 'Country_computed' so we lose no data. <br>
    For this you need access to 2 columns in the dataframe, so map will not work; you should use apply.</mark>

In [None]:
df.head()

In [None]:
%%time
# will receive a row of data and return a single value
def fun2(row):
    """
    Tell whether a row has already been normalised to 'usa'.

    row: one full row of the dataframe, as handed over by df.apply(..., axis=1)
    returns: True when row.Country equals 'usa', otherwise False
    NOTE: this re-uses the name fun2, so it replaces the exact-match closure
    factory defined earlier in the notebook.
    """
    #the whole row is available here, but only Country is consulted
    return bool(row.Country == 'usa')


#record whether usa has been found yet or not
df['found_in_db'] = df.apply(fun2, axis=1)

In [None]:
%%time
#do it all with map, much faster
df['found_in_db1'] = df.Country.map(lambda x: True if x=='usa' else False)

In [None]:
#verify the same results
print((df.found_in_db1 == True).sum())
print((df.found_in_db1 == False).sum())

print((df.found_in_db == True).sum())
print((df.found_in_db == False).sum())

# Lets see if we can do this for other countries.  It will be easier if we have a package that has a list of countries.

<a href="https://pypi.org/project/pycountry/">pycountry</a> is perfect.  It has a list of countries and a fuzzy lookup API. 


In [44]:
# !conda install -c conda-forge pycountry -y

In [45]:
import pycountry

### Demonstrate pycountry: the following pycountry call returns a country record (with attributes such as `alpha_3`) if the country is found; otherwise it raises a LookupError

In [None]:
pycountry.countries.lookup('de')
pycountry.countries.lookup('germany')
pycountry.countries.lookup('usa').alpha_3
pycountry.countries.lookup('United States of America')
# pycountry.countries.lookup('united sates') #fails

In [None]:
set([val[0] for val in fuzzywuzzy.process.extract("unitee", df.Country, limit=150)])

In [48]:
# for val in ['uniited states', 'unite states']:
#     fn=fun2(val, 'usa')
#     df.Country = df.Country.map(fn)


In [None]:
#lets try a fuzzy search to see if we get a few more
pycountry.countries.search_fuzzy('united')

## Lets apply this to the entire DataFrame

In [50]:
from tqdm.auto import tqdm  #tqdm gives you the neat little status bar, useful for very long running operations
tqdm.pandas()

In [None]:
def fix(df, i, cont, search_type):
    '''
    Record a successful pycountry hit on one row of the dataframe (in place).

    df: dataframe holding the 'Country' and 'found_in_db' columns
    i: index label of the row to update (it is used with df.loc)
    cont: pycountry hit for that row; only its alpha_3 attribute is read
    search_type: label of the pycountry search that produced the hit,
                 currently only of use for the debug print below
    '''
    # print(f' {search_type} {cont.alpha_3} for {df.loc[i, "Country"]}')
    found_cell = (i, 'found_in_db')
    country_cell = (i, 'Country')
    df.loc[found_cell] = True            #flag the row as found
    df.loc[country_cell] = cont.alpha_3  #store the 3 letter alpha code instead of the free text
def find_countries(df):
    """
    Replace free-text Country values with pycountry alpha_3 codes, in place.

    Each distinct Country string is resolved once: first with the exact
    pycountry.countries.lookup(), and only if that raises LookupError with the
    looser pycountry.countries.search_fuzzy(), whose first hit is taken.
    Every row whose value resolves is handed to fix(), which updates
    'Country' and 'found_in_db'; rows that do not resolve are left untouched.

    df: dataframe with a 'Country' column (modified in place)
    returns: the same df object
    """
    def _resolve(name):
        """Return (pycountry hit, search label) for name, or (None, None) if nothing matches."""
        try:
            #try the most accurate lookup first
            return pycountry.countries.lookup(name), "Accurate swap"
        except LookupError:
            pass
        try:
            #lookup above failed, try fuzzy, you may get many results
            #SHOULD CHECK CONFIDENCE, THIS CODE DOES NOT (it just takes the first hit)
            return pycountry.countries.search_fuzzy(name)[0], "Fuzzy swap"
        except LookupError:
            return None, None

    #the same spelling occurs on many rows, so remember the answer per distinct
    #string instead of repeating the pycountry search for every row
    resolved = {}

    #walk over (index label, value) pairs of a snapshot of the column: the labels
    #are what fix() needs for df.loc, whatever index the dataframe has, and the
    #snapshot is unaffected by the writes fix() makes while we iterate
    rows = df['Country'].copy().items()
    for i, name in tqdm(rows, total=len(df)):  # for every row
        if not isinstance(name, str):
            continue  #NaN or other non-text values cannot be looked up, skip them
        if name not in resolved:
            resolved[name] = _resolve(name)
        cont, search_type = resolved[name]
        if cont is not None:
            fix(df, i, cont, search_type)
    return df


df = find_countries(df)

In [None]:
df.head()

In [None]:
df.Country.value_counts()
p_unique()

## Final Tally

In [None]:
numb_found = df.found_in_db.sum()
print(f' Number countries identified {numb_found}')
print(f' Number countries unidentified {len(df)-numb_found}')
print(f' Number unique countries {df.Country.nunique()}')

In [None]:
df[df['found_in_db']==False].Country.value_counts()

In [None]:
pycountry.countries.search_fuzzy('United Kingdom')

## Lets see what the top 10 countries are

In [None]:
df_tt=df.groupby('Country').count().sort_values(by='found_in_db',ascending=False)
df_tt.found_in_db

In [None]:

df_tmp=df_tt.iloc[:10,:]
sns.barplot(data=df_tmp,x=df_tmp.index, y='found_in_db');

## When finished, save the processed data for further evaluation

In [58]:
# save to feather format
df.to_feather(FILE_FEATHER)

## testing

In [None]:
text="A big fish in a small pond"
STOPWORDS=['in','a']
" ".join([word for word in str(text).split() if word not in STOPWORDS])