Load libraries and data set.

In [1]:
# modules we'll use
import pandas as pd
import numpy as np

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# location of the raw CSV file to clean
DATA_PATH = "~/Desktop/Kaggle/practice/Data_Cleaning/pakistan_intellectual_capital.csv"

# fix NumPy's global random state so any random step is repeatable
np.random.seed(0)

# load the whole data set into a DataFrame
professors = pd.read_csv(DATA_PATH)

Normalise the "Country" column, then define a function that overwrites every value in a given column of a DataFrame with a target string whenever that value's fuzzy-match score against the target is at or above a given minimum ratio.

In [2]:
# normalise country names: lower-case them, then trim leading/trailing whitespace
professors['Country'] = professors['Country'].str.lower().str.strip()

# score every distinct country name against "south korea" and keep the ten best
countries = professors['Country'].unique()
matches = fuzzywuzzy.process.extract(
    "south korea",
    countries,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    limit=10,
)

def replace_matches_in_column(df, column, string_to_match, min_ratio = 47, limit = 10):
    """Overwrite near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The distinct non-missing values of the column are scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``; every row whose
    value scores at least ``min_ratio`` is set to ``string_to_match``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column : str
        Name of the column whose values are compared and replaced.
    string_to_match : str
        Target spelling; also the value written into matching rows.
    min_ratio : int, default 47
        Minimum score (inclusive) a value needs to be replaced.
    limit : int or None, default 10
        Maximum number of best-scoring candidates considered. Values outside
        the top ``limit`` are never replaced, even if they score above
        ``min_ratio``; pass ``None`` to consider every distinct value.

    Returns
    -------
    None
        Prints "All done!" when finished.
    """
    # distinct candidate strings; missing values are dropped because the
    # matcher expects strings, and NaN rows are never replacement targets
    candidates = df[column].dropna().unique()

    # best-scoring candidates as (value, score) pairs, highest score first
    scored = fuzzywuzzy.process.extract(string_to_match, candidates,
                                        limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep only the values whose score reaches the threshold (inclusive)
    close_matches = [match[0] for match in scored if match[1] >= min_ratio]

    # boolean mask of the rows holding one of the close matches
    rows_with_matches = df[column].isin(close_matches)

    # overwrite those rows with the canonical spelling
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")
    
# collapse close spellings of "south korea" using the function's default
# threshold, then refresh the list of distinct country names
replace_matches_in_column(professors, 'Country', "south korea")
countries = pd.unique(professors['Country'])

All done!


# 1) Examine another column

Write code below to take a look at all the unique values in the "Graduated from" column.

In [3]:
# distinct "Graduated from" values, sorted alphabetically
grad_from = np.sort(professors['Graduated from'].unique())
print(grad_from)

[' Columbia University' ' Delft University of Technology'
 ' Iowa State University' ' University of Central Florida'
 ' University of Innsbruck' ' University of Texas at Arlington (UTA)'
 ' University of Turin' 'Abasyn University'
 'Abdul Wali Khan University, Mardan'
 'Abdus Salam School of Mathematical Sciences,GC University'
 'Agricultural University Peshawar' 'Allama Iqbal Open University'
 'Asian Institute of Technology' 'Aston University, Birmingham'
 'Australian National University, Caneberra' 'BUKC'
 'Bahauddin Zakariya University' 'Bahria University'
 'Bahria University,Islamabad'
 'Balochistan University of Information Technology, Engineering and Management Sciences'
 'Barani Institute of Information Technology'
 'Beaconhouse National University' 'Beihang University'
 'Beijing Institute of Technology'
 'Beijing Institute of Technology Beijing'
 'Beijing University of Posts & Telecommunications'
 'Biztek Institute Of Business & Technology,Karachi'
 'Blekinge Institute of Techn

Do you notice any inconsistencies in the data?  Can any of the inconsistencies in the data be fixed by removing white spaces at the beginning and end of cells?

- There are inconsistencies that can be fixed by removing white spaces at the beginning and end of cells. For instance, "University of Central Florida" and " University of Central Florida" both appear in the column.

# 2) Do some text pre-processing

Strip the white space at the beginning and end of every entry in the "Graduated from" column of the `professors` DataFrame.

In [4]:
# trim leading/trailing whitespace so entries that differ only by padding
# (e.g. ' University of Central Florida') become a single value
professors['Graduated from'] = professors['Graduated from'].str.strip()

# 3) Continue working with countries

Take another look at the "Country" column and see if there's any more data cleaning we need to do.


In [5]:
# distinct values of the 'Country' column, sorted alphabetically
countries = np.sort(professors['Country'].unique())
countries

array(['australia', 'austria', 'canada', 'china', 'finland', 'france',
       'germany', 'greece', 'hongkong', 'ireland', 'italy', 'japan',
       'macau', 'malaysia', 'mauritius', 'netherland', 'new zealand',
       'norway', 'pakistan', 'portugal', 'russian federation',
       'saudi arabia', 'scotland', 'singapore', 'south korea', 'spain',
       'sweden', 'thailand', 'turkey', 'uk', 'urbana', 'usa', 'usofa'],
      dtype=object)

Take another look at the "Country" column and see if there's any more data cleaning we need to do.

It looks like 'usa' and 'usofa' should be the same country.  Correct the "Country" column in the dataframe so that 'usofa' appears instead as 'usa'.

In [6]:
# rank the distinct country names by similarity to "usa" and show the ten best
matches = fuzzywuzzy.process.extract(
    "usa",
    countries,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    limit=10,
)
print(matches)

[('usa', 100), ('usofa', 75), ('austria', 60), ('australia', 50), ('spain', 50), ('urbana', 44), ('uk', 40), ('malaysia', 36), ('pakistan', 36), ('portugal', 36)]
All done!


In [7]:
# fold 'usofa' into 'usa'; a threshold of 70 sits between the score of
# 'usofa' (75) and that of the next candidate, 'austria' (60)
replace_matches_in_column(professors, 'Country', "usa", min_ratio=70)

All done!
