# CIV1498 - Introduction to Data Science
## Lecture 3.3 - Cleaning Data

## Setup Notebook

In [1]:
# Import 3rd party libraries
import chardet
import pandas as pd

## 1. Character Encoding

In [2]:
# '¥' cannot be represented in ASCII, so errors='replace' swaps it for '?' while encoding.
text = 'This item costs 10.45¥'
ascii_encoded = text.encode(encoding='ASCII', errors='replace')
ascii_decoded = str(ascii_encoded, encoding='ASCII', errors='replace')
print(ascii_decoded)

This item costs 10.45?


In [3]:
# UTF-8 can represent '¥', so the text comes back unchanged from the encode/decode round trip.
text = 'This item costs 10.45¥'
utf8_encoded = text.encode(encoding='UTF-8', errors='replace')
utf8_decoded = str(utf8_encoded, encoding='UTF-8', errors='replace')
print(utf8_decoded)

This item costs 10.45¥


In [4]:
# Reading the file with the default 'utf-8' codec fails on this data. The failure is the
# point of this cell, so catch it and print it instead of letting an uncaught traceback
# stop a "Restart & Run All".
try:
    data = pd.read_csv('sample_data.csv')
except UnicodeDecodeError as decode_error:
    print(f'{type(decode_error).__name__}: {decode_error}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 7955: invalid start byte

In [5]:
# Open in binary mode so nothing is decoded yet, and grab at most the first 100000 bytes.
with open('sample_data.csv', mode='rb') as raw_data:
    leading_bytes = raw_data.read(100000)

# Let chardet guess the encoding from that sample of raw bytes.
result = chardet.detect(leading_bytes)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [7]:
# Re-read the CSV, this time decoding it as Windows-1252 (the encoding chardet reported).
data = pd.read_csv(
    'sample_data.csv',
    encoding='Windows-1252',
)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


## 2. Inconsistent Data

In [8]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [9]:
# `fuzz` is imported explicitly rather than reached through `fuzzywuzzy.fuzz`: attribute
# access on the package only resolves if some other import has already loaded the submodule.
import fuzzywuzzy  # NOTE(review): kept in case other cells reference the package by name
from fuzzywuzzy import fuzz, process

names = ['Yonge Street', 'Yonge St.', 'Young Street',
         'Queen St.', 'University Ave.', 'Bloor Street']

# Get up to 10 of the closest matches to 'Yonge Street', scored with token_sort_ratio.
# Each match is a (name, score) tuple; `names` has only 6 entries, so 6 are returned.
matches = process.extract('Yonge Street',
                          names,
                          limit=10,
                          scorer=fuzz.token_sort_ratio)
matches



[('Yonge Street', 100),
 ('Young Street', 92),
 ('Yonge St.', 80),
 ('Bloor Street', 50),
 ('University Ave.', 23),
 ('Queen St.', 20)]