In [26]:
!pip install pandas==0.25.1



### Part A - Jaccard Similarity Measures

For part A we will be using Fast Food location datasets found from Kaggle and data.world. The respective links are below. 

data.world https://data.world/data-hut/subway-restaurant-location-dataset

Kaggle https://www.kaggle.com/datafiniti/fast-food-restaurants?select=FastFoodRestaurants.csv

In [36]:
# Importing Data

import pandas as pd
import numpy as np
import spacy
import re

nlp = spacy.load('en')

kaggle = pd.read_csv('FastFoodRestaurants.csv')
dataworld = pd.read_csv('subway_2018_11_06.csv')

In [2]:
# Cleaning Data

# Duplicates

# No duplicates were found in dataworld or in kaggle -> confirmed by number of rows remaining the same after each function

dataworld_dropdup = dataworld.drop_duplicates()
dataworld_dropdup

Unnamed: 0,name,url,street_address,city,state,zip_code,country,phone_number_1,phone_number_2,fax_1,fax_2,email_1,email_2,website,open_hours,latitude,longitude,facebook,twitter,instagram,pinterest,youtube
0,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"1800 Duke St, Ste 100",Alexandria,VA,22314,USA,,,,,,,,"Monday 7:00 AM - 7:00 PM, Tuesday 7:00 AM - 7:...",38.8043,-77.0611,,,,,
1,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"1512 King St,",Alexandria,VA,22301,USA,,,,,,,,"Sunday 9:00 AM - 8:00 PM, Monday 7:00 AM - 9:0...",38.8062,-77.0565,,,,,
2,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"2361 Eisenhower Ave,",Alexandria,VA,22314,USA,,,,,,,,"Sunday 9:00 AM - 8:30 PM, Monday 6:30 AM - 9:0...",38.8012,-77.0691,,,,,
3,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"320 King Street, 1st Floor",Alexandria,VA,22314,USA,,,,,,,,"Sunday 9:00 AM - 8:00 PM, Monday 7:00 AM - 9:0...",38.8045,-77.0433,,,,,
4,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"5836 N. Kings Hwy, Suite A, Huntington Station",Alexandria,VA,22303,USA,,,,,,,,"Sunday 9:00 AM - 9:00 PM, Monday 7:00 AM - 10:...",38.7903,-77.0769,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25528,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"300 110th Ave NE, Unit B1-03, Abella Condominium",Bellevue,WA,98004,USA,,,,,,,,"Sunday 9:00 AM - 6:00 PM, Monday 8:00 AM - 10:...",47.6139,-122.1932,,,,,
25529,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"10615 NE 68th Street, Houghton Shopping Center",Kirkland,WA,98033,USA,,,,,,,,"Sunday 9:00 AM - 9:00 PM, Monday 9:00 AM - 9:0...",47.6660,-122.1979,,,,,
25530,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"255 Central Way, Market On Central",Kirkland,WA,98033,USA,,,,,,,,"Sunday 9:00 AM - 9:00 PM, Monday 7:00 AM - 10:...",47.6768,-122.2042,,,,,
25531,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"1220 Howell St, 1st Floor, Metropolitan Park N...",Seattle,WA,98101,USA,,,,,,,,"Sunday 9:00 AM - 8:00 PM, Monday 7:00 AM - 10:...",47.6169,-122.3306,,,,,


In [3]:
kaggle_dropdup = kaggle.drop_duplicates()
kaggle_dropdup

Unnamed: 0,address,city,country,keys,latitude,longitude,name,postalCode,province,websites
0,324 Main St,Massena,US,us/ny/massena/324mainst/-1161002137,44.921300,-74.890210,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
1,530 Clinton Ave,Washington Court House,US,us/oh/washingtoncourthouse/530clintonave/-7914...,39.532550,-83.445260,Wendy's,43160,OH,http://www.wendys.com
2,408 Market Square Dr,Maysville,US,us/ky/maysville/408marketsquaredr/1051460804,38.627360,-83.791410,Frisch's Big Boy,41056,KY,"http://www.frischs.com,https://www.frischs.com..."
3,6098 State Highway 37,Massena,US,us/ny/massena/6098statehighway37/-1161002137,44.950080,-74.845530,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
4,139 Columbus Rd,Athens,US,us/oh/athens/139columbusrd/990890980,39.351550,-82.097280,OMG! Rotisserie,45701,OH,"http://www.omgrotisserie.com,http://omgrotisse..."
...,...,...,...,...,...,...,...,...,...,...
9995,3013 Peach Orchard Rd,Augusta,US,us/ga/augusta/3013peachorchardrd/-791445730,33.415257,-82.024531,Wendy's,30906,GA,"http://www.wendys.com,http://wendys.com"
9996,678 Northwest Hwy,Cary,US,us/il/cary/678northwesthwy/787691191,42.217300,-88.255800,Lee's Oriental Martial Arts,60013,IL,http://www.mcdonalds.com
9997,1708 Main St,Longmont,US,us/co/longmont/1708mainst/-448666054,40.189190,-105.101720,Five Guys,80501,CO,http://fiveguys.com
9998,67740 Highway 111,Cathedral City,US,us/ca/cathedralcity/67740highway111/-981164808,33.788640,-116.482150,El Pollo Loco,92234,CA,"http://www.elpolloloco.com,http://elpolloloco.com"


In [56]:
dataworld_dropdup.shape

(25533, 22)

In [55]:
dataworld.shape

(25533, 22)

In [58]:
kaggle_dropdup.shape

(10000, 10)

In [59]:
kaggle.shape

(10000, 10)

In [4]:
# Kaggle dataset has missing values for websites. We replaced them with the single character 'X'
# Dataworld dataset has all missing values for 12 attributes, and open_hours and url were missing a few. We replaced them with the single character 'X'

# Missing Values

dataworld_dropna = dataworld.replace(np.nan, "X")
kaggle_dropna = kaggle.replace(np.nan, "X")

# Dropping Unnecessary columns in Dataworld

dataworld_un = dataworld_dropna[["name", "url", "street_address", "city", "state", "zip_code", "country", "website", "open_hours", "latitude", "longitude"]]

dataworld_clean = dataworld_un
kaggle_clean = kaggle_dropna

kaggle_clean[kaggle_clean['websites'] == 'X']

Unnamed: 0,address,city,country,keys,latitude,longitude,name,postalCode,province,websites
37,218 W Main St,Ada,US,us/ok/ada/218wmainst/-1173852384,34.774820,-96.681260,Hamburger King,74820,OK,X
137,161 Church St,Burlington,US,us/vt/burlington/161churchst/-1600724499,44.475680,-73.212640,Pacific Rim,5401,VT,X
262,3596 MT Diablo Blvd,Lafayette,US,us/ca/lafayette/3596mtdiabloblvd/-1322134216,37.891487,-122.123350,Baja Fresh,94549,CA,X
292,309 S Pearson Rd.,Pearl,US,us/al/pearl/309spearsonrd./1082608893,32.269050,-90.135680,Waffle House,39208,AL,X
303,6466 Poplar Ave,Memphis,US,us/tn/memphis/6466poplarave/126846685,35.099798,-89.848067,Back Yard Burgers,38119,TN,X
...,...,...,...,...,...,...,...,...,...,...
9959,2505 W March Ln,Stockton,US,us/ca/stockton/2505wmarchln/-1161002137,37.985350,-121.338350,McDonald's,95207,CA,X
9961,11503 S 4000 W,S Jordan,US,us/ut/sjordan/11503s4000w/-791445730,40.543316,-111.985750,Wendy's,84009,UT,X
9976,170 E Taylor St,San Jose,US,us/ca/sanjose/170etaylorst/1412731209,37.350692,-121.896548,El Tarasco Mexican Food,95112,CA,X
9978,Us Hwy 29,Chatham,US,us/va/chatham/ushwy29/-1161002137,36.789794,-79.394511,Mcdonald's,24531,VA,X


In [5]:
dataworld_clean[dataworld_clean['open_hours'] == 'X']

Unnamed: 0,name,url,street_address,city,state,zip_code,country,website,open_hours,latitude,longitude
52,Subway,X,"1700 Highway One,",Dewey Beach,DE,19971,USA,X,X,39.0833,-75.4578
61,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"23A Rehoboth Avenue, Subway/Beach Resort",Rehoboth Beach,DE,19971,USA,X,X,38.7169,-75.0769
76,Subway,X,"218 Baltimore Ave, Monte Carlo Hotel",Ocean City,MD,21842,USA,X,X,38.3344,-75.0846
78,Subway,X,"1610 Baltimore Ave., Quality Inn Boardwalk Hotel",Ocean City,MD,21842,USA,X,X,38.3468,-75.0787
79,Subway,X,"1201 Atlantic Ave,",Ocean City,MD,21842,USA,X,X,38.3419,-75.0793
...,...,...,...,...,...,...,...,...,...,...,...
24251,Subway,X,"651 Newell Drive, Rawlings Hall, University of...",Gainesville,FL,32611,USA,X,X,29.6465,-82.3435
24592,Subway,X,"4807 Valley View Blvd NW, Wal-Mart #2312",Roanoke,VA,24012,USA,X,X,37.3008,-79.9607
24933,Subway,X,"1921 W. Mitchell St,",Milwaukee,WI,53204,USA,X,X,43.0123,-87.9371
25235,Subway,X,"801 Alaskan Way, Colman Dock/ Pier 52, Seattle...",Seattle,WA,98104,USA,X,X,47.6027,-122.3383


In [50]:
dataworld.shape

(25533, 22)

In [51]:
kaggle.shape

(10000, 10)

In [52]:
dataworld.count()

name              25533
url               24017
street_address    25533
city              25533
state             25533
zip_code          25533
country           25533
phone_number_1        0
phone_number_2        0
fax_1                 0
fax_2                 0
email_1               0
email_2               0
website               0
open_hours        25395
latitude          25533
longitude         25533
facebook              0
twitter               0
instagram             0
pinterest             0
youtube               0
dtype: int64

In [53]:
kaggle.count()

address       10000
city          10000
country       10000
keys          10000
latitude      10000
longitude     10000
name          10000
postalCode    10000
province      10000
websites       9535
dtype: int64

In [8]:
# Visually Inspecting and Displaying Rows of Data

kaggle_clean.shape

(10000, 10)

In [9]:
dataworld_clean.shape

(25533, 11)

In [10]:
# address is the address of the restaurant, city is the city the restaurant is located in, country is the country the restaurant is located in, keys is API key used to access the information in the entity, latitude is the latitude 
# location of the restaurant, longitude is the longitude location of the restaurant, name is the name of the restaurant, postalCode is the zipcode of the restaurant, province is the state the restaurant is located in
# websites is the website url of the website related to the restaurant

kaggle_clean.head(5)

Unnamed: 0,address,city,country,keys,latitude,longitude,name,postalCode,province,websites
0,324 Main St,Massena,US,us/ny/massena/324mainst/-1161002137,44.9213,-74.89021,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
1,530 Clinton Ave,Washington Court House,US,us/oh/washingtoncourthouse/530clintonave/-7914...,39.53255,-83.44526,Wendy's,43160,OH,http://www.wendys.com
2,408 Market Square Dr,Maysville,US,us/ky/maysville/408marketsquaredr/1051460804,38.62736,-83.79141,Frisch's Big Boy,41056,KY,"http://www.frischs.com,https://www.frischs.com..."
3,6098 State Highway 37,Massena,US,us/ny/massena/6098statehighway37/-1161002137,44.95008,-74.84553,McDonald's,13662,NY,"http://mcdonalds.com,http://www.mcdonalds.com/..."
4,139 Columbus Rd,Athens,US,us/oh/athens/139columbusrd/990890980,39.35155,-82.09728,OMG! Rotisserie,45701,OH,"http://www.omgrotisserie.com,http://omgrotisse..."


In [11]:
# Name represent name of the restaurant, url represents url used to find the information for the entity such as city, state, lat, long., street_address is the street address of the restaurant, city is the city the 
# restaurant is located, state is the state the restaurant is located in, zip_code is the zip code of the restaurant, country is the country in which the restaurant is located in, phone_number is the phone number of the restaurant,
# fax is the fax of the restaurant, email is the email of the restaurant, website is the website of the restaurant, open_hours is the hours the restaurant is open, latitude is the latitude number the restaurant is located at, 
# longitude is the longitude number the restaurant is located at, facebook is the link for the facebook page of the restaurant, twitter is the link for the twitter page of the restaurant, instagram is the link for the instagram page of the 
# restaurant, pinterest is the link for the pinterest page of the restaurant, youtube is the link for the youtube page of the restaurant.

dataworld_clean.head(5)

Unnamed: 0,name,url,street_address,city,state,zip_code,country,website,open_hours,latitude,longitude
0,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"1800 Duke St, Ste 100",Alexandria,VA,22314,USA,X,"Monday 7:00 AM - 7:00 PM, Tuesday 7:00 AM - 7:...",38.8043,-77.0611
1,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"1512 King St,",Alexandria,VA,22301,USA,X,"Sunday 9:00 AM - 8:00 PM, Monday 7:00 AM - 9:0...",38.8062,-77.0565
2,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"2361 Eisenhower Ave,",Alexandria,VA,22314,USA,X,"Sunday 9:00 AM - 8:30 PM, Monday 6:30 AM - 9:0...",38.8012,-77.0691
3,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"320 King Street, 1st Floor",Alexandria,VA,22314,USA,X,"Sunday 9:00 AM - 8:00 PM, Monday 7:00 AM - 9:0...",38.8045,-77.0433
4,Subway,http://order.subway.com/Stores/Redirect.aspx?s...,"5836 N. Kings Hwy, Suite A, Huntington Station",Alexandria,VA,22303,USA,X,"Sunday 9:00 AM - 9:00 PM, Monday 7:00 AM - 10:...",38.7903,-77.0769


In [64]:
dataworld_clean.columns

Index(['name', 'url', 'street_address', 'city', 'state', 'zip_code', 'country',
       'website', 'open_hours', 'latitude', 'longitude'],
      dtype='object')

In [65]:
kaggle_clean.columns

Index(['address', 'city', 'country', 'keys', 'latitude', 'longitude', 'name',
       'postalCode', 'province', 'websites'],
      dtype='object')

In [177]:
# 2-gram

def getGrams(array):
  grams = []
  for text in array:
      # pads the text to show beginning and end of string
      text_padded = "$" + str(text) + "$"
      #text_padded = "$$" + str(text) + "$$"
      # traverses through the string and takes n-grams
      length = len(text_padded) - 1
      #length = len(text_padded) - 2
      for i in range(length):
          first = text_padded[i]
          second = text_padded[i+1]
          #third = text_padded[i+2]
          gram = first + second #+ third
          grams.append(gram)
      # appends gram to list of grams
  return grams

In [178]:
# Jaccard Similarity Measure - 2gram

# address

# Take the set of all the tokens 
col_vals = set(getGrams(kaggle_clean['address'].unique()))
jaccard_list = []
names = {}
# Get the jaccard similarity score for each attribute of dataworld compared to address from kaggle and save it
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
# Take the maximum similarity score from the jaccard similarity score list
maxed = max(jaccard_list)
# if the maxed is not zero print the similarity score, otherwise all of the similarity scores are 0
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

city has similarity score 0.5299420476497102


In [179]:
# city

col_vals = set(kaggle_clean['city'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [180]:
# country

col_vals = set(kaggle_clean['country'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

country has similarity score 0.25


In [181]:
# keys

col_vals = set(kaggle_clean['keys'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [182]:
# latitude

col_vals = set(kaggle_clean['latitude'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [183]:
# longitude

col_vals = set(kaggle_clean['longitude'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [184]:
# name

col_vals = set(kaggle_clean['name'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [185]:
# postalcode

col_vals = set(kaggle_clean['postalCode'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [186]:
# province

col_vals = set(kaggle_clean['province'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

state has similarity score 0.5604395604395604


In [187]:
# websites

col_vals = set(kaggle_clean['websites'].unique())
jaccard_list = []
names = {}
for col in dataworld_clean.columns:
    ext_col_vals = set(getGrams(dataworld_clean[col].unique()))
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    jaccard_list.append(jaccard)
    names[jaccard] = col
maxed = max(jaccard_list)
if maxed != 0:
    print(names[maxed] + ' has similarity score ' + str(maxed))
else:
    print("All similarity scores are 0")

All similarity scores are 0


In [198]:
# Jaccard Similarity Score - Word Tokens

# address

col_vals = set(kaggle_clean['address'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

In [189]:
# city

col_vals = set(kaggle_clean['city'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

city has similarity score 0.34847875619890095


In [190]:
# country

col_vals = set(kaggle_clean['country'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

In [191]:
# keys

col_vals = set(kaggle_clean['keys'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

In [192]:
#latitude

col_vals = set(kaggle_clean['latitude'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

latitude has similarity score 0.009634087041089229


In [193]:
# longitude

col_vals = set(kaggle_clean['longitude'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

longitude has similarity score 0.004784127263655308


In [194]:
# name

col_vals = set(kaggle_clean['name'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

name has similarity score 0.0018248175182481751


In [195]:
# postalCode

col_vals = set(kaggle_clean['postalCode'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

zip_code has similarity score 0.30012085403518196


In [196]:
# province

col_vals = set(kaggle_clean['province'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

state has similarity score 0.9622641509433962


In [197]:
# websites

col_vals = set(kaggle_clean['websites'].unique())
for col in dataworld_clean.columns:
    ext_col_vals = set(dataworld_clean[col].unique())
    intersection_size = len(col_vals.intersection(ext_col_vals))
    union_size = len(col_vals.union(ext_col_vals))
    jaccard = intersection_size / union_size
    if jaccard != 0:
        print(col + ' has similarity score ' + str(jaccard))

url has similarity score 3.617028972402069e-05
website has similarity score 0.00026164311878597594
open_hours has similarity score 0.00012467273407305821


### Part B 

In [37]:
july4th = pd.read_csv('07-11-2020.csv')
thanksgiving = pd.read_csv('12-02-2020.csv')

In [38]:
july4th.shape

(58, 18)

In [39]:
thanksgiving.shape

(58, 18)

In [40]:
july4th.head(4)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate
0,Alabama,US,2020-07-12 04:34:43,32.3182,-86.9023,52802,1114,25783.0,25050.0,1,1059.454212,502020.0,6745.0,2.144493,84000001,USA,10238.650999,12.984388
1,Alaska,US,2020-07-12 04:34:43,61.3707,-152.4044,1385,17,598.0,770.0,2,189.325332,143376.0,,1.227437,84000002,USA,19599.067727,
2,American Samoa,US,2020-07-12 04:34:43,-14.271,-170.132,0,0,,0.0,60,0.0,816.0,,,16,ASM,1466.544455,
3,Arizona,US,2020-07-12 04:34:43,33.7298,-111.4312,119930,2151,14394.0,103385.0,4,1647.680491,683990.0,5750.0,1.793546,84000004,USA,9397.123147,4.794463


In [41]:
thanksgiving.head(4)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,UID,ISO3,Testing_Rate,Hospitalization_Rate
0,Alabama,US,2020-12-03 05:30:41,32.3182,-86.9023,256828,3711,168387.0,84730.0,1.0,5237.983066,1603523.0,,1.444936,84000001.0,USA,32703.701778,
1,Alaska,US,2020-12-03 05:30:41,61.3707,-152.4044,33802,122,7165.0,25976.0,2.0,4546.951999,1024643.0,,0.366774,84000002.0,USA,140065.614555,
2,American Samoa,US,2020-12-03 05:30:41,-14.271,-170.132,0,0,,0.0,60.0,0.0,1988.0,,,16.0,ASM,3572.904872,
3,Arizona,US,2020-12-03 05:30:41,33.7298,-111.4312,340979,6739,53694.0,280546.0,4.0,4684.603069,2288204.0,,1.976368,84000004.0,USA,31436.913951,


There are no duplicates as the number of rows did not change between the drop_duplicates() function being applied to the two dataframes.

In [199]:
july4th.shape

(58, 18)

In [42]:
july4th_dropped = july4th.drop_duplicates()
july4th_dropped.shape

(58, 18)

In [200]:
thanksgiving.shape

(58, 18)

In [43]:
thanksgiving_dropped = thanksgiving.drop_duplicates()
thanksgiving_dropped.shape

(58, 18)

In [44]:
covid_cases = pd.read_csv('time_series_covid19_confirmed_US.csv')

In [201]:
covid_cases.shape

(3339, 330)

FlexMatcher Step

In [45]:
!pip install flexmatcher



In [46]:
import flexmatcher
schema_list = [july4th.astype(str), thanksgiving.astype(str)]
mapping_list = [dict(zip(july4th.columns, thanksgiving.columns))]
fm = flexmatcher.FlexMatcher(schema_list, mapping_list, sample_size = 500)
fm.train()

Create training data ...
Training FlexMatcher ...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Training Completed ...


In [47]:
predicted_mapping = fm.make_prediction(covid_cases.astype(str))
predicted_mapping

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  features = feat_df.ix[:, 1:].as_matrix()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
  features = feat_df.ix[:, 1:].as_matrix()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  features = feat_df.ix[:, 1:].as_matrix()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

{'-86.64408227': 'Long_',
 '0': 'FIPS',
 '1001': 'Active',
 '2018': 'Deaths',
 '2286': 'Last_Update',
 '2328.1': 'Case_Fatality_Ratio',
 '2351': 'People_Hospitalized',
 '2554': 'Recovered',
 '2661': 'Testing_Rate',
 '2873': 'Total_Test_Results',
 '2945': 'Confirmed',
 '32.53952745': 'Lat',
 '840': 'Incident_Rate',
 '84001001': 'UID',
 'Alabama': 'Province_State',
 'Autauga': 'Hospitalization_Rate',
 'US.1': 'Country_Region',
 'USA': 'ISO3'}