In [1]:
#Nathan Li - Cleaning Dog Data

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dogs-of-cambridge/Dogs_of_Cambridge.csv
/kaggle/input/dog-neighborhood-locations/Dog Neighbood Locations - Sheet1.csv


In [3]:
dog_data = pd.read_csv("/kaggle/input/dogs-of-cambridge/Dogs_of_Cambridge.csv")
dog_data.head()

Unnamed: 0,Dog_Name,Dog_Breed,Location_masked,Latitude_masked,Longitude_masked,Neighborhood
0,Ella,Golden Retriever,"(42.38138453, -71.10963321)",-71.109633,-71.109633,Agassiz
1,Nalah,Mix,"(42.37476908, -71.09690009)",-71.0969,-71.0969,Wellington-Harrington
2,Mic,Havanese,"(42.37560824, -71.15602601)",-71.156026,-71.156026,
3,Cash,Labrador Retriever,"(42.37200905, -71.0955469)",-71.095547,-71.095547,Wellington-Harrington
4,Barley,Labradoodle,"(42.37426564, -71.13756283)",-71.137563,-71.137563,West Cambridge


In [4]:
#Find missing data
for i in dog_data.columns:
    print (i + ": "+str(sum(dog_data[i].isnull()))+" missing values")

Dog_Name: 3 missing values
Dog_Breed: 32 missing values
Location_masked: 0 missing values
Latitude_masked: 0 missing values
Longitude_masked: 0 missing values
Neighborhood: 40 missing values


In [5]:
#Fixing Latitude_masked values with Location_masked
for i in range(len(dog_data)):
    splitLocation = dog_data.iloc[i].Location_masked.split(', ')
    latitude = str(splitLocation[0])[1:]
    dog_data.at[i, "Latitude_masked"] = latitude
for i in range(len(dog_data)):
    splitLocation = dog_data.iloc[i].Location_masked.split(', ')
    longitude = str(splitLocation[1])[0:-1]
    dog_data.at[i, "Longitude_masked"] = longitude

#Dropping Location_masked values now that Latitude_masked is fixed
dog_data = dog_data.drop("Location_masked", axis=1)
dog_data.head()

Unnamed: 0,Dog_Name,Dog_Breed,Latitude_masked,Longitude_masked,Neighborhood
0,Ella,Golden Retriever,42.381385,-71.109633,Agassiz
1,Nalah,Mix,42.374769,-71.0969,Wellington-Harrington
2,Mic,Havanese,42.375608,-71.156026,
3,Cash,Labrador Retriever,42.372009,-71.095547,Wellington-Harrington
4,Barley,Labradoodle,42.374266,-71.137563,West Cambridge


In [6]:
#Adding the neighborhood lat/long data so that missing Neighborhood values can be approximated
neighborhood_data = pd.read_csv("/kaggle/input/dog-neighborhood-locations/Dog Neighbood Locations - Sheet1.csv")
neighborhood_data.head()

Unnamed: 0,Name,Latitude,Longitude
0,East Cambridge,42.369204,-71.079015
1,Area 2/MIT,42.359145,-71.094415
2,Wellington-Harrington,42.371264,-71.092608
3,The Port,42.365604,-71.09691
4,Cambridgeport,42.3586,-71.109293


In [7]:
#Treating the lat/long as points on a cartesian plane, use simple distance formula d=√((x_2-x_1)²+(y_2-y_1)²)
distances = [0]*len(neighborhood_data)
for i in range(len(dog_data)):
    if pd.isna(dog_data.iloc[i].Neighborhood):
        for j in range(len(neighborhood_data)):
            x1 = dog_data.iloc[i].Latitude_masked
            x2 = neighborhood_data.iloc[j].Latitude
            y1 = dog_data.iloc[i].Longitude_masked
            y2 = neighborhood_data.iloc[j].Longitude
            dist = float(((x2-x1)**2 + (y2-y1)**2)**0.5)
            distances[j] = dist
        dog_data.at[i, "Neighborhood"] = neighborhood_data.iloc[distances.index(min(distances))].Name

#Checking if Shasta was correctly cleaned
dog_data.loc[dog_data.Dog_Name.isin(["Shasta"])]
#Hooray, it worked! Shasta the Shih Poo was successfully mapped to the Wellington-Harrington neighborhood.

Unnamed: 0,Dog_Name,Dog_Breed,Latitude_masked,Longitude_masked,Neighborhood
2387,Shasta,Shih Poo,42.374503,-71.09377,Wellington-Harrington


In [8]:
#Checking how many Dog_Breed missing in each neighborhood
missing = {}
for i, value in enumerate(dog_data.Neighborhood):
    if value in missing:
        if pd.isna(dog_data.iloc[i].Dog_Breed):
            missing[value] += 1
    else:
        missing[value] = 1
for key in missing:
    print (key + ": "+str(missing[key])+" missing Dog_Breed")

Agassiz: 2 missing Dog_Breed
Wellington-Harrington: 4 missing Dog_Breed
Strawberry Hill: 2 missing Dog_Breed
West Cambridge: 9 missing Dog_Breed
Neighborhood Nine: 3 missing Dog_Breed
North Cambridge: 4 missing Dog_Breed
Mid-Cambridge: 3 missing Dog_Breed
East Cambridge: 6 missing Dog_Breed
Cambridge Highlands: 2 missing Dog_Breed
Cambridgeport: 3 missing Dog_Breed
Riverside: 4 missing Dog_Breed
Area 2/MIT: 1 missing Dog_Breed
The Port: 2 missing Dog_Breed


In [9]:
#Partially order Dog_Breed frequency list with respect to Neighborhood
dog_data.loc[dog_data.Neighborhood == "Riverside"].Dog_Breed.value_counts()

Labrador Retriever        20
Shih Tzu                   9
Mix                        7
Golden Retriever           7
Labrador Retriever Mix     6
                          ..
Mini Golden Doodle         1
Gordon Setter              1
Maltipoo                   1
Husky / Mix                1
Yorkiepoo                  1
Name: Dog_Breed, Length: 108, dtype: int64

In [10]:
#Figuring out how to properly order to series of Dog_Breed frequencies with respect to Neighborhood
popularity = dog_data.loc[dog_data.Neighborhood == "East Cambridge"].Dog_Breed.value_counts()
popularity = popularity.iloc[np.lexsort([popularity.index])]
popularity = popularity.iloc[::-1]
popularity = popularity.iloc[np.lexsort([popularity.values])]
popularity = popularity.iloc[::-1]
print(popularity)

Labrador Retriever       18
Golden Retriever         17
Chihuahua                16
Maltese                  12
Siberian Husky           12
                         ..
Toy Terrier Mix           1
Weimainer                 1
Weimaraner Mix            1
York Terr                 1
Yorkshire Maltese Mix     1
Name: Dog_Breed, Length: 211, dtype: int64


In [11]:
#Filling in missing Dog_Breed based on popular breeds in each Neighborhood
for i in range(len(neighborhood_data)):
    popularity = dog_data.loc[dog_data.Neighborhood == neighborhood_data.iloc[i].Name].Dog_Breed.value_counts()
    popularity = popularity.iloc[np.lexsort([popularity.index])]
    popularity = popularity.iloc[::-1]
    popularity = popularity.iloc[np.lexsort([popularity.values])]
    popularity = popularity.iloc[::-1].index.tolist()
    key = 0
    for j in range(len(dog_data)):
        if pd.isna(dog_data.iloc[j].Dog_Breed) and dog_data.iloc[j].Neighborhood == neighborhood_data.iloc[i].Name:
            dog_data.at[j, "Dog_Breed"] = popularity[key]
            key += 1

In [12]:
#Filling in missing Dog_Name with "Poopsy"
dog_data.Dog_Name = dog_data.Dog_Name.fillna("Poopsy")

In [13]:
#Converting cleaned data to exportable csv file
output = dog_data
output.to_csv('Clean-Dogs-Nathan-Li.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
