# <center>Project for Foundations of Computer Science</center>
### <center>University of Milano-Bicocca</center>
<center>Matteo Corona - Costanza Pagnin</center>

### 0. Preliminary steps
### Importing libraries

In [1]:
# Importing the necessary libraries 
from collections import Counter
from spacy.cli import download
from ftfy import fix_encoding
import pandas as pd
import numpy as np
import spacy
import ast
import re

### Reading *.csv* files from GitHub Repository

In [2]:
# Reading .csv files from GitHub Repository
nst    = pd.read_csv("https://raw.githubusercontent.com/CoroTheBoss/CS-project/main/NST-EST2021-POP.csv", header=None)
travel = pd.read_csv("https://raw.githubusercontent.com/CoroTheBoss/CS-project/main/dogTravel.csv", index_col=0)
dog    = pd.read_csv("https://raw.githubusercontent.com/CoroTheBoss/CS-project/main/dogs.csv")

### 1. Extract all dogs with status that is *not adoptable*

In [3]:
# The dataset is shifted
dog.loc[dog["status"] != "adoptable","status":"accessed"].head()

Unnamed: 0,status,posted,contact_city,contact_state,contact_zip,contact_country,stateQ,accessed
644,2018-04-05T05:18:31+0000,Las Vegas,NV,89146,US,89009,2019-09-20,
5549,2017-05-26T21:43:16+0000,Chandler,AZ,85249,US,AZ,2019-09-20,
10888,2019-09-01T15:12:06+0000,Albany,NY,12220,US,CT,2019-09-20,
11983,2019-08-06T12:15:58+0000,Albany,NY,12220,US,CT,2019-09-20,
12495,2019-07-18T14:20:58+0000,Albany,NY,12220,US,CT,2019-09-20,


In [4]:
# Shifting values (some values were off by one column)
dog.loc[dog["status"] != "adoptable",
        "status":"accessed"] = dog.loc[dog["status"] != "adoptable",
                                       "status":"accessed"].shift(periods = 1, axis = "columns")

In [5]:
# Cheching all possible values in status
dog["status"].unique()

array(['adoptable', nan], dtype=object)

In [6]:
# Replacing NaN values (the NaN values refers to the not adoptable dogs)
dog.loc[dog.status != "adoptable", ["status"]] = "not adoptable"
# Printing the first not adoptable dogs to visualize the data
dog.loc[dog.status != "adoptable", ["id", "status"]].head()

Unnamed: 0,id,status
644,41330726,not adoptable
5549,38169117,not adoptable
10888,45833989,not adoptable
11983,45515547,not adoptable
12495,45294115,not adoptable


In [7]:
print("There are", len(dog[dog.status != "adoptable"]) ,"dogs with status that is not adoptable" )

There are 33 dogs with status that is not adoptable


### 2. For each (primary) breed, determine the number of dogs

In [8]:
# Checking if all dogs have a primary breed
dog[dog['breed_primary'].isna()]

Unnamed: 0,id,org_id,url,type.x,species,breed_primary,breed_secondary,breed_mixed,breed_unknown,color_primary,...,contact_city,contact_state,contact_zip,contact_country,stateQ,accessed,type.y,description,stay_duration,stay_cost


In [9]:
# Checking if all dogs have an id
dog[dog['id'].isna()]

Unnamed: 0,id,org_id,url,type.x,species,breed_primary,breed_secondary,breed_mixed,breed_unknown,color_primary,...,contact_city,contact_state,contact_zip,contact_country,stateQ,accessed,type.y,description,stay_duration,stay_cost


In [10]:
# Grouping id by their primary_breed and counting them
dog.groupby("breed_primary")["id"].count()

breed_primary
Affenpinscher                         17
Afghan Hound                           4
Airedale Terrier                      19
Akbash                                 3
Akita                                181
                                    ... 
Wirehaired Pointing Griffon            1
Wirehaired Terrier                    60
Xoloitzcuintli / Mexican Hairless     11
Yellow Labrador Retriever            158
Yorkshire Terrier                    360
Name: id, Length: 216, dtype: int64

### 3. For each (primary) breed, determine the ratio between the number of dogs of `Mixed Breed` and those not of Mixed Breed. Hint: look at the `secondary_breed`.

In [11]:
# In some instances, breed secondary is NaN but "mixed breed" is True
dog.loc[:,"breed_primary":"breed_mixed"].head(10)

Unnamed: 0,breed_primary,breed_secondary,breed_mixed
0,American Staffordshire Terrier,Mixed Breed,True
1,Pit Bull Terrier,Mixed Breed,True
2,Shepherd,,False
3,German Shepherd Dog,,False
4,Dachshund,,False
5,Boxer,Beagle,True
6,Italian Greyhound,Chihuahua,True
7,Cattle Dog,,True
8,Cattle Dog,,True
9,Cattle Dog,,True


In [12]:
# Correcting the "mixed breed" column
dog.loc[dog["breed_secondary"] == "Mixed Breed", "breed_mixed"] = True
dog.loc[dog["breed_secondary"].isnull(), "breed_mixed"] = False
dog.loc[(dog["breed_secondary"] != "Mixed Breed") &
        (dog["breed_secondary"].isnull() == False) &
        (dog["breed_secondary"] == dog["breed_primary"]), "breed_mixed"] = False
dog.loc[(dog["breed_secondary"] != "Mixed Breed") &
        (dog["breed_secondary"].isnull() == False) &
        (dog["breed_secondary"] != dog["breed_primary"]), "breed_mixed"] = True

In [13]:
# Grouping dogs basing on the breed_primary and breed_mixed column
breed_tab = dog.groupby(["breed_primary","breed_mixed"])["id"].count()
# Unstacking the data
breed_tab = breed_tab.unstack()
# Renaming columns
breed_tab.columns = ["not_mixed", "mixed"]
# Filling NaN values with 0
breed_tab[np.isnan(breed_tab)] = 0
# Computing the ratio between mixed and not mixed
breed_tab["ratio"] = round(breed_tab["mixed"] / breed_tab["not_mixed"], 2)
# Printing the data
breed_tab

Unnamed: 0_level_0,not_mixed,mixed,ratio
breed_primary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Affenpinscher,15.0,2.0,0.13
Afghan Hound,3.0,1.0,0.33
Airedale Terrier,10.0,9.0,0.90
Akbash,3.0,0.0,0.00
Akita,129.0,52.0,0.40
...,...,...,...
Wirehaired Pointing Griffon,1.0,0.0,0.00
Wirehaired Terrier,42.0,18.0,0.43
Xoloitzcuintli / Mexican Hairless,7.0,4.0,0.57
Yellow Labrador Retriever,96.0,62.0,0.65


### 4. For each (primary) breed, determine the earliest and the latest `posted` timestamp.



In [14]:
# Converting "posted" column for manipulating dates and times
dog.posted = pd.to_datetime(dog.posted)
# Grouping "posted" by the primary_breed and finding erliest and lates posted time for each group
time_tab = dog.groupby("breed_primary")[["posted"]].min()
time_tab["postedmax"] = dog.groupby("breed_primary")[["posted"]].max()
# Renaming columns and printing dataframe
time_tab.columns = ["earliest_posted_timestamp","latest_posted_timestamp"]
time_tab

Unnamed: 0_level_0,earliest_posted_timestamp,latest_posted_timestamp
breed_primary,Unnamed: 1_level_1,Unnamed: 2_level_1
Affenpinscher,2012-03-08 10:27:33+00:00,2019-09-14 10:10:51+00:00
Afghan Hound,2017-06-29 23:28:51+00:00,2019-07-27 00:38:48+00:00
Airedale Terrier,2014-06-13 12:59:36+00:00,2019-09-19 18:40:39+00:00
Akbash,2019-07-21 00:35:59+00:00,2019-08-23 17:11:04+00:00
Akita,2012-03-03 09:31:08+00:00,2019-09-20 15:19:57+00:00
...,...,...
Wirehaired Pointing Griffon,2016-06-29 20:03:55+00:00,2016-06-29 20:03:55+00:00
Wirehaired Terrier,2012-11-27 14:07:54+00:00,2019-09-19 22:52:45+00:00
Xoloitzcuintli / Mexican Hairless,2007-02-01 00:00:00+00:00,2019-09-08 11:15:54+00:00
Yellow Labrador Retriever,2010-05-31 00:00:00+00:00,2019-09-20 06:30:27+00:00


### 5. For each state, compute the sex imbalance, that is the difference between male and female dogs. In which state this imbalance is largest?

In [15]:
# Grouping id by their contact_state and sex and counting them
state_tab = dog.groupby(["contact_state","sex"])["id"].count()
# Unstacking table
state_tab = state_tab.unstack()
# Setting the NaN values to zero 
state_tab[np.isnan(state_tab)] = 0
# Computing the sex imbalance and then printing the dataframe
state_tab["sex_imbalance"] = state_tab["Male"] - state_tab["Female"]
state_tab.head()

sex,Female,Male,Unknown,sex_imbalance
contact_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,7.0,8.0,0.0,1.0
AL,716.0,712.0,0.0,-4.0
AR,351.0,344.0,0.0,-7.0
AZ,1067.0,1181.0,1.0,114.0
CA,777.0,887.0,0.0,110.0


In [16]:
# Printing the state with the highest sex imbalance
state_tab.loc[state_tab["sex_imbalance"] == state_tab["sex_imbalance"].max()]
print("The state with the highest sex imbalance is Ohio.")

The state with the highest sex imbalance is Ohio.


### 6. For each pair (age, size), determine the average duration of the stay and the average cost of stay.

In [17]:
# Grouping dogs by their age and size and averaging the stay_duration and stay_cost values for each group
round(dog.groupby(["age","size"])[["stay_duration","stay_cost"]].mean(), 2)

Unnamed: 0_level_0,Unnamed: 1_level_0,stay_duration,stay_cost
age,size,Unnamed: 2_level_1,Unnamed: 3_level_1
Adult,Extra Large,89.02,232.59
Adult,Large,89.53,238.66
Adult,Medium,89.42,238.26
Adult,Small,89.41,238.97
Baby,Extra Large,87.03,237.18
Baby,Large,89.7,238.7
Baby,Medium,89.58,237.11
Baby,Small,89.96,239.08
Senior,Extra Large,88.86,235.23
Senior,Large,88.98,237.51


### 7. Find the dogs involved in at least 3 travels. Also list the breed of those dogs.

In [18]:
# Grouping "contact_city" by the dogs id and counting them
travel_tab = travel.groupby(["id"], as_index=False)[["contact_city"]].count()
# Renaming columns
travel_tab.columns = ["id", "count"]
# Excluding all dogs that do not match the given condition
travel_tab = travel_tab[travel_tab["count"] > 2]
# Merging the dataframe with "dog" in order to show the breed
pd.merge(travel_tab,dog[["id", "breed_primary"]],
         how = "left", on = ["id"])

Unnamed: 0,id,count,breed_primary
0,16657005,4,Pit Bull Terrier
1,20905974,5,Chow Chow
2,24894870,4,Hound
3,24894894,4,Hound
4,33218331,7,Alaskan Malamute
...,...,...,...
558,46042569,3,Labrador Retriever
559,46042587,3,Labrador Retriever
560,46042618,3,Labrador Retriever
561,46043099,3,Labrador Retriever


In [19]:
# Printing the number of dogs involved in at least three travels
print("There are", len(travel_tab) ,"dogs involved in at least three travels" )

There are 563 dogs involved in at least three travels


### 8. Fix the `travels` table so that the correct state is computed from  the `manual` and the `found` fields. If `manual` is not missing, then it overrides what is stored in `found`.

In [20]:
# Asking if the element in the "manual" column is not null and then replacing the found column
travel.loc[travel["manual"].notnull(), "found"] = travel.loc[travel["manual"].notnull(), "manual"]
# Printing the fixed dataframe
travel[["id","found","manual"]]

Unnamed: 0_level_0,id,found,manual
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,44520267,Arkansas,
1,44698509,Bahamas,Bahamas
2,45983838,Maryland,Maryland
3,44475904,Adaptil,
4,43877389,Afghanistan,
...,...,...,...
6189,40492179,WV,
6190,45799729,Wyoming,
6191,34276515,Yazmin,
6192,44519341,Ohio,Ohio


### 9. For each state, compute the ratio between the number of travels and the population.

In [21]:
# Fixing a value in travels: 17325 refers to Pennsylvania (PA)
travel["contact_state"] = travel["contact_state"].replace('17325','PA')
# Opening and reading the file (found on the internet) which contains the state abbreviations
file = open("abbreviations.txt", "r")
contents = file.read()
# Converting the file into a dictionary
abb = ast.literal_eval(contents)
file.close()
# Naming nst columns (the file was without header)
nst.columns = ["state", "population"]
# Substituting state names with abbreviations
nst = nst.replace({"state": abb})
# Converting population column into numeric values
nst["population"] = nst["population"].str.replace('.', '', regex=True)
nst.head()

Unnamed: 0,state,population
0,AL,5024279
1,AK,733391
2,AZ,7151502
3,AR,3011524
4,CA,39538223


In [22]:
# Grouping id by the contact_state (travel count per state)
state_tab_ratio = travel.groupby(["contact_state"], as_index=False)["id"].count()
# Renaming columns
state_tab_ratio.columns = ["state", "travels"]
# Merging the new dataframe with the nst dataframe in otder to show the state populations
state_tab_ratio = pd.merge(nst,state_tab_ratio[["state","travels"]],
         how = "left", on = ["state"])
# Setting the NaN values to zero 
state_tab_ratio = state_tab_ratio.fillna(0)
# Fixing "population" type and setting it as float
state_tab_ratio["population"] = state_tab_ratio["population"].astype(float)
# Computing the ratio between travel and population and printing the dataframe
state_tab_ratio["ratio"] = state_tab_ratio["travels"] / state_tab_ratio["population"]
state_tab_ratio.head()

Unnamed: 0,state,population,travels,ratio
0,AL,5024279.0,75.0,1.492751e-05
1,AK,733391.0,0.0,0.0
2,AZ,7151502.0,70.0,9.788154e-06
3,AR,3011524.0,10.0,3.320578e-06
4,CA,39538223.0,28.0,7.081755e-07


### 10. For each dog, compute the number of days from the `posted` day to the day of last access.

In [23]:
# Ignoring the SettingWithCopyWarning
import warnings
warnings.filterwarnings('ignore')
# Selecting the needed columns and saving them in a new dataframe for convenience
days_tab = dog[["id","posted", "accessed"]]
# Converting "posted" and "accessed" columns for manipulating dates and times
days_tab["accessed"] = pd.to_datetime(days_tab.accessed).dt.date
days_tab["posted"]   = pd.to_datetime(days_tab.posted).dt.date
# Computing their difference and saving the result in a new column
days_tab["days"] = days_tab["accessed"] - days_tab["posted"]
# Printing the dataframe 
days_tab

Unnamed: 0,id,posted,accessed,days
0,46042150,2019-09-20,2019-09-20,0 days
1,46042002,2019-09-20,2019-09-20,0 days
2,46040898,2019-09-20,2019-09-20,0 days
3,46039877,2019-09-20,2019-09-20,0 days
4,46039306,2019-09-20,2019-09-20,0 days
...,...,...,...,...
58175,44605893,2019-05-03,2019-09-20,140 days
58176,44457061,2019-04-13,2019-09-20,160 days
58177,42865848,2018-09-27,2019-09-20,358 days
58178,42734734,2018-09-12,2019-09-20,373 days


### 11. Partition the dogs according to the number of weeks from the `posted` day to the day of last access.

In [24]:
# Selecting the needed columns and saving them in a new dataframe for convenience
weeks_tab = dog[["id","posted", "accessed"]]
# Converting "posted" and "accessed" columns for manipulating dates and times
weeks_tab.accessed = pd.to_datetime(weeks_tab.accessed).dt.date
weeks_tab.posted   = pd.to_datetime(weeks_tab.posted).dt.date
# Computing the number of weeks from the posted day to the day of last access
weeks_tab["weeks"] = round((weeks_tab["accessed"] - weeks_tab["posted"])/np.timedelta64(1,'W'),0)
# Grouping dogs according to the number of weeks
weeks_tab = weeks_tab.groupby(["weeks"])[['id']].agg(lambda x: list(x))
# Printing the dataframe
weeks_tab

Unnamed: 0_level_0,id
weeks,Unnamed: 1_level_1
0.0,"[46042150, 46042002, 46040898, 46039877, 46039..."
1.0,"[45989641, 45988823, 45988816, 45988814, 45987..."
2.0,"[45919405, 45917309, 45917305, 45917298, 45911..."
3.0,"[45841113, 45841108, 45841101, 45841088, 45841..."
4.0,"[45751169, 45748689, 45748573, 45748545, 45748..."
...,...
730.0,[5142790]
747.0,[4527948]
812.0,[2613506]
813.0,[2592031]


### 12. Find for duplicates in the `dogs` dataset. Two records are duplicates if they have (1) same breeds and sex, and (2) they share at least 90% of the words in the description field. Extra points if you find and implement a more refined method for determining if two rows are duplicates.

In [25]:
# Selecting the needed column and dropping values with NaN description (they can't be compared) 
dog_duplicates = dog[['breed_primary','sex',"id",'description']]
dog_duplicates = dog_duplicates[pd.notnull(dog_duplicates["description"])]
# Cleaning descriptions: replacing the unwanted "/n" character with a space
dog_duplicates["description"] = dog_duplicates["description"].apply(lambda x: x.replace("\n", " "))
# Cleaning descriptions: using fix_encoding() function (utf-8, codifica caratteri)
dog_duplicates["description"] = dog_duplicates["description"].apply(lambda x: fix_encoding(x))
# Grouping dogs according to breed and sex in order to delete those that are unique (they can't have duplicates)
dog_duplicates = dog_duplicates.groupby(["breed_primary", "sex"])[["id", "description"]].agg(lambda x: list(x))
dog_duplicates = dog_duplicates[dog_duplicates['id'].map(len)!=1]
# Restoring the dataframe index
dog_duplicates = dog_duplicates.reset_index()
dog_duplicates

Unnamed: 0,breed_primary,sex,id,description
0,Affenpinscher,Female,"[45889013, 22427951, 45970614, 45871731, 45916...",[This cutie is very sweet. She is a little shy...
1,Affenpinscher,Male,"[38985146, 45728674, 45787432, 45858286, 45362...",[Ralphie is a darling black Affenpinscher mix ...
2,Afghan Hound,Male,"[45382284, 42476375, 39728532]",[We do not know what breed Bear is. He resembl...
3,Airedale Terrier,Female,"[45682240, 45811667, 45295124, 43692266, 46007...",[Meet Cher! She is a very sweet puppy needing ...
4,Airedale Terrier,Male,"[44752626, 45682439, 45565329, 29481512, 45308...","[Ehu is a cool-looking dog with a happy, posit..."
...,...,...,...,...
353,Xoloitzcuintli / Mexican Hairless,Male,"[44118935, 43283678, 43248772, 45905447, 44869...",[Vlad is one of our sanctuary dogs. He is a M...
354,Yellow Labrador Retriever,Female,"[43828473, 39983699, 28672284, 44927114, 46006...","[We've had Goldie for almost 5 years, and she ..."
355,Yellow Labrador Retriever,Male,"[46031975, 45987630, 45592988, 45288980, 39528...",[Ranger is a sweet boy full of puppy energy an...
356,Yorkshire Terrier,Female,"[45908725, 45272327, 44618549, 45310965, 45609...",[Meet Lucy! Lucy is a 10 month old Carin Terr...


In [26]:
# Importing the natural language processing (nlp) needed packages
nlp = spacy.load("en_core_web_lg")
regex = re.compile(r'[a-z]')
# Defining a function that splits each description contained in a row into a list of words
def makelist(row):
    # Creating blank output
    output = []
    for description in row:
        # Applying nlp to the current element and saving result in a temporary variable
        doc = nlp(description)
        # Creating a blank list for containing the list of word
        description_list = list()
        # For loop that iterates on each word contained in the current element of the row
        for token in doc:
            if regex.search(token.text.lower()): # Condition for a string to be a word
                # Append word in the descript_list
                description_list.append(token.text.lower())
        # Appending the new list of words in the final output list
        output.append(description_list)
    return(output)

In [27]:
# Applying the makelist() function to all elements in the description column
dog_duplicates['listed'] = dog_duplicates['description'].apply(lambda row: makelist(row))
# Printing the dataframe in order to visualize the data
dog_duplicates

Unnamed: 0,breed_primary,sex,id,description,listed
0,Affenpinscher,Female,"[45889013, 22427951, 45970614, 45871731, 45916...",[This cutie is very sweet. She is a little shy...,"[[this, cutie, is, very, sweet, she, is, a, li..."
1,Affenpinscher,Male,"[38985146, 45728674, 45787432, 45858286, 45362...",[Ralphie is a darling black Affenpinscher mix ...,"[[ralphie, is, a, darling, black, affenpinsche..."
2,Afghan Hound,Male,"[45382284, 42476375, 39728532]",[We do not know what breed Bear is. He resembl...,"[[we, do, not, know, what, breed, bear, is, he..."
3,Airedale Terrier,Female,"[45682240, 45811667, 45295124, 43692266, 46007...",[Meet Cher! She is a very sweet puppy needing ...,"[[meet, cher, she, is, a, very, sweet, puppy, ..."
4,Airedale Terrier,Male,"[44752626, 45682439, 45565329, 29481512, 45308...","[Ehu is a cool-looking dog with a happy, posit...","[[ehu, is, a, cool, looking, dog, with, a, hap..."
...,...,...,...,...,...
353,Xoloitzcuintli / Mexican Hairless,Male,"[44118935, 43283678, 43248772, 45905447, 44869...",[Vlad is one of our sanctuary dogs. He is a M...,"[[vlad, is, one, of, our, sanctuary, dogs, he,..."
354,Yellow Labrador Retriever,Female,"[43828473, 39983699, 28672284, 44927114, 46006...","[We've had Goldie for almost 5 years, and she ...","[[we, 've, had, goldie, for, almost, years, an..."
355,Yellow Labrador Retriever,Male,"[46031975, 45987630, 45592988, 45288980, 39528...",[Ranger is a sweet boy full of puppy energy an...,"[[ranger, is, a, sweet, boy, full, of, puppy, ..."
356,Yorkshire Terrier,Female,"[45908725, 45272327, 44618549, 45310965, 45609...",[Meet Lucy! Lucy is a 10 month old Carin Terr...,"[[meet, lucy, lucy, is, a, month, old, carin, ..."


In [28]:
# Defining a function that compares two description and compute the percentage of shared words
def comparison(x,y):
    # Counting how many times each word appears in a description wiht the "Counter()" function
    a = Counter(x)
    b = Counter(y)
    # Defining a blank list for the final output
    res = []
    # For loop that iterates on each common element of the two listx and y
    # set() lists all the unique elements contained in a list
    # The ".intersection()" method return only the common element between two list
    for i in set(x).intersection(set(y)):
        # Extendind the "res" list basing on how many time each common word appears in the the two lists
        # (we choose the minumun number of appearence in the two lists)
        res.extend([i] * min(b[i], a[i]))
    # Computing denominator as the length of the longest description
    denominator = max(len(x), len(y))
    # Computing the output as the ratio common / (common + uncommon) and rounding the value for convenience
    return (round( (len(res) / denominator) * 100,1) )

In [32]:
# Searching for blank list of words and popping them (with their corresponding ids and descriptions) from the dataframe
# Iterating in each element of the "listed" column
for row in range(len(dog_duplicates.index)-1):
    # Iterating in each element of a single "listed" element
    for i in range(len(dog_duplicates['listed'][row])-1):
        if len(dog_duplicates['listed'][row][i]) == 0: # Condition for an element to be a blank list
            # Calling the .pop() method in order to delete the blank lists and their corresponding ids and descriptions
            dog_duplicates["id"][row].pop(i)
            dog_duplicates["description"][row].pop(i)
            dog_duplicates["listed"][row].pop(i)

In [33]:
# Creating blank lists for storing the id which will fill the dataframe
first  = []
second = []
# For loop tha iterates on each element in the "listed" column
for row in range(len(dog_duplicates.index)-1):
    current = dog_duplicates['listed'][row]
    # For loop that gets combinations
    for i in range(len(current)-1):
        for j in range(i+1, len(current)-1):
            if comparison(current[i], current[j]) >= 90.0:
                first.append(dog_duplicates['id'][row][i])
                second.append(dog_duplicates['id'][row][j])               
# Creating dataframe of containing the duplicates couples of ids and naming the columns 
all_duplicates = pd.DataFrame(list(zip(first, second)),
               columns =['id', 'duplicates'])

In [34]:
# Printing the result
all_duplicates

Unnamed: 0,id,duplicates
0,45970614,45871731
1,45842497,45842477
2,46023964,46023963
3,46020162,46019974
4,46020162,46019762
...,...,...
7866,45451700,45604974
7867,45451700,45451588
7868,45360286,45360271
7869,44404139,44404138


### Another strategy based on the dog names

In [35]:
# Dog names sample (many of the names contain additional information that should be excluded or unwanted sub-strings)
dog["name"][5259:5273]

5259    Magnolia (part of bonded pair with Oliver)
5260    Oliver (part of bonded pair with Magnolia)
5261                                       BROWNIE
5262                                   Pumpkin Pie
5263                                     Sweet Pea
5264                                       Scooter
5265                                         Lilly
5266                                       TANGO 2
5267                                        LAYLAH
5268                                          Ruby
5269                                         Raven
5270                         Sandy - Courtesy Post
5271                      SAOIRSE-Courtesy Listing
5272                       FERGUS-Courtesy Listing
Name: name, dtype: object

In [36]:
# Importing the "word_tokenize" function (compute list of tokens)
from nltk.tokenize import word_tokenize
# Remove isolated characters function
def remove_isolated_char_func(text):
    words = word_tokenize(text)
    text = ' '.join([word for word in words if len(word) > 2])
    return text
# Remove function (removes what's before or after a certain word)
def remove_func(text, word, before_or_after):
    if before_or_after == "before": index = 1
    if before_or_after == "after":  index = 0
    if word in text:
        text = text.split(word)[index]
    return text
# Remove "and" function (deals with the dog couples)
def make_consistent_couples(text):
    words = word_tokenize(text)
    if "And" in words:
        temp = text.split("And")
        if len(temp[0])==0 or len(temp[1])==0:
            text = "".join(temp)
        else:
            text = temp.sort()
            text = "-".join(temp)
    return text  

In [37]:
# Selecting the needed column and dropping values with NaN values (they can't be compared) 
dog_names = dog[['breed_primary','color_primary','sex',"id",'name']].dropna(subset=['name'])
dog_names = dog_names[pd.notnull(dog_names["name"])]
# Replacing the & character with "and" for the couples
dog_names["name"] = dog_names["name"].apply(lambda x: x.replace("&"," and "))
# Calling the "remove_func" in order to remove unwanted substrings in the names
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, ":", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "-", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "_", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "~", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "#", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "|", "after")) 
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, ",", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "..", "after"))
# Removing all sub-strings cointained in (), <>, ** or //" using the "re.sub()" function
dog_names["name"] = dog_names["name"].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x))
dog_names["name"] = dog_names["name"].apply(lambda x: re.sub('<[^>]+>', "", x))
dog_names["name"] = dog_names["name"].apply(lambda x: re.sub('\*[^*]+\*', "", x))
# Fixing Capital Letters
dog_names["name"] = dog_names["name"].apply(lambda x: x.title())
# Removing unwanted characters (just selecting the alphabetic chars)
alpha_regex = re.compile('[^a-zA-Z]')
dog_names["name"] = dog_names["name"].apply(lambda x: alpha_regex.sub(" ",x))

In [38]:
# Dealing with other unwanted sub-strings
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Available", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Adoption", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Courtesy", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Loves", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Wears", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Month", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Avail", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Meet", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Need", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Year", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Aka", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Fka", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "The", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Was", "before"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Lbs", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Great", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Foster", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Bonded", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Fee", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "From", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Qualifies", "after"))
dog_names["name"] = dog_names["name"].apply(lambda x: remove_func(x, "Looking", "after"))

In [39]:
# Creating a list of unwanted words that must be deleted
to_remove = ["Fluffy","Face","Your","Homework","Before","Northern","Amp","You","Adopt","Forever","Senior","Dog","Fantastic","Oks","Pure",
             "This","One","Over","Absolutely","Wonderful","Smoosh","Best","Smoosh","Face","What","Character","Ever","Updated","Photos",
             "Beauregarde","Urgent","Strong","Girl","Lonestar","Agility","Star","Very","Smart","Boy","Read","His","Info",
             "Survivor","Charming","Crue","Senior","For","Senior","Program","Parton","Society","Wiggs","Sponsored",
             "Maria","Society","Bradford","Society","With","Some","Male","Dogs","Dog","Selective","Senior","Would","Love","For","Walks","Has","Been",
             "Such","Spotsylvania","Shelter","Remington","Her","Notes","Girl","Handsome","Get","Along","Clown","Pair","Silly",
             "Southern","Chi","Tuesday","Jewel","Chis","Westport","Animal","Control","Loving","Best","Friend","Handsome","Local","Young","Boy","Husky",
             "New","England","Ready","Sat","Sweetheart","Companion","Good","With","Kids","Loving","Memory","Our","Sweet","Pretty","Boy",
             "Pre","known","Akita","Stray","Web","Apple","Face","Darling","What","Post","Cuddle","Hug","Pie","Sweetie","Adopted","Confident",
             "Smart","Loving","Retired","Racing","Super","Adopted","Adoption","Sponsored","Pending","Sweetheart","Bonded","Pair","More",
             "Old","Man","Facts","Life","Home","Foster","Fee","Low","Lower","Needed","Sponsor","Courtesy","Listing","Cat","Friendly"]

In [40]:
# Removing final unwanted sub-strings (taking just words that are not contained in the "to_remove" list)
dog_names["name"] = dog_names["name"].apply(lambda x: " ".join(word for word in x.split() if word not in to_remove))
# Dealing with dog couples
dog_names["name"] = dog_names["name"].apply(lambda x: make_consistent_couples(x))
# Removing isolated character
dog_names["name"] = dog_names["name"].apply(lambda x: remove_isolated_char_func(x))
# Removing unwanted spaces
dog_names["name"] = dog_names["name"].apply(lambda x: " ".join(x.strip().split()))
# Removing blank names
dog_names["name"].replace('', np.nan, inplace=True)
dog_names.dropna(subset = ["name"], inplace=True)

In [41]:
# Grouping dogs according to breed, sex and name (deleting those that are unique: they can't have duplicates)
dog_names = dog_names.groupby(["breed_primary","color_primary","sex","name"])[["id"]].agg(lambda x: list(x))
dog_names = dog_names[dog_names['id'].map(len)!=1]
# Restoring the dataframe index
dog_names = dog_names.reset_index()
# Printing the result
dog_names

Unnamed: 0,breed_primary,color_primary,sex,name,id
0,Akita,Black,Male,Niko,"[45935207, 42281721]"
1,Akita,Black,Male,Titan,"[45842497, 45842477]"
2,American Bulldog,Black,Male,Duke,"[45714263, 45184169]"
3,American Bulldog,Brindle,Female,Lulu,"[45117914, 42892855]"
4,American Bulldog,Brindle,Male,Bentley,"[38532212, 45288944]"
...,...,...,...,...,...
1405,Yorkshire Terrier,Gray / Blue / Silver,Female,Ruby,"[45604886, 45604974]"
1406,Yorkshire Terrier,Gray / Blue / Silver,Female,Snagglepuss,"[45451700, 45451588]"
1407,Yorkshire Terrier,Gray / Blue / Silver,Male,Dino,"[45451786, 45451610]"
1408,Yorkshire Terrier,Gray / Blue / Silver,Male,Tank,"[45572590, 45572522]"


In [42]:
# Check_1
list(dog[dog["id"] == 45572590]["description"])

['YOU MUST APPLY ON OUR WEBSITE\nhttp://www.yorkierescueme.com\nBEFORE CONTACTING MY FOSTER HOME!\n\nREAD ALL ABOUT ME BELOW!\n\n6 years old, 4.8 pounds\nTank came from a puppy mill and has been through so much in his life.  He has come a long way but still has a long way to go.\n\nTank MUST be part of a multi-pack of dogs for the rest of his life as he responds best when being able to follow other dogs.  A healthy pack in his new home gives him the ability to learn from their behavior.  Tank must be in a quiet ADULT ONLY home as that will make him most comfortable.  He needs someone with lots of patience and puppy mill dog experience.  He will not be the dog who comes running to greet you at the door but with love and patience he might just get there some day.  \n\nTank is fostered in Boyertown, PA and his adoption fee is $400.\n__________________________________________________________________________________________\n\nWatch the Rescue Me video @ http://www.youtube.com/watch?v=XM8vy

In [43]:
# Check_2
list(dog[dog["id"] == 45572522]["description"])

['YOU MUST APPLY ON OUR WEBSITE\nhttp://www.yorkierescueme.com\nBEFORE CONTACTING MY FOSTER HOME!\n\nREAD ALL ABOUT ME BELOW!\n\n6 years old, 4.8 pounds\nTank came from a puppy mill and has been through so much in his life.  He has come a long way but still has a long way to go.\n\nTank MUST be part of a multi-pack of dogs for the rest of his life as he responds best when being able to follow other dogs.  A healthy pack in his new home gives him the ability to learn from their behavior.  Tank must be in a quiet ADULT ONLY home as that will make him most comfortable.  He needs someone with lots of patience and puppy mill dog experience.  He will not be the dog who comes running to greet you at the door but with love and patience he might just get there some day.  \n\nTank is fostered in Boyertown, PA and his adoption fee is $400.\n__________________________________________________________________________________________\n\nWatch the Rescue Me video @ http://www.youtube.com/watch?v=XM8vy