# <center>Project for Foundations of Computer Science</center>
### <center>University of Milano-Bicocca</center>
<center>Matteo Corona - Costanza Pagnin</center>

### 0. Preliminary steps
### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import re

### Reading *.csv* files from GitHub Repository

In [2]:
travel=pd.read_csv('https://raw.githubusercontent.com/CoroTheBoss/CS-project/main/dogTravel.csv', index_col=0)
dog=pd.read_csv('https://raw.githubusercontent.com/CoroTheBoss/CS-project/main/dogs.csv')
nst=pd.read_csv('https://raw.githubusercontent.com/CoroTheBoss/CS-project/main/NST-EST2021-POP.csv')

### 1. Extract all dogs with status that is *not adoptable*

Some values were off by one column so they had to be properly shifted

In [3]:
# Shifting values
dog.loc[dog['status']!='adoptable','status':'accessed'] = dog.loc[dog['status']!='adoptable','status':'accessed'].shift(periods=1, axis="columns")

In [4]:
# Cheching all possible values in status
dog["status"].unique()

array(['adoptable', nan], dtype=object)

Since there are two different values, the NaN values refers to the *not adoptable* dogs

In [5]:
# Replacing NaN values
dog.loc[dog.status != 'adoptable', ['status']] = 'not adoptable'
# Printing the first not adoptable dogs to visualize the data
dog.loc[dog.status != 'adoptable', ['id', 'status']].head()

Unnamed: 0,id,status
644,41330726,not adoptable
5549,38169117,not adoptable
10888,45833989,not adoptable
11983,45515547,not adoptable
12495,45294115,not adoptable


In [6]:
print("There are", len(dog[dog.status != 'adoptable']) ,"dogs with status that is not adoptable" )

There are 33 dogs with status that is not adoptable


### 2. For each (primary) breed, determine the number of dogs

In [7]:
# Checking if all dogs have a primary key
dog[dog.breed_primary.isna()]

Unnamed: 0,id,org_id,url,type.x,species,breed_primary,breed_secondary,breed_mixed,breed_unknown,color_primary,...,contact_city,contact_state,contact_zip,contact_country,stateQ,accessed,type.y,description,stay_duration,stay_cost


In [151]:
# Checking if all dogs have an id
dog[dog.id.isna()]

Unnamed: 0,id,org_id,url,type.x,species,breed_primary,breed_secondary,breed_mixed,breed_unknown,color_primary,...,contact_city,contact_state,contact_zip,contact_country,stateQ,accessed,type.y,description,stay_duration,stay_cost


In [8]:
# Grouping dogs by their primary key and counting them
dog.groupby('breed_primary')['id'].count()

breed_primary
Affenpinscher                         17
Afghan Hound                           4
Airedale Terrier                      19
Akbash                                 3
Akita                                181
                                    ... 
Wirehaired Pointing Griffon            1
Wirehaired Terrier                    60
Xoloitzcuintli / Mexican Hairless     11
Yellow Labrador Retriever            158
Yorkshire Terrier                    360
Name: id, Length: 216, dtype: int64

### 3. For each (primary) breed, determine the ratio between the number of dogs of `Mixed Breed` and those not of Mixed Breed. Hint: look at the `secondary_breed`.

In [162]:
breed_tab = dog.groupby(['breed_primary','breed_mixed'])['id'].count()
breed_tab = breed_tab.unstack()
breed_tab.columns = ['not_mixed', 'mixed']
breed_tab[np.isnan(breed_tab)] = 0
breed_tab

Unnamed: 0_level_0,not_mixed,mixed
breed_primary,Unnamed: 1_level_1,Unnamed: 2_level_1
Affenpinscher,12.0,5.0
Afghan Hound,0.0,4.0
Airedale Terrier,2.0,17.0
Akbash,1.0,2.0
Akita,98.0,83.0
...,...,...
Wirehaired Pointing Griffon,0.0,1.0
Wirehaired Terrier,15.0,45.0
Xoloitzcuintli / Mexican Hairless,6.0,5.0
Yellow Labrador Retriever,36.0,122.0


In [163]:
breed_tab["mixed_%"] = round(100 * breed_tab["mixed"] / (breed_tab["mixed"] + breed_tab["not_mixed"]), 1)
breed_tab["not_mixed_%"] = round(100 * breed_tab["not_mixed"] / (breed_tab["mixed"] + breed_tab["not_mixed"]), 1)
breed_tab["ratio"] = round(breed_tab["mixed"] / breed_tab["not_mixed"], 2)
breed_tab

Unnamed: 0_level_0,not_mixed,mixed,mixed_%,not_mixed_%,ratio
breed_primary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Affenpinscher,12.0,5.0,29.4,70.6,0.42
Afghan Hound,0.0,4.0,100.0,0.0,inf
Airedale Terrier,2.0,17.0,89.5,10.5,8.50
Akbash,1.0,2.0,66.7,33.3,2.00
Akita,98.0,83.0,45.9,54.1,0.85
...,...,...,...,...,...
Wirehaired Pointing Griffon,0.0,1.0,100.0,0.0,inf
Wirehaired Terrier,15.0,45.0,75.0,25.0,3.00
Xoloitzcuintli / Mexican Hairless,6.0,5.0,45.5,54.5,0.83
Yellow Labrador Retriever,36.0,122.0,77.2,22.8,3.39


### 4. For each (primary) breed, determine the earliest and the latest `posted` timestamp.



In [245]:
dog.posted = pd.to_datetime(dog.posted)
time_tab = dog.groupby('breed_primary')[['posted']].min()
time_tab['postedmin'] = dog.groupby('breed_primary')[['posted']].max()
time_tab.columns = ['erliest_posted_timestamp', 'latest_posted_timestamp']
time_tab

Unnamed: 0_level_0,erliest_posted_timestamp,latest_posted_timestamp
breed_primary,Unnamed: 1_level_1,Unnamed: 2_level_1
Affenpinscher,2012-03-08 10:27:33+00:00,2019-09-14 10:10:51+00:00
Afghan Hound,2017-06-29 23:28:51+00:00,2019-07-27 00:38:48+00:00
Airedale Terrier,2014-06-13 12:59:36+00:00,2019-09-19 18:40:39+00:00
Akbash,2019-07-21 00:35:59+00:00,2019-08-23 17:11:04+00:00
Akita,2012-03-03 09:31:08+00:00,2019-09-20 15:19:57+00:00
...,...,...
Wirehaired Pointing Griffon,2016-06-29 20:03:55+00:00,2016-06-29 20:03:55+00:00
Wirehaired Terrier,2012-11-27 14:07:54+00:00,2019-09-19 22:52:45+00:00
Xoloitzcuintli / Mexican Hairless,2007-02-01 00:00:00+00:00,2019-09-08 11:15:54+00:00
Yellow Labrador Retriever,2010-05-31 00:00:00+00:00,2019-09-20 06:30:27+00:00


### 5. For each state, compute the sex imbalance, that is the difference between male and female dogs. In which state this imbalance is largest?

In [244]:
state_tab = dog.groupby(['contact_state','sex'])['id'].count()
state_tab = state_tab.unstack()
state_tab[np.isnan(state_tab)] = 0
state_tab["sex_imbalance"] = state_tab["Male"] - state_tab["Female"]
state_tab.head()

sex,Female,Male,Unknown,sex_imbalance
contact_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,7.0,8.0,0.0,1.0
AL,716.0,712.0,0.0,-4.0
AR,351.0,344.0,0.0,-7.0
AZ,1067.0,1181.0,1.0,114.0
CA,777.0,887.0,0.0,110.0


In [242]:
print("The state with the highest sex imbalance is Ohio.")
state_tab.loc[state_tab["sex_imbalance"] == state_tab["sex_imbalance"].max()]

The state with the highest sex imbalance is Ohio.


sex,Female,Male,Unknown,sex_imbalance
contact_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OH,1234.0,1439.0,0.0,205.0


### 6. For each pair (age, size), determine the average duration of the stay and the average cost of stay.

In [274]:
round(dog.groupby(['age','size'], as_index=False)[['stay_duration','stay_cost']].mean(), 2)

Unnamed: 0,age,size,stay_duration,stay_cost
0,Adult,Extra Large,89.02,232.59
1,Adult,Large,89.53,238.66
2,Adult,Medium,89.42,238.26
3,Adult,Small,89.41,238.97
4,Baby,Extra Large,87.03,237.18
5,Baby,Large,89.7,238.7
6,Baby,Medium,89.58,237.11
7,Baby,Small,89.96,239.08
8,Senior,Extra Large,88.86,235.23
9,Senior,Large,88.98,237.51


### 7. Find the dogs involved in at least 3 travels. Also list the breed of those dogs.

### 8. Fix the `travels` table so that the correct state is computed from  the `manual` and the `found` fields. If `manual` is not missing, then it overrides what is stored in `found`.

### 9. For each state, compute the ratio between the number of travels and the population.

### 10. For each dog, compute the number of days from the `posted` day to the day of last access.

### 11. Partition the dogs according to the number of weeks from the `posted` day to the day of last access.

### 12. Find for duplicates in the `dogs` dataset. Two records are duplicates if they have (1) same breeds and sex, and (2) they share at least 90% of the words in the description field. Extra points if you find and implement a more refined for determining if two rows are duplicates.

In [14]:
pip install gingerit

SyntaxError: invalid syntax (<ipython-input-14-8aa4f9aeeca6>, line 1)

In [15]:
pip install ftfy

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
 prova = dog.loc[12495].at["description"]

'â\x80¢Basset Hound, female, â\x80¢10 years \n\nDelightful Daisy is a friendly girl looking for a retirement home! Daisy is a spry 10 who greets people with a wagging tail and a hop so it is easy to pet her. She also enjoys walks, snuggling on the couch, and treats, not necessarily in that order. Daisy is a loved pet who will be missed, but she does not enjoy living with young children, and two have joined the family. Daisy is happy to leave them alone but the children are young and humans are not as easy to train as a dog is. Daisy does live with another dog but can be protective of her food, and may be happiest as an only dog, unless the family is prepared to manage the dogs. Daisy is much more about people than other dogs. Daisy has never lived with cats, but does have the hound part of Basset Hound in full, and likes to chase small fuzzy creatures in the yard, so we suspect it would not go well. She is open to meeting a cat though to see if our theory is correct. Daisy will be stay

In [21]:
from ftfy import fix_encoding
from ftfy import fix_text
prova = dog.loc[12495].at["description"]
fix_text(prova)

'•Basset Hound, female, •10 years \n\nDelightful Daisy is a friendly girl looking for a retirement home! Daisy is a spry 10 who greets people with a wagging tail and a hop so it is easy to pet her. She also enjoys walks, snuggling on the couch, and treats, not necessarily in that order. Daisy is a loved pet who will be missed, but she does not enjoy living with young children, and two have joined the family. Daisy is happy to leave them alone but the children are young and humans are not as easy to train as a dog is. Daisy does live with another dog but can be protective of her food, and may be happiest as an only dog, unless the family is prepared to manage the dogs. Daisy is much more about people than other dogs. Daisy has never lived with cats, but does have the hound part of Basset Hound in full, and likes to chase small fuzzy creatures in the yard, so we suspect it would not go well. She is open to meeting a cat though to see if our theory is correct. Daisy will be staying with h

In [4]:
pip install spacy

Collecting spacy
  Downloading spacy-3.4.3-cp38-cp38-win_amd64.whl (12.2 MB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp38-cp38-win_amd64.whl (18 kB)
Collecting typer<0.8.0,>=0.3.0
  Downloading typer-0.7.0-py3-none-any.whl (38 kB)
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.5-cp38-cp38-win_amd64.whl (1.3 MB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp38-cp38-win_amd64.whl (96 kB)
Collecting wasabi<1.1.0,>=0.9.1
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.5-cp38-cp38-win_amd64.whl (481 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.2-cp38-cp38-win_amd64.whl (2.2 MB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.10
  Downloading spacy_legacy-3.0.10-py2.py3-none-any.

In [6]:
pip install contextualSpellCheck

Collecting contextualSpellCheck
  Downloading contextualSpellCheck-0.4.3-py3-none-any.whl (128 kB)
Collecting torch>=1.4
  Downloading torch-1.13.0-cp38-cp38-win_amd64.whl (167.3 MB)
Collecting transformers>=4.0.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
Collecting editdistance==0.6.0
  Downloading editdistance-0.6.0-cp38-cp38-win_amd64.whl (24 kB)
Note: you may need to restart the kernel to use updated packages.
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-win_amd64.whl (3.3 MB)
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers, torch, editdistance, contextualSpellCheck
Successfully installed contextualSpellCheck-0.4.3 editdistance-0.6.0 huggingface-hub-0.11.1 tokenizers-0.13.2 torch-1.13.0 transformers-4.25.1


In [7]:
import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)
doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')

print(doc._.performed_spellCheck) #Should be True
print(doc._.outcome_spellCheck) #Income was $9.4 million compared to the prior year of $2.7 million.

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.