You have to work on the [Dogs adoptions](https://drive.google.com/file/d/1wQsA0oB6wwYlnkvvcyBCmLk7QmgVWNax/view?usp=sharing) dataset. 

It contains three files:
*  `dogs.csv`, *dogs* for short
*  `dogTravel.csv`, *travels* for short
*  `NST-EST2021-POP.csv`

### Notes

1.    It is mandatory to use GitHub for developing the project.
1.    The project must be a jupyter notebook.
1.    There is no restriction on the libraries that can be used, nor on the Python version.
1.    All questions on the project **must** be asked in a public channel on [Zulip](https://focs.zulipchat.com).
1.    At most 3 students can be in each group. You must create the groups by yourself.
1.    You do not have to send me the project *before* the discussion.

### 0.1 Importing files

In [None]:
# Importing Pandas
import pandas as pd

# Peek at the header row of dogs.csv to see which columns are available.
# The encoding is stated explicitly so the result does not depend on the
# platform default.
with open("dogs.csv", "r", encoding="utf-8") as dogs_file:
    headers = dogs_file.readline()
    print(headers)

In [None]:
# Load dogs.csv into the 'dogs' DataFrame.
# `doublequote` is a boolean switch in read_csv; the quote character itself
# is passed through `quotechar`.
dogs = pd.read_csv("dogs.csv", sep=',', quotechar='"', low_memory=False)

# Preview the first rows
dogs.head()

### 0.2 Cleaning up

In [None]:
# Re-read the raw file; `quotechar` (not the boolean `doublequote`) carries
# the quote character.
tmp_dog_full = pd.read_csv("dogs.csv", sep=',', quotechar='"', low_memory=False, encoding='utf-8')
print(f'tmp_dog_full shape: {tmp_dog_full.shape}')

# Flag the rows that parsed correctly, using contact_state as a watermark:
# a purely numeric contact_state marks a row that needs special handling.
# Casting to str first keeps the mask boolean even when a value is missing
# or is not a string (missing values become 'nan', i.e. non-numeric -> ok).
tmp_dog_full['ok'] = ~tmp_dog_full.contact_state.astype(str).str.isnumeric()
# Normalise the column names: lower case, '.' replaced by '_'
tmp_dog_full.columns = [col.lower().replace(".", "_") for col in tmp_dog_full.columns]

# Partition the rows on the 'ok' flag computed above
tmp_dog_ok = tmp_dog_full[tmp_dog_full.ok.eq(True)]
tmp_dog_not_ok = tmp_dog_full[tmp_dog_full.ok.eq(False)]

# Show a sample of each partition
print('tmp_dog_ok:')
display(tmp_dog_ok.head(5))
print('##################################')
print('tmp_dog_not_ok')
display(tmp_dog_not_ok.head(5))

# Sanity check: list the distinct contact_state values left in the ok rows
ok_states = tmp_dog_ok.contact_state.unique()
print(len(ok_states))
ok_states

# Repair the "not ok" rows: their `name` value holds two fields joined by `",`
# (the name followed by the status), and the columns after it are shifted.
# The repaired rows are built into a new frame with the same columns and index.
pd.set_option('display.max_colwidth', 100) # widen column display (pandas default is 50)
print('before')
display(tmp_dog_not_ok.head(1))
# Empty frame with the same layout as the rows being repaired
tmp_dog_not_ok_fixed = pd.DataFrame(columns=tmp_dog_not_ok.columns, index=tmp_dog_not_ok.index)
# Column positions 0..23 are copied over as they are
tmp_dog_not_ok_fixed.iloc[:, 0:24] =  tmp_dog_not_ok.iloc[:, 0:24].copy()
# Source positions 25.. (without 'accessed') are written to target positions 26..,
# i.e. moved one position to the right.
# NOTE(review): presumably position 24 is `name` and 25 is `status` — verify against the printed header
tmp_dog_not_ok_fixed.iloc[:, 26:] =  tmp_dog_not_ok.iloc[:, 25:].drop('accessed', axis = 1).copy()
# Bare expression: the value is discarded because it is not the last statement of the cell
tmp_dog_not_ok.iloc[: , 24]
# The part before `",` becomes the name ...
tmp_dog_not_ok_fixed.name = tmp_dog_not_ok.name.apply(lambda x : x.split('\",')[0])
# ... and the part after it, stripped of double quotes, becomes the status
tmp_dog_not_ok_fixed.status = tmp_dog_not_ok.name.apply(lambda x : x.split('\",')[1].strip('"'))
print('after')
tmp_dog_not_ok_fixed.head()

# Stack the untouched rows and the repaired rows back into a single frame
print('tmp_dog_ok shape:', tmp_dog_ok.shape)
print('tmp_dog_not_ok shape:', tmp_dog_not_ok.shape)
dogs = pd.concat([tmp_dog_ok, tmp_dog_not_ok_fixed])
print('dogs shape:', dogs.shape)

# The temporary frames are no longer needed
del tmp_dog_full, tmp_dog_not_ok, tmp_dog_not_ok_fixed, tmp_dog_ok

# Normalise the column names and drop the helper flag
dogs.columns = dogs.columns.map(lambda col: col.lower().replace(".", "_"))
dogs.drop(columns=['ok'], inplace=True)
dogs.columns


# travels dataset

# `quotechar` (not the boolean `doublequote`) carries the quote character;
# the 'index' column of the file is dropped.
tmp_travels = pd.read_csv("dogTravel.csv", sep=',', quotechar='"', low_memory=False).drop('index', axis=1)
display(tmp_travels.head())
display(tmp_travels.contact_state.unique())

# Some rows carry the value '17325' in contact_state. Every dog id that has
# such a row gets contact_state 'PA' on all of its rows, however many ids
# are affected.
anomalies = tmp_travels[tmp_travels.contact_state == '17325'].id.unique()
display(anomalies)
tmp_travels.loc[tmp_travels.id.isin(anomalies), 'contact_state'] = 'PA'
display(tmp_travels[tmp_travels.id.isin(anomalies)])
display(tmp_travels.contact_state.unique())

travels = tmp_travels.copy()
del tmp_travels

# states dataset

# The file has no header row, so the two column names are supplied here
tmp_states = pd.read_csv(
    "NST-EST2021-POP.csv",
    sep=',',
    header=None,
    names=["state", "population"],
    low_memory=False,
)
tmp_states.head()

# Strip the '.' characters from the population strings (presumably thousands
# separators) and convert the result to integers
population_digits = tmp_states['population'].str.replace('.', '', regex=False)
tmp_states['population'] = population_digits.astype(int)
states = tmp_states.copy()
del tmp_states
states.head()

### 1. Extract all dogs with status that is not *adoptable*

In [None]:
# Keep every dog whose status differs from 'adoptable'
not_adoptable_dogs = dogs.loc[dogs['status'].ne('adoptable')]
print(not_adoptable_dogs.shape)

not_adoptable_dogs

### 2. For each (primary) breed, determine the number of dogs

In [None]:
# Number of dogs per primary breed, most frequent breed first
dogs.breed_primary.value_counts()

### 3. For each (primary) breed, determine the ratio between the number of dogs of `Mixed Breed` and those not of Mixed Breed. Hint: look at the `breed_secondary` column.

In [None]:
print('distinct breed: ', len(dogs.breed_primary.unique()))

## total number of dogs per primary breed, most common breed first
breeds = (
    dogs.groupby('breed_primary', as_index=False)['id']
    .count()[['id', 'breed_primary']]
    .sort_values(by='id', ascending=False)
)

## number of dogs per primary breed that also have a secondary breed
## (a non-null breed_secondary is what marks a dog as mixed here)
sec_breeds = (
    dogs[dogs.breed_secondary.notnull()]
    .groupby('breed_primary', as_index=False)['id']
    .count()
)

## compute ratios (whole percentages of each breed's total)
mix_breeds = breeds.merge(sec_breeds, on='breed_primary', how='left', suffixes=('_tot', '_mixed'))
# breeds without any mixed dog are absent from sec_breeds -> count them as 0
mix_breeds['id_mixed'] = mix_breeds['id_mixed'].fillna(0)
# Scale to percent *before* rounding: rounding the fraction and then
# multiplying by 100 leaves float noise such as 7.000000000000001.
mix_breeds['mixed_ratio'] = (100 * mix_breeds['id_mixed'] / mix_breeds['id_tot']).round()
mix_breeds['pure_ratio'] = 100 - mix_breeds['mixed_ratio']
# NOTE(review): the exercise asks for mixed / not-mixed; these columns are
# shares of the breed total — confirm this is the intended reading.
mix_breeds

### 4. For each (primary) breed, determine the earliest and the latest `posted` timestamp.

In [None]:
## Parse the 'posted' column as timestamps; unparseable values become NaT
dogs['posted'] = pd.to_datetime(dogs['posted'], errors="coerce")

## Earliest and latest 'posted' timestamp per primary breed.
## The aggregations are named by string: passing the Python builtins
## min/max to aggregate is deprecated in recent pandas releases.
earliest_latest_timestamp = dogs.groupby('breed_primary', as_index=False).aggregate({'posted': ['min', 'max']})

earliest_latest_timestamp

### 5. For each state, compute the sex imbalance, that is the difference between male and female dogs. In which state is this imbalance largest?

In [None]:
malefemale = dogs[['contact_state', 'contact_city', 'contact_zip', 'contact_country', 'sex']].copy()

# +1 for a male, -1 for a female, 0 for anything else (unknown or missing sex),
# so that only males and females contribute to the difference.
malefemale['imbalance'] = (
    malefemale.sex.str.upper().map({'MALE': 1, 'FEMALE': -1}).fillna(0).astype(int)
)

# Sum only the imbalance column per state (males minus females)
malefemale_imbalance = malefemale.groupby('contact_state', as_index=False)['imbalance'].sum()

# Show both extremes: the state with the largest female surplus and the one
# with the largest male surplus. idxmin/idxmax return index labels, so .loc
# is the matching accessor.
malefemale_imbalance.loc[[malefemale_imbalance.imbalance.idxmin(), malefemale_imbalance.imbalance.idxmax()]]

### 6. For each pair (age, size), determine the average duration of the stay and the average cost of stay.

In [None]:
# Cast the stay columns to numeric types before averaging
dogs['stay_duration'] = dogs['stay_duration'].astype(int)
dogs['stay_cost'] = dogs['stay_cost'].astype(float)

# Mean stay duration and mean stay cost for every (age, size) pair,
# each rounded to two decimals
stay = dogs.groupby(['age', 'size'], as_index=False)[['stay_duration', 'stay_cost']].mean()
for column in ('stay_duration', 'stay_cost'):
    stay[column] = stay[column].apply(lambda value: round(value, 2))
stay

### 7. Find the dogs involved in at least 3 travels. Also list the breed of those dogs.

In [None]:
## TODO: check whether counting contact_state is right or whether `found` should be used instead
# Count the travel rows (non-null contact_state) recorded for each dog id
travel_counts = travels.groupby('id', as_index=False)['contact_state'].count()
many_travels = travel_counts.rename(columns={'contact_state': 'travels'})
# Keep only the dogs with at least 3 travels
many_travels = many_travels.loc[many_travels['travels'] >= 3]
many_travels

In [None]:
# Attach the primary breed of each frequently-travelling dog (inner join on id)
breed_lookup = dogs[['id', 'breed_primary']]
more_travels = many_travels.merge(breed_lookup, on='id')
more_travels.sort_values(by='travels', ascending=False)

### 8. Fix the `travels` table so that the correct state is computed from  the `manual` and the `found` fields. If `manual` is not missing, then it overrides what is stored in `found`.

In [None]:
# Work on a copy so that `travels` itself stays untouched
exercise_8 = travels.copy()

# `manual` overrides `found` wherever it is present; where `manual` is
# missing, the existing `found` value is kept. Done column-wise instead of
# with a row-by-row apply, which also keeps it well defined on an empty frame.
exercise_8['found'] = exercise_8['manual'].fillna(exercise_8['found'])

exercise_8

### 9. For each state, compute the ratio between the number of travels and the population.

In [None]:
# Load NST-EST2021-POP.csv into 'populationsDf'. The state column is named
# 'found' so that it can be joined with the travels on that key.
# `quotechar` (not the boolean `doublequote`) carries the quote character.
populationsDf = pd.read_csv("NST-EST2021-POP.csv", sep=',', quotechar='"', low_memory=False, names=["found", "population"])
populationsDf.head()

In [None]:
import re

## NOTE: the travels are aggregated per state *before* joining with the
## population table. Joining travel rows directly and summing the population
## inside each group multiplies the population by the number of rows, so the
## ratio collapses to 1 / population; dropping duplicate ids would also throw
## travels away.
## NOTE: the join is an inner join, so travels whose `found` value has no
## entry in the population table are left out; they are listed below.

# Population as integers: the '.' characters are removed literally
# (regex=False) before the conversion.
state_population = populationsDf.assign(
    population=populationsDf['population'].astype(str).str.replace('.', '', regex=False).astype(int)
)

# One row per state with its number of travel rows
travels_per_state = exercise_8.groupby('found').size().rename('travels').reset_index()
print(f'number of travel rows: {exercise_8.shape[0]}')
print(f'number of distinct states in travels: {travels_per_state.shape[0]}')

# Attach the population to each state
exercise_9 = travels_per_state.merge(state_population, on='found', how='inner')
print(f'number of states matched with a population: {exercise_9.shape[0]}')
unmatched = sorted(set(travels_per_state['found']) - set(exercise_9['found']))
print(f'states without a population entry: {unmatched}')

# Ratio between the number of travels and the population, indexed by state
exercise_9['ratio'] = exercise_9['travels'] / exercise_9['population']
results_df = exercise_9.set_index('found')[['ratio']].rename_axis(None)


In [None]:
# Missing values are dropped before sorting: NaN cannot be ordered against
# strings, so sorted() would raise a TypeError on a mixed array.
print(sorted(exercise_8.found.dropna().unique()))
results_df

### 10. For each dog, compute the number of days from the `posted` day to the day of last access.

In [None]:
# Creating a df copy for this exercise
exercise_10 = dogs[['id', 'name', 'posted', 'accessed']].copy()

# Number of days from the 'posted' day to the day of last access (assumed to
# be the 'accessed' column), stored in 'days_delay'.
# Both timestamps are reduced to timezone-naive midnights and subtracted as
# datetime64 values; subtracting object-dtype `.dt.date` Series and then
# calling `.dt.days` only works when pandas happens to infer a timedelta dtype.
posted_day = (
    pd.to_datetime(exercise_10['posted'], errors='coerce', utc=True)
    .dt.tz_localize(None)
    .dt.normalize()
)
exercise_10['posted'] = posted_day
exercise_10['accessed'] = pd.to_datetime(exercise_10['accessed'])
accessed_day = (
    pd.to_datetime(exercise_10['accessed'], utc=True)
    .dt.tz_localize(None)
    .dt.normalize()
)
# Rows with a missing date get a missing (NaN) delay
exercise_10['days_delay'] = (accessed_day - posted_day).dt.days

# Printing the result
exercise_10

### 11. Partition the dogs according to the number of weeks from the `posted` day to the day of last access.

In [None]:
# Creating a real copy for this exercise, so that adding the 'weeks' column
# does not modify exercise_10
exercise_11 = exercise_10.copy()

# Whole weeks from the posted day to the day of last access. Floor division
# already yields whole numbers; the nullable Int64 dtype keeps rows with a
# missing delay as <NA> instead of failing on the integer cast.
exercise_11["weeks"] = (exercise_11["days_delay"] // 7).astype("Int64")

# Number of dogs in each partition, one partition per 'weeks' value
partitioned_dogs = exercise_11.groupby("weeks")['id'].count().to_frame('number_of_dogs')
# Printing them
partitioned_dogs

### 12. Find duplicates in the `dogs` dataset. Two records are duplicates if they have (1) same breeds and sex, and (2) they share at least 90% of the words in the description field. Extra points if you find and implement a more refined method for determining if two rows are duplicates.

In [None]:
# lowercase, remove punctuation, tokenize, lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
# Fetch the NLTK resources used below
for resource in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()
# English stop words plus a few extra tokens to be ignored in descriptions
stop = stopwords.words('english') + ['dog', 'dogs', '-', 'old']

In [None]:
dogs_12 = dogs[['id', 'breed_primary', 'sex', 'description']].copy()

# Keep only the records that have a description; the explicit copy lets the
# new columns below be added without writing into a slice of another frame.
dogs_12 = dogs_12[dogs_12['description'].notnull()].copy()

# Set lookup instead of scanning the stop-word list for every word
stop_words = set(stop)

# Lower-case the text and blank out unwanted characters. Inside a character
# class the e-mail/URL-looking fragments are just individual characters, so
# the pattern keeps word characters, spaces, '+', '.' and '@' and replaces
# everything else with a space. Raw strings avoid invalid escape sequences.
dogs_12['lemm_description'] = dogs_12.description.str.lower().str.replace(r'[^a-zA-Z0-9 \w+\.\w+@\w+\.\w \w+@\w+\.\w www.\w+\.\w]', ' ', regex=True)
# Remove a full stop that follows a word character and precedes a space, then
# strip leading and trailing full stops (the text is already lower case).
dogs_12['lemm_description'] = dogs_12['lemm_description'].str.replace(r'(\w)(\. )', r'\1 ', regex=True).str.strip('.')
# Drop the stop words and lemmatize the remaining words
dogs_12['lemm_description'] = dogs_12['lemm_description'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words))

In [None]:
from collections import Counter
pd.set_option('display.max_colwidth', 500)  # show long descriptions (pandas default is 50)

# Regex patterns are raw strings so that \w, \d and \s reach the regex engine
# without going through (invalid) string escape sequences.
# Glue a number to the word after it and, when present, the word before it
# (for example 'about 3 year' -> 'about3year').
dogs_12['cleaned_description'] = dogs_12.lemm_description.str.replace(r'(\w+)? ?(\d+) (\w+)', r'\1\2\3', regex=True)
# Drop single characters standing alone between two spaces
dogs_12['cleaned_description'] = dogs_12.cleaned_description.str.replace(r' \w ', ' ', regex=True)
# Collapse runs of whitespace into one space
dogs_12['cleaned_description'] = dogs_12.cleaned_description.str.replace(r'\s+', ' ', regex=True)
print(f'rows before pruning: {len(dogs_12)}')
dogs_12 = dogs_12[dogs_12['cleaned_description'].notnull()].copy()
print(f'rows after pruning: {len(dogs_12)}')
# Word frequencies of each description, and the set of its distinct words
dogs_12['description_counter'] = dogs_12['cleaned_description'].apply(lambda x: dict(Counter(x.split())))
dogs_12['description_dictionary'] = dogs_12['description_counter'].apply(lambda x: set(x.keys()))

In [None]:
# Compare the number of words before and after cleaning over the whole corpus
all_description_words = dogs_12.description.apply(lambda x : len(str(x).split())).sum()
all_cleaned_words = dogs_12.cleaned_description.apply(lambda x : len(str(x).split())).sum()

# Percentage of words that survive the cleaning (0 for an empty corpus) ...
ratio_cleaned_words = round(100*all_cleaned_words/all_description_words, 2) if all_description_words else 0.0
# ... and its complement: the words removed (stop words, single characters)
# or merged with a neighbouring number. The kept share is not the stop-word share.
ratio_removed_words = round(100 - ratio_cleaned_words, 2) if all_description_words else 0.0

print(f"""{ratio_removed_words}% of words were removed by the cleaning ({ratio_cleaned_words}% kept)""")

In [None]:
# A cluster is a (primary breed, sex) pair; count the dogs in each cluster
print(f'dogs before clustering: {dogs_12.shape[0]}')
cluster_sizes = dogs_12.groupby(['breed_primary', 'sex'], as_index=False)['id'].count()
dogs_clusters = cluster_sizes.rename(columns={'id': 'counts'})
print(f'dogs after clustering: {dogs_clusters.counts.sum()}')
print(f'dogs clusters: {dogs_clusters.shape}')
dogs_clusters

In [None]:
import os
from itertools import combinations

# Pairs of dogs considered duplicates: same sex, same primary breed, and a
# word overlap between the cleaned descriptions of at least the threshold.
# NOTE(review): the overlap is intersection / union of the two word sets;
# confirm this matches the intended "share at least 90% of the words".
# NOTE(review): only the primary breed is compared, not the secondary one.
duplicates = []
threashold = 0.9  # (sic) name kept unchanged in case other cells refer to it
counter = 0       # number of pairs compared

# one sex at a time, to keep the frames being filtered small
for sex in ['Male', 'Female']:
    clusters_by_sex = dogs_clusters[dogs_clusters['sex'] == sex][['breed_primary', 'counts']]
    dogs_by_sex = dogs_12[dogs_12['sex'] == sex]
    cluster_size = clusters_by_sex.shape[0]
    print(f'sex: {sex}')

    # one cluster (primary breed) at a time
    for cluster_number, (breed_primary, counts) in enumerate(clusters_by_sex.values, start=1):
        print(f'processing cluster number: {cluster_number} of {cluster_size}--> {breed_primary} ({counts})')

        this_cluster = dogs_by_sex[dogs_by_sex['breed_primary'] == breed_primary]
        # Pull the needed fields out once: positional row access inside the
        # pair loop is slow, and iterating the rows actually present avoids
        # relying on the precomputed `counts`.
        records = list(zip(
            this_cluster['id'],
            this_cluster['cleaned_description'],
            this_cluster['description_dictionary'],
        ))

        # compare every record with the ones after it in the cluster
        for (first_id, desc1, set1), (second_id, desc2, set2) in combinations(records, 2):
            counter += 1

            if desc1 == desc2:
                overlap_ratio = 1.0
            else:
                union = len(set1 | set2)
                # two descriptions without any word have the same (empty) word set
                overlap_ratio = len(set1 & set2) / union if union else 1.0

            # keep the pair only when the overlap reaches the threshold,
            # and record the overlap that was actually measured
            if overlap_ratio >= threashold:
                duplicates.append({
                    'sex': sex,
                    'breed_primary': breed_primary,
                    'first': first_id,
                    'second': second_id,
                    'overlap_ratio': overlap_ratio,
                })

# save the duplicates; the columns are fixed so that an empty result still
# yields a file with a header, and the output directory is created if missing
df = pd.DataFrame(duplicates, columns=['sex', 'breed_primary', 'first', 'second', 'overlap_ratio'])
os.makedirs('clusters', exist_ok=True)
df.to_csv('clusters/duplicates_full.csv', index=False, sep=',', encoding='utf-8')
print(counter)