# Overview

This notebook sets up the following directories for the final project.
```
project
├── data
├── LICENSE
├── models
├── notebooks
│   └── 0_setup_project_folders.ipynb
├── README.md
├── report
│   ├── interim.md
│   ├── proposal.md
│   └── report.md
└── src
```

You are free to rename any of the directories or add other directories. You can also switch the LICENSE to any other license type that you prefer.

In [1]:
import os

# Project root = parent of the current working directory (assumes the kernel
# is running from the project's notebooks/ folder -- TODO confirm).
basedir = os.path.dirname(os.getcwd())

subdirs = ['src', 'data', 'models']
for d in subdirs:
    full_path = os.path.join(basedir, d)
    # exist_ok=True makes the cell safe to re-run and removes the gap between
    # a separate existence check and the creation call.
    os.makedirs(full_path, exist_ok=True)

# Sanity check: every expected folder is now present as a directory.
assert all(os.path.isdir(os.path.join(basedir, d)) for d in subdirs)

In [2]:
# Run the shell `tree` command on the parent directory to display the resulting layout.
!tree ../

[38;5;33m../[0m
├── [38;5;33mcpsc6300[0m
│   └── [38;5;33mrhew-holder-rodgers[0m
├── [38;5;33mdata[0m
├── LICENSE
├── List_of_Doggo_Names.csv
├── [38;5;33mmodels[0m
├── [38;5;33mnotebooks[0m
│   ├── 0_setup_project_folders.ipynb
│   ├── Clean and Summarize Dog Data.ipynb
│   ├── Dog_Breeds_With_Rankings.csv
│   ├── List_of_Doggo_Names.csv
│   └── Name_List.py
├── README.md
├── [38;5;33mreport[0m
│   ├── interim.md
│   ├── proposal.md
│   └── report.md
├── [38;5;33msrc[0m
└── untitled.txt

7 directories, 12 files


In [3]:
from bs4 import BeautifulSoup
import requests

In [4]:
'''First we are trying to get data from one page for one breed'''
url = 'https://dogtime.com/dog-breeds/border-collie'
# A timeout keeps the cell from hanging forever if the site never responds.
url_html = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of parsing an error page as breed data.
url_html.raise_for_status()
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(url_html.text, 'html.parser')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [5]:
'''Using the BeautifulSoup object we created above, we are parsing for characteristics and star quality associated with them on a scale of 1-5'''

'Using the BeautifulSoup object we created above, we are parsing for characteristics and star quality associated with them on a scale of 1-5'

In [6]:
'''We are creating lists for the attributes and star rankings of the Border Collie'''

'We are creating lists for the attributes and star rankings of the Border Collie'

In [7]:
# Every top-level characteristic block on the page carries one title <h3> and
# at least one star <div>.
titles_html = soup.find_all('div', {'class': 'characteristic-stars parent-characteristic'})

# Title text with its first character dropped (presumably a leading space in
# the page markup -- TODO confirm).
titles = [
    block.find_all('h3', {'class': 'characteristic-title'})[0].string[1:]
    for block in titles_html
]

# The rating is the last character of the star div's second CSS class,
# converted to an integer.
star_ranks = [
    int(block.find_all('div', {'class': 'star'})[0]['class'][1][-1])
    for block in titles_html
]

print(titles)
print(star_ranks)


['Adaptability', 'All Around Friendliness', 'Health And Grooming Needs', 'Trainability', 'Physical Needs']
[3, 4, 3, 4, 5]


In [8]:
'''Now we are going to make a function to automate this process with a lot of dog breeds'''

'Now we are going to make a function to automate this process with a lot of dog breeds'

In [9]:
def get_url(dog_breed):
    """Build the dogtime.com breed-page URL for a breed name.

    Args:
        dog_breed: Breed name as it should appear in the URL path
            (words joined by dashes, e.g. 'border-collie').

    Returns:
        The breed name appended to 'https://dogtime.com/dog-breeds/'.
    """
    base_url = 'https://dogtime.com/dog-breeds/'
    return base_url + dog_breed

In [10]:
'''test get_url function... file with dog breeds includes dashes in name'''

'test get_url function... file with dog breeds includes dashes in name'

In [11]:
# Smoke-test get_url with a dash-separated breed name; the bare name on the
# last line makes the notebook display the resulting URL.
url_d = get_url('american-border-collie')
url_d

'https://dogtime.com/dog-breeds/american-border-collie'

In [12]:
'''create function to get attributes and rankings based on url'''

'create function to get attributes and rankings based on url'

In [13]:
import pandas as pd
def get_attributes(url, timeout=30):
    """Scrape the top-level characteristic ratings from a breed page.

    Args:
        url: Address of the breed page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; pass ``None`` to wait
            indefinitely.

    Returns:
        pandas.Series (dtype object) indexed by characteristic title, whose
        values are the star ratings as one-character strings. The HTTP
        status is not checked; a page without matching characteristic
        blocks yields an empty Series.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    titles = []
    star_ranks = []
    for block in soup.find_all('div', {'class': 'characteristic-stars parent-characteristic'}):
        heading = block.find_all('h3', {'class': 'characteristic-title'})[0]
        # Drop the first character of the heading text (presumably a leading
        # space in the page markup -- TODO confirm).
        titles.append(heading.string[1:])
        # The rating is the last character of the star div's second CSS class.
        star_classes = block.find_all('div', {'class': 'star'})[0]['class']
        star_ranks.append(star_classes[1][-1])

    return pd.Series(index=titles, data=star_ranks, dtype=object)
    

In [14]:
'''test get attributes'''

'test get attributes'

In [15]:
# Smoke-test the two helpers together on one breed: build the URL, scrape it, print the ratings.
print(get_attributes(get_url('Border-collie')))

Adaptability                 3
All Around Friendliness      4
Health And Grooming Needs    3
Trainability                 4
Physical Needs               5
dtype: object


In [16]:
'''Now we will use a function to generate a list of dog breeds'''

'Now we will use a function to generate a list of dog breeds'

In [17]:
def get_list_of_names(csvFile):
    """
    Load the table of names stored in a csv file.

    Args:
        csvFile: The csv file with the list of names; its first row is
            read as the column header.

    Returns:
        Pandas DataFrame holding the file's contents (no column is used
        as the index).
    """
    return pd.read_csv(csvFile, index_col=False, header=0)

In [18]:
'''now we will get the list of names using a CSV of names'''

'now we will get the list of names using a CSV of names'

In [19]:
# Load the breed table, then pull its 'Breeds' column out as a plain Python list.
dog_df = get_list_of_names('List_of_Doggo_Names.csv')
dog_list = dog_df['Breeds'].tolist()
dog_list

['Afador',
 'Affenhuahua',
 'Affenpinscher',
 'Afghan-Hound',
 'Airedale-Terrier',
 'Akbash',
 'Akita',
 'Akita-Chow',
 'Akita-Pit',
 'Akita-Shepherd',
 'Alaskan-Klee-Kai',
 'Alaskan-Malamute',
 'American-Bulldog',
 'American-English-Coonhound',
 'American-Eskimo-Dog',
 'American-Foxhound',
 'American-Hairless-Terrier',
 'American-Leopard-Hound',
 'American-Pit-Bull-Terrier',
 'American-Pugabull',
 'American-Staffordshire-Terrier',
 'American-Water-Spaniel',
 'Anatolian-Shepherd-Dog',
 'Appenzeller-Sennenhunde',
 'Auggie',
 'Aussiedoodle',
 'Aussiepom',
 'Australian-Cattle-Dog',
 'Australian-Kelpie',
 'Australian-Retriever',
 'Australian-Shepherd',
 'Australian-Shepherd-Husky',
 'Australian-Shepherd-Lab-Mix',
 'Australian-Shepherd-Pit-Bull-Mix',
 'Australian-Stumpy-Tail-Cattle-Dog',
 'Australian-Terrier',
 'Azawakh',
 'Barbet',
 'Basenji',
 'Bassador',
 'Basset-Fauve-de-Bretagne',
 'Basset-Hound',
 'Basset-Retriever',
 'Bavarian-Mountain-Scent-Hound',
 'Beabull',
 'Beagle',
 'Beaglier'

In [20]:
'''now that we have a list of dogs, we will loop through them and get the attributes and rankings for each of them.

We need to ensure that a url exists for the dog breed'''

'now that we have a list of dogs, we will loop through them and get the attributes and rankings for each of them.\n\nWe need to ensure that a url exists for the dog breed'

In [21]:
list_of_attributes = []

for dog in dog_list:
    # Echo the breed being fetched so progress is visible while scraping.
    print(dog)
    url = get_url(dog)
    series = get_attributes(url)
    # One record per breed: the name followed by each rating in Series order.
    list_of_attributes.append([dog] + list(series))

list_of_attributes

Afador
Affenhuahua
Affenpinscher
Afghan-Hound
Airedale-Terrier
Akbash
Akita
Akita-Chow
Akita-Pit
Akita-Shepherd
Alaskan-Klee-Kai
Alaskan-Malamute
American-Bulldog
American-English-Coonhound
American-Eskimo-Dog
American-Foxhound
American-Hairless-Terrier
American-Leopard-Hound
American-Pit-Bull-Terrier
American-Pugabull
American-Staffordshire-Terrier
American-Water-Spaniel
Anatolian-Shepherd-Dog
Appenzeller-Sennenhunde
Auggie
Aussiedoodle
Aussiepom
Australian-Cattle-Dog
Australian-Kelpie
Australian-Retriever
Australian-Shepherd
Australian-Shepherd-Husky
Australian-Shepherd-Lab-Mix
Australian-Shepherd-Pit-Bull-Mix
Australian-Stumpy-Tail-Cattle-Dog
Australian-Terrier
Azawakh
Barbet
Basenji
Bassador
Basset-Fauve-de-Bretagne
Basset-Hound
Basset-Retriever
Bavarian-Mountain-Scent-Hound
Beabull
Beagle
Beaglier
Bearded-Collie
Bedlington-Terrier
Belgian-Malinois
Belgian-Sheepdog
Belgian-Tervuren
Bergamasco-Sheepdog
Berger-Picard
Bernedoodle
Bernese-Mountain-Dog
Bichon-Frise
Biewer-Terrier
Black-

[['Afador', '2', '3', '3', '4', '4'],
 ['Affenhuahua', '3', '3', '3', '3', '3'],
 ['Affenpinscher', '3', '3', '2', '3', '4'],
 ['Afghan-Hound', '4', '4', '2', '3', '4'],
 ['Airedale-Terrier', '2', '4', '3', '5', '5'],
 ['Akbash', '3', '4', '4', '3', '2'],
 ['Akita', '3', '2', '4', '4', '4'],
 ['Akita-Chow', '3', '1', '3', '3', '4'],
 ['Akita-Pit', '3', '3', '4', '3', '3'],
 ['Akita-Shepherd', '3', '3', '3', '3', '3'],
 ['Alaskan-Klee-Kai', '3', '3', '3', '4', '4'],
 ['Alaskan-Malamute', '2', '4', '3', '4', '5'],
 ['American-Bulldog', '2', '3', '3', '3', '4'],
 ['American-English-Coonhound', '3', '5', '3', '5', '5'],
 ['American-Eskimo-Dog', '4', '5', '3', '4', '4'],
 ['American-Foxhound', '2', '5', '4', '4', '5'],
 ['American-Hairless-Terrier', '3', '4', '3', '3', '4'],
 ['American-Leopard-Hound', '3', '4', '3', '4', '4'],
 ['American-Pit-Bull-Terrier', '3', '4', '3', '4', '5'],
 ['American-Pugabull', '3', '4', '3', '3', '4'],
 ['American-Staffordshire-Terrier', '2', '4', '3', '4', '4'

In [22]:
'''create dataframe with titles as column headings'''
# Build the header as a new list rather than inserting into `titles` in place,
# so re-running this cell cannot stack extra 'Breed' entries and break the
# column count. Any 'Breed' already present in `titles` is skipped.
column_names = ['Breed'] + [t for t in titles if t != 'Breed']
dog_df = pd.DataFrame(list_of_attributes, columns=column_names)

In [23]:
# Only promote 'Breed' to the index while it is still a column, so re-running
# this cell does not fail with a KeyError.
if 'Breed' in dog_df.columns:
    dog_df = dog_df.set_index('Breed')
print(dog_df.shape)
dog_df

(381, 5)


Unnamed: 0_level_0,Adaptability,All Around Friendliness,Health And Grooming Needs,Trainability,Physical Needs
Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afador,2,3,3,4,4
Affenhuahua,3,3,3,3,3
Affenpinscher,3,3,2,3,4
Afghan-Hound,4,4,2,3,4
Airedale-Terrier,2,4,3,5,5
...,...,...,...,...,...
Whoodle,3,4,3,2,4
Wirehaired-Pointing-Griffon,3,5,3,4,4
Xoloitzcuintli,,,,,
Yorkipoo,4,4,2,4,4


In [24]:
# Write the breed ratings table to a CSV file at a path relative to the working directory.
dog_df.to_csv('Dog_Breeds_With_Rankings.csv')

In [25]:
"""Now we have a completed dataframe with over 350 dog breeds and rankings for 5 categories that will help us later rank these dogs based on a variety of things that an owner is looking for in a dog.

This is saved to the file 'Dog_Breeds_With_Rankings.csv' that we will use later on"""

"Now we have a completed dataframe with over 350 dog breeds and rankings for 5 categories that will help us later rank these dogs based on a variety of things that an owner is looking for in a dog.\n\nThis is saved to the file 'Dog_Breeds_With_Rankings.csv' that we will use later on"