# Exercise Day 20: PIP

## Exercise no. 1

In [11]:
import requests
from bs4 import BeautifulSoup
from collections import Counter

# URL of the text
romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'

# Send a GET request to the URL
response = requests.get(romeo_and_juliet_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text from the HTML content
    text = soup.get_text()

    # Tokenize the text into words
    words = text.split()

    # Count the occurrences of each word
    word_counts = Counter(words)

    # Get the 10 most common words
    most_common_words = word_counts.most_common(10)

    # Print the result
    print(most_common_words)

else:
    print(f"Error: Unable to retrieve content from the URL. Status code: {response.status_code}")


Error: Unable to retrieve content from the URL. Status code: 404


## Exercise no. 2

In [12]:
import requests
import statistics

# URL of the cats API
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Send a GET request to the API
response = requests.get(cats_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    cat_data = response.json()

    # Extract weights in metric units and convert to a list of floats
    weights = [float(cat.get('weight', {}).get('metric', '').split()[0]) for cat in cat_data]

    # Calculate statistics
    min_weight = min(weights)
    max_weight = max(weights)
    mean_weight = statistics.mean(weights)
    median_weight = statistics.median(weights)
    std_dev_weight = statistics.stdev(weights)

    # Print the results
    print(f"Minimum Weight: {min_weight} kg")
    print(f"Maximum Weight: {max_weight} kg")
    print(f"Mean Weight: {mean_weight} kg")
    print(f"Median Weight: {median_weight} kg")
    print(f"Standard Deviation of Weight: {std_dev_weight} kg")

else:
    print(f"Error: Unable to retrieve data from the cats API. Status code: {response.status_code}")


Minimum Weight: 2.0 kg
Maximum Weight: 5.0 kg
Mean Weight: 3.2238805970149254 kg
Median Weight: 3.0 kg
Standard Deviation of Weight: 0.8845628182703051 kg


In [13]:
import requests
import statistics

# URL of the cats API
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Send a GET request to the API
response = requests.get(cats_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    cat_data = response.json()

    # Extract lifespan in years and convert to a list of floats
    lifespans = [float(cat.get('life_span', '').split()[0]) for cat in cat_data if cat.get('life_span', '')]

    # Calculate statistics
    min_lifespan = min(lifespans)
    max_lifespan = max(lifespans)
    mean_lifespan = statistics.mean(lifespans)
    median_lifespan = statistics.median(lifespans)
    std_dev_lifespan = statistics.stdev(lifespans)

    # Print the results
    print(f"Minimum Lifespan: {min_lifespan} years")
    print(f"Maximum Lifespan: {max_lifespan} years")
    print(f"Mean Lifespan: {mean_lifespan} years")
    print(f"Median Lifespan: {median_lifespan} years")
    print(f"Standard Deviation of Lifespan: {std_dev_lifespan} years")

else:
    print(f"Error: Unable to retrieve data from the cats API. Status code: {response.status_code}")



Minimum Lifespan: 8.0 years
Maximum Lifespan: 18.0 years
Mean Lifespan: 12.074626865671641 years
Median Lifespan: 12.0 years
Standard Deviation of Lifespan: 1.8283411328456127 years


In [14]:

import requests
from collections import Counter

# URL of the cats API
cats_api = 'https://api.thecatapi.com/v1/breeds'

# Send a GET request to the API
response = requests.get(cats_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    cat_data = response.json()

    # Extract country and breed information and create a frequency table
    country_breed_freq = Counter((cat.get('origin', ''), cat.get('name', '')) for cat in cat_data)

    # Print the frequency table
    print("Frequency Table of Country and Breed of Cats:")
    print("{:<20} {:<30} {:<10}".format("Country", "Breed", "Frequency"))
    print("="*60)
    for (country, breed), frequency in country_breed_freq.items():
        print("{:<20} {:<30} {:<10}".format(country, breed, frequency))

else:
    print(f"Error: Unable to retrieve data from the cats API. Status code: {response.status_code}")



import requests

# URL of the countries API
countries_api = 'https://restcountries.com/v2/all'

# Send a GET request to the API
response = requests.get(countries_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    countries_data = response.json()

    # Sort countries based on total area in descending order
    sorted_countries = sorted(countries_data, key=lambda x: x.get('area', 0), reverse=True)

    # Get the top 10 largest countries
    top_10_largest_countries = sorted_countries[:10]

    # Print the list of 10 largest countries
    print("Top 10 Largest Countries:")
    print("{:<3} {:<40} {:<15}".format("Rank", "Country", "Total Area (sq km)"))
    print("="*60)
    for i, country in enumerate(top_10_largest_countries, start=1):
        country_name = country.get('name', '')
        total_area = country.get('area', 0)
        print("{:<3} {:<40} {:<15}".format(i, country_name, total_area))

else:
    print(f"Error: Unable to retrieve data from the countries API. Status code: {response.status_code}")


Frequency Table of Country and Breed of Cats:
Country              Breed                          Frequency 
Egypt                Abyssinian                     1         
Greece               Aegean                         1         
United States        American Bobtail               1         
United States        American Curl                  1         
United States        American Shorthair             1         
United States        American Wirehair              1         
United Arab Emirates Arabian Mau                    1         
Australia            Australian Mist                1         
United States        Balinese                       1         
United States        Bambino                        1         
United States        Bengal                         1         
France               Birman                         1         
United States        Bombay                         1         
United Kingdom       British Longhair               1         
United Ki

## Exercise no. 3

In [15]:

import requests

# URL of the countries API
countries_api = 'https://restcountries.com/v2/all'

# Send a GET request to the API
response = requests.get(countries_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    countries_data = response.json()

    # Extract language names from all countries
    all_languages = [language['name'] for country in countries_data for language in country.get('languages', [])]

    # Create a frequency table for languages
    language_frequency = {}
    for language in all_languages:
        language_frequency[language] = language_frequency.get(language, 0) + 1

    # Sort languages based on frequency in descending order
    sorted_languages = sorted(language_frequency.items(), key=lambda x: x[1], reverse=True)

    # Get the top 10 most spoken languages
    top_10_languages = sorted_languages[:10]

    # Print the list of 10 most spoken languages
    print("Top 10 Most Spoken Languages:")
    print("{:<3} {:<25} {:<10}".format("Rank", "Language", "Frequency"))
    print("="*50)
    for i, (language, frequency) in enumerate(top_10_languages, start=1):
        print("{:<3} {:<25} {:<10}".format(i, language, frequency))

else:
    print(f"Error: Unable to retrieve data from the countries API. Status code: {response.status_code}")


Top 10 Most Spoken Languages:
Rank Language                  Frequency 
1   English                   91        
2   French                    45        
3   Arabic                    25        
4   Spanish                   24        
5   Portuguese                10        
6   Russian                   8         
7   Dutch                     8         
8   German                    7         
9   Chinese                   5         
10  Serbian                   4         


In [16]:
import requests

# URL of the countries API
countries_api = 'https://restcountries.com/v2/all'

# Send a GET request to the API
response = requests.get(countries_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    countries_data = response.json()

    # Extract language codes from all countries
    all_languages = [language.get('iso639_1', None) for country in countries_data for language in country.get('languages', [])]

    # Remove None values
    all_languages = [lang for lang in all_languages if lang is not None]

    # Get the total number of unique languages
    total_languages = len(set(all_languages))

    print(f"Total number of languages in the countries API: {total_languages}")

else:
    print(f"Error: Unable to retrieve data from the countries API. Status code: {response.status_code}")



Total number of languages in the countries API: 112


## Exercise no. 4

In [22]:
import requests
from bs4 import BeautifulSoup

def get_ucl_datasets(url):
    try:
        # Send a GET request to the UCL webpage
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad responses

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        dataset_links = [a['href'] for a in soup.find_all('a', href=True) if 'dataset' in a['href'].lower()]

        return dataset_links
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")

ucl_url = 'https://archive.ics.uci.edu/ml/datasets.php'
datasets = get_ucl_datasets(ucl_url)

if datasets:
    print("Datasets found:")
    for dataset in datasets:
        print(dataset)


Error: 404 Client Error: Not Found for url: https://archive.ics.uci.edu/datasets.php
