# Exercise: Level 1

##### Question 1

In [3]:
# Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'
import requests
from collections import Counter
import re

# Fetch the text from the URL
url = 'http://www.gutenberg.org/files/1112/1112.txt'
response = requests.get(url)
if response.status_code == 200:
    text = response.text
else:
    print("Failed to fetch the text. Status code:", response.status_code)
    text = ""

# Clean the text and find the most frequent words
def get_most_common_words(text, n=10):
    words = re.findall(r'\b\w+\b', text.lower())
    word_counts = Counter(words)
    most_common_words = word_counts.most_common(n)
    return most_common_words

most_frequent_words = get_most_common_words(text, 10)

print("The ten most frequent words in 'Romeo and Juliet':")
for word, count in most_frequent_words:
    print(f"{word}: {count} times")

Failed to fetch the text. Status code: 404
The ten most frequent words in 'Romeo and Juliet':


##### Question 2

In [4]:
# Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find :
# the min, max, mean, median, standard deviation of cats' weight in metric units.
# the min, max, mean, median, standard deviation of cats' lifespan in years.
# Create a frequency table of country and breed of cats
import requests
import statistics

cats_api = 'https://api.thecatapi.com/v1/breeds'

# Fetch data from the Cat API
response = requests.get(cats_api)
if response.status_code == 200:
    cat_data = response.json()
else:
    print("Failed to fetch cat data. Status code:", response.status_code)
    cat_data = []

# Extract weight and lifespan data
weights_metric = [cat.get('weight').get('metric') for cat in cat_data if cat.get('weight')]
lifespans_years = [cat.get('life_span') for cat in cat_data]

# Convert weight strings to numeric values (taking the average if a range is provided)
weights_numeric = []
for weight in weights_metric:
    parts = weight.split('-')
    if len(parts) == 1:
        weights_numeric.append(float(parts[0]))
    else:
        weights_numeric.append((float(parts[0]) + float(parts[1])) / 2)

# Convert lifespan strings to numeric values (taking the average if a range is provided)
lifespans_numeric = []
for lifespan in lifespans_years:
    parts = lifespan.split('-')
    if len(parts) == 1:
        lifespans_numeric.append(float(parts[0]))
    else:
        lifespans_numeric.append((float(parts[0]) + float(parts[1])) / 2)

# Calculate statistics
weight_stats = {
    'min': min(weights_numeric),
    'max': max(weights_numeric),
    'mean': statistics.mean(weights_numeric),
    'median': statistics.median(weights_numeric),
    'std_dev': statistics.stdev(weights_numeric)
}

lifespan_stats = {
    'min': min(lifespans_numeric),
    'max': max(lifespans_numeric),
    'mean': statistics.mean(lifespans_numeric),
    'median': statistics.median(lifespans_numeric),
    'std_dev': statistics.stdev(lifespans_numeric)
}

# Create a frequency table of country and breed
freq_table = {}
for cat in cat_data:
    country = cat.get('origin')
    breed = cat.get('name')
    if country and breed:
        key = f"{country} - {breed}"
        freq_table[key] = freq_table.get(key, 0) + 1

# Print results
print("Statistics for cat weight (metric units):", weight_stats)
print("Statistics for cat lifespan (years):", lifespan_stats)
print("\nFrequency table of country and breed:")
for key, value in freq_table.items():
    print(f"{key}: {value} occurrences")

Statistics for cat weight (metric units): {'min': 3.0, 'max': 7.5, 'mean': 4.708955223880597, 'median': 4.5, 'std_dev': 1.066533799956462}
Statistics for cat lifespan (years): {'min': 10.5, 'max': 19.0, 'mean': 13.746268656716419, 'median': 13.5, 'std_dev': 1.5844249849048053}

Frequency table of country and breed:
Egypt - Abyssinian: 1 occurrences
Greece - Aegean: 1 occurrences
United States - American Bobtail: 1 occurrences
United States - American Curl: 1 occurrences
United States - American Shorthair: 1 occurrences
United States - American Wirehair: 1 occurrences
United Arab Emirates - Arabian Mau: 1 occurrences
Australia - Australian Mist: 1 occurrences
United States - Balinese: 1 occurrences
United States - Bambino: 1 occurrences
United States - Bengal: 1 occurrences
France - Birman: 1 occurrences
United States - Bombay: 1 occurrences
United Kingdom - British Longhair: 1 occurrences
United Kingdom - British Shorthair: 1 occurrences
Burma - Burmese: 1 occurrences
United Kingdom - 

##### Question 3

In [6]:
# Read the countries API and find
# the 10 largest countries
# the 10 most spoken languages
# the total number of languages in the countries API
import requests

# Fetch data from the countries API
countries_api = 'https://restcountries.com/v3.1/all'
response = requests.get(countries_api)
if response.status_code == 200:
    countries_data = response.json()
else:
    print("Failed to fetch countries data. Status code:", response.status_code)
    countries_data = []

# Extract country sizes and languages
country_sizes = {}
spoken_languages = []

for country in countries_data:
    if 'name' in country and 'area' in country:
        country_sizes[country['name']['common']] = country['area']

    if 'languages' in country:
        spoken_languages.extend(country['languages'].keys())

# Calculate the 10 largest countries
largest_countries = sorted(country_sizes.items(), key=lambda x: x[1], reverse=True)[:10]

# Calculate the 10 most spoken languages
language_count = {}
for language in spoken_languages:
    language_count[language] = language_count.get(language, 0) + 1

most_spoken_languages = sorted(language_count.items(), key=lambda x: x[1], reverse=True)[:10]

# Calculate the total number of languages
total_languages = len(set(spoken_languages))

# Display results
print("10 Largest Countries:")
for country, size in largest_countries:
    print(f"{country}: {size} square kilometers")

print("\n10 Most Spoken Languages:")
for language, count in most_spoken_languages:
    print(f"{language}: {count} countries")

print(f"\nTotal Number of Languages: {total_languages}")

10 Largest Countries:
Russia: 17098242.0 square kilometers
Antarctica: 14000000.0 square kilometers
Canada: 9984670.0 square kilometers
China: 9706961.0 square kilometers
United States: 9372610.0 square kilometers
Brazil: 8515767.0 square kilometers
Australia: 7692024.0 square kilometers
India: 3287590.0 square kilometers
Argentina: 2780400.0 square kilometers
Kazakhstan: 2724900.0 square kilometers

10 Most Spoken Languages:
eng: 91 countries
fra: 46 countries
ara: 25 countries
spa: 24 countries
por: 10 countries
nld: 7 countries
rus: 7 countries
deu: 5 countries
zho: 5 countries
tsn: 4 countries

Total Number of Languages: 155


##### Question 4

In [16]:
import requests
from bs4 import BeautifulSoup

# URL of the dataset page
url = 'https://archive.ics.uci.edu/ml/datasets/Car+Evaluation'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find and print the main content of the page (for example, the title)
    title = soup.find('title')
    if title:
        print("Title of the page:", title.text)
    else:
        print("Title not found.")

    # You can find and extract other elements of the page using BeautifulSoup methods
    # For example, finding all paragraph elements
    paragraphs = soup.find_all('p')
    for paragraph in paragraphs:
        print(paragraph.text)
else:
    print("Failed to retrieve the webpage")

Title of the page: UCI Machine Learning Repository
 Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.
Multivariate
Other
Classification
Categorical
1728
6
Additional Information
Car Evaluation Database was derived from a simple hierarchical decision model originally developed for the demonstration of DEX, M. Bohanec, V. Rajkovic: Expert system for decision making. Sistemica 1(1), pp. 145-157, 1990.). The model evaluates cars according to the following concept structure:

CAR                      car acceptability
. PRICE                  overall price
. . buying               buying price
. . maint                price of the maintenance
. TECH                   technical characteristics
. . COMFORT              comfort
. . . doors              number of doors
. . . persons            capacity in terms of persons to carry
. . . lug_boot           the size of luggage boot
. . safety           