# Python Package Manager Exercises:
1. Read the text at this URL and find the 10 most frequent words: romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'

In [None]:
import requests
import re
from collections import Counter

romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'

# Download the play. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops an HTTP error page from being
# analysed as if it were the play.
response = requests.get(romeo_and_juliet, timeout=30)
response.raise_for_status()
text = response.text
def find_most_common_words(text_or_file, num_words):
    """Return the ``num_words`` most frequent words of a text or a text file.

    Args:
        text_or_file: Either the path of an existing file, whose content is
            analysed, or the text to analyse itself.
        num_words: Number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``. Words are
        lower-cased before counting.

    Raises:
        ValueError: If ``text_or_file`` is not a string.
    """
    import os

    if not isinstance(text_or_file, str):
        raise ValueError("The first parameter should be a string representing text or a file path.")

    # Decide "path or text" with isfile() instead of probing with open():
    # open() on a long text raises OSError (name too long) and on text with a
    # NUL byte raises ValueError, neither of which is FileNotFoundError.
    if os.path.isfile(text_or_file):
        with open(text_or_file, 'r') as file:
            text = file.read()
    else:
        text = text_or_file

    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words).most_common(num_words)


# Analyse the downloaded play (``text``), not the URL string: passing
# ``romeo_and_juliet`` would only count the words that make up the address.
romeo_and_juliet_common_words = find_most_common_words(text, 10)
print("Romeo and Juliet:", romeo_and_juliet_common_words)

2. Read the cats API (cats_api = 'https://api.thecatapi.com/v1/breeds') and find:
* i. the min, max, mean, median, standard deviation of cats' weight in metric units.
* ii. the min, max, mean, median, standard deviation of cats' lifespan in years.
* iii. Create a frequency table of country and breed of cats


In [None]:
import requests
import pandas as pd

cats_api = 'https://api.thecatapi.com/v1/breeds'

# Bound the wait with a timeout and surface HTTP errors immediately, so an
# error payload is never parsed as if it were the breed list.
response = requests.get(cats_api, timeout=30)
response.raise_for_status()
cats_data = response.json()

# Create a DataFrame from the cats data
cats_df = pd.DataFrame(cats_data)

# Extract weight and lifespan information
def _parse_range(value):
    """Split a range string such as "3 - 5" into a ``(low, high)`` float pair.

    Whitespace around the dash is ignored, a single number is returned as
    both ends, and a string without any number yields ``(nan, nan)``.
    """
    parts = [part.strip() for part in value.split('-')]
    parts = [part for part in parts if part]
    if not parts:
        return float('nan'), float('nan')
    return float(parts[0]), float(parts[-1])


# Splitting on the literal ' - ' and indexing [1] raises IndexError as soon as
# one entry uses different spacing or holds a single value, so every range is
# routed through the tolerant parser above.
cats_df['weight_metric'] = cats_df['weight'].apply(lambda x: x['metric'])
cats_df['weight_min'] = cats_df['weight_metric'].apply(lambda x: _parse_range(x)[0])
cats_df['weight_max'] = cats_df['weight_metric'].apply(lambda x: _parse_range(x)[1])
cats_df['lifespan'] = cats_df['life_span'].apply(lambda x: [part.strip() for part in x.split('-')])
cats_df['lifespan_min'] = cats_df['life_span'].apply(lambda x: _parse_range(x)[0])
cats_df['lifespan_max'] = cats_df['life_span'].apply(lambda x: _parse_range(x)[1])

# Calculate min, max, mean, median, and standard deviation for weight.
# Every row carries a low-high range, so the overall min/max come from the
# range ends, while mean/median/std are computed over one representative value
# per row (the midpoint of its range). Chaining .median().median() or
# .std().std() would summarise the two per-column summaries instead -- the
# "std" would be the spread between just two numbers.
weight_midpoint = cats_df[['weight_min', 'weight_max']].mean(axis=1)
weight_min = cats_df['weight_min'].min()
weight_max = cats_df['weight_max'].max()
weight_mean = weight_midpoint.mean()
weight_median = weight_midpoint.median()
weight_std = weight_midpoint.std()

# Calculate min, max, mean, median, and standard deviation for lifespan,
# using the same per-row midpoint approach as for weight.
lifespan_midpoint = cats_df[['lifespan_min', 'lifespan_max']].mean(axis=1)
lifespan_min = cats_df['lifespan_min'].min()
lifespan_max = cats_df['lifespan_max'].max()
lifespan_mean = lifespan_midpoint.mean()
lifespan_median = lifespan_midpoint.median()
lifespan_std = lifespan_midpoint.std()

# Frequency table: how many breed names are recorded for each origin.
frequency_table = (
    cats_df.groupby('origin')['name']
    .count()
    .reset_index()
    .rename(columns={'name': 'breed_count'})
)

# Report the summary statistics, then the per-origin breed counts.
print(
    f"Weight (metric units) - Min: {weight_min}, Max: {weight_max}, "
    f"Mean: {weight_mean}, Median: {weight_median}, Std: {weight_std}"
)
print(
    f"Lifespan (years) - Min: {lifespan_min}, Max: {lifespan_max}, "
    f"Mean: {lifespan_mean}, Median: {lifespan_median}, Std: {lifespan_std}"
)
print(frequency_table)

3. Read the countries API and find:
* the 10 largest countries
* the 10 most spoken languages
* the total number of languages in the countries API

In [None]:
import requests
import pandas as pd

countries_api = 'https://restcountries.com/v3.1/all'

# Bound the wait with a timeout and raise on HTTP errors, so a failed request
# is reported as such instead of surfacing later as a confusing DataFrame error.
# NOTE(review): this endpoint may require a `fields` query parameter
# (e.g. name, area, languages) -- confirm against the current API docs.
response = requests.get(countries_api, timeout=30)
response.raise_for_status()
countries_data = response.json()

countries_df = pd.DataFrame(countries_data)

countries_df['area'] = countries_df['area'].astype(float)
# Each row's languages become a plain list of language names; rows without a
# mapping get an empty list so that explode() below still works.
countries_df['languages'] = countries_df['languages'].apply(lambda x: list(x.values()) if isinstance(x, dict) else [])

# 10 largest countries by area.
# NOTE(review): `name` appears to be a nested mapping (with a 'common' entry)
# in this API version -- confirm. When it is, keep only the common name so the
# table is readable; any other value is left untouched.
largest_countries = countries_df.nlargest(10, 'area')[['name', 'area']].assign(
    name=lambda frame: frame['name'].apply(
        lambda value: value.get('common', value) if isinstance(value, dict) else value
    )
)

# 10 most spoken languages, ranked by the number of rows (countries) that list
# each language.
all_languages = countries_df['languages'].explode()
most_spoken_languages = all_languages.value_counts().nlargest(10)

# the total number of distinct languages
total_languages = all_languages.nunique()

# Print each heading followed by its result on the next line.
print("10 Largest Countries by Area:", largest_countries, sep="\n")

print("\n10 Most Spoken Languages:", most_spoken_languages, sep="\n")

print("\nTotal Number of Languages:", total_languages, sep="\n")

4. UCI is one of the most common places to get datasets for data science and machine learning. Read the content of the UCI repository (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4.

In [9]:
import requests
from bs4 import BeautifulSoup

uci_url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Bound the wait with a timeout and raise on HTTP errors, so an error page is
# never parsed as if it were the dataset listing.
response = requests.get(uci_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Extract dataset names and links
from urllib.parse import urljoin

datasets = []
for table in soup.find_all('table', {'border': '1', 'cellpadding': '5'}):
    # The first row of each table is skipped (treated as a header row).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if not columns:
            continue
        anchor = columns[0].find('a')
        if anchor is None or not anchor.get('href'):
            # A first cell without a link cannot yield a dataset URL; skip it
            # rather than fail on a missing anchor.
            continue
        dataset_name = columns[0].text.strip()
        # Resolve the href against the page URL. Plain string concatenation
        # would glue a relative href straight onto '.../datasets.php' and
        # produce a malformed address.
        dataset_link = urljoin(uci_url, anchor['href'])
        datasets.append((dataset_name, dataset_link))

# Print the first 10 datasets
for name, link in datasets[:10]:
    print(f"Dataset Name: {name}, Link: {link}")