<a href="https://colab.research.google.com/github/Anissa7/Anissa.github.io/blob/main/Web_scapping_assignment_Group_8_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 0

In [None]:
import numpy as np
import pandas as pd
from bs4  import BeautifulSoup
import requests
import os
from PIL import Image
from io import BytesIO


In [None]:

def availableShips(passengerCount):
    """
    Return the names of the starships that can carry at least
    ``passengerCount`` passengers.

    The starships endpoint is paginated, so every page is requested in turn
    by following the ``next`` link of each response.

    Args:
        passengerCount (int): number of passengers
        to be transported

    Returns:
        list: List of ship names that can hold the given number of passengers.
        If a request fails part-way through, the ships collected up to that
        point are returned.
    """
    url_link = 'https://swapi-api.alx-tools.com/api/starships/'
    # Passenger values that are skipped outright instead of being parsed
    skipped_values = ("n/a", "unknown", "0", "none")
    list_ships = []

    while url_link:
        try:
            # The timeout keeps the notebook from hanging on a stalled connection
            response = requests.get(url_link, timeout=10)
        except requests.RequestException as error:
            # Network-level failure: report it and keep what was collected so far
            print(f"Failed to retrieve data: {error}")
            break

        # Handle error in case the request fails with an HTTP error status
        if response.status_code != 200:
            print(f"Failed to retrieve data: {response.status_code}")
            break

        data = response.json()

        for ship in data.get('results', []):
            passengers = ship["passengers"]
            if passengers in skipped_values:
                continue
            try:
                # Capacities may contain thousands separators, e.g. "1,000".
                # A local copy is cleaned so the API payload is left untouched.
                capacity = int(passengers.replace(",", ""))
            except ValueError:
                # Unexpected non-numeric passenger value: skip this ship
                continue
            if capacity >= passengerCount:
                list_ships.append(ship['name'])

        # Move to the next page if available (None ends the loop)
        url_link = data.get('next')

    return list_ships

# Example: list the ships able to carry at least 300 passengers
ship = availableShips(300)
print(ship)


['CR90 corvette', 'Death Star', 'Executor', 'Calamari Cruiser', 'Droid control ship', 'AA-9 Coruscant freighter', 'Republic Assault ship', 'Trade Federation cruiser', 'Republic attack cruiser']


# Task 1
 Web Scraping - Scrape This Site
 Scraping the tabular data to a CSV file

### Import necessary libraries

## Use BeautifulSoup to scrape the data from URL

In [None]:
# URL of the page to scrape
url = "https://www.scrapethissite.com/pages/forms/"

# Request the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() fails here with a clear HTTP error
# instead of letting an error page be parsed as if it held the table.
page = requests.get(url, timeout=10)
page.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(page.content, "html.parser")

## Find the table on the url and extract the rows and columns

In [None]:
# Locate the first <table> element in the parsed page
table = soup.find("table")

# Collect every row (<tr>) of that table
rows = table.find_all("tr")

# The first row holds the header cells (<th>); their text gives the column names
columns = [header_cell.text.strip() for header_cell in rows[0].find_all("th")]

In [None]:
# Show the column names extracted from the header row
print(columns)

['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']


In [None]:
# Build one list of stripped cell texts (<td>) per table row, skipping the
# header row at index 0
data = [
    [cell.text.strip() for cell in table_row.find_all("td")]
    for table_row in rows[1:]
]

## Creating a DataFrame using pandas

In [None]:
# Build the DataFrame from the scraped rows, labelled with the header names
df = pd.DataFrame(data, columns=columns)

# Show the first five rows
print(df.head(5))

            Team Name  Year Wins Losses OT Losses  Win % Goals For (GF)  \
0       Boston Bruins  1990   44     24             0.55            299   
1      Buffalo Sabres  1990   31     30            0.388            292   
2      Calgary Flames  1990   46     26            0.575            344   
3  Chicago Blackhawks  1990   49     23            0.613            284   
4   Detroit Red Wings  1990   34     38            0.425            273   

  Goals Against (GA) + / -  
0                264    35  
1                278    14  
2                263    81  
3                211    73  
4                298   -25  


## save the dataset to CSV

In [None]:
# Write the table to data.csv; index=False leaves the row index out of the file
df.to_csv("data.csv", index=False)

**Amazon Web Scraping**

Categories:
- Electronics
- Baby
- Shoes
- Watches
- Cameras

In [None]:
'''
This script scrapes product images from Amazon search results and saves them into one folder per category, with 5 images per category for all 5 categories.
'''

# Function to save the product image
def save_image(image_url, product_name, category_name):
    """
    Download a product image and save it as a JPEG in ``images/<category_name>/``.

    The file name is built from the first 20 characters of ``product_name``,
    with whitespace and path-unsafe characters replaced by underscores. An
    empty product name falls back to ``image.jpg``.

    Args:
        image_url (str): URL of the image to download
        product_name (str): product title used to build the file name
        category_name (str): sub-folder of ``images`` the file is saved into

    Raises:
        requests.RequestException: if the download fails or the server
            answers with an HTTP error status
    """
    # The timeout avoids hanging on a stalled download; raise_for_status()
    # stops an HTTP error page from being handed to PIL as image data
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))

    # JPEG cannot store alpha or palette modes (e.g. RGBA, P); convert first
    if img.mode not in ("RGB", "L"):
        img = img.convert("RGB")

    # Replace whitespace and characters that are invalid in file names on
    # common file systems, so the product title cannot break the path
    unsafe_chars = '\\/:*?"<>|'
    stem = "".join(
        "_" if char.isspace() or char in unsafe_chars else char
        for char in product_name[:20]
    )
    img_name = (stem or "image") + ".jpg"

    # Ensure the directory exists
    directory = os.path.join('images', category_name)
    os.makedirs(directory, exist_ok=True)

    # Save the image
    img_path = os.path.join(directory, img_name)
    img.save(img_path)
    print(f"Saved image: {img_path}")

# Function to scrape Amazon product data
def scrape_amazon(url, category_name, max_products=5):
    """
    Scrape one Amazon search-results page and download the product images.

    Args:
        url (str): search-results URL to request
        category_name (str): label stored with each product and used as the
            image sub-folder name
        max_products (int): number of search results to process, counted
            from the top of the page (default 5)

    Returns:
        list: one dict per successfully processed product with the keys
        ``name``, ``image_url`` and ``category``. An empty list is returned
        when the page cannot be fetched.
    """
    # Browser-like request headers (presumably so the site serves its
    # regular HTML page; not verified here)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9'
    }

    try:
        # The timeout keeps one stalled category from blocking the whole run
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as error:
        # Network-level failure is treated like an HTTP error: no products
        print(f"Error: Unable to access {url} ({error})")
        return []

    print(f"Status Code: {response.status_code}")

    if response.status_code != 200:
        print(f"Error: Unable to access {url}")
        return []

    soup = BeautifulSoup(response.content, 'lxml')
    products = soup.find_all('div', {'data-component-type': 's-search-result'})

    product_list = []

    for product in products[:max_products]:
        try:
            # Extract product name; a result without a title cannot be labelled
            title_tag = product.h2
            if title_tag is None:
                print("Error processing product: no title found")
                continue
            product_name = title_tag.text.strip()

            # Extract product image URL (None when the tag or its src is missing)
            image_tag = product.find('img', {'class': 's-image'})
            image_url = image_tag.get('src') if image_tag else None

            if image_url:
                save_image(image_url, product_name, category_name)

            product_list.append({
                'name': product_name,
                'image_url': image_url,
                'category': category_name
            })

            print(f"Product: {product_name}\nImage URL: {image_url}\n")
        except Exception as e:
            # Deliberate best-effort: one bad product (e.g. a failed image
            # download) must not abort the rest of the page
            print(f"Error processing product: {e}")
            continue

    return product_list

# Scrape multiple categories
def scrape_amazon_categories():
    """
    Scrape each configured Amazon search category and pool the results.

    Returns:
        list: the product dicts returned by ``scrape_amazon`` for every
        category, concatenated in the order the categories are listed.
    """
    # Category label -> search-results URL
    search_urls = {
        'Baby': 'https://www.amazon.com/s?k=baby',
        'Watches': 'https://www.amazon.com/s?k=watches',
        'Cameras': 'https://www.amazon.com/s?k=cameras',
        'Shoes': 'https://www.amazon.com/s?k=shoes',
        'Electronics': 'https://www.amazon.com/s?k=electronics'
    }

    collected = []
    for label, search_url in search_urls.items():
        print(f"Scraping category: {label}")
        found = scrape_amazon(search_url, label)
        # Nothing to add when a category yields no products
        if not found:
            continue
        collected.extend(found)

    return collected

# Scrape every category and keep the combined product list
all_products = scrape_amazon_categories()

# Summarise what was collected, one line per product
print("Scraping complete. Products:")
for product in all_products:
    print(
        f"Category: {product['category']} | Product: {product['name']} | Image URL: {product['image_url']}"
    )


Scraping category: Baby
Status Code: 200
Saved image: images/Baby/Pampers.jpg
Product: Pampers
Image URL: https://m.media-amazon.com/images/I/61Kcg05Sz4L._AC_UL320_.jpg

Saved image: images/Baby/HelloBaby.jpg
Product: HelloBaby
Image URL: https://m.media-amazon.com/images/I/61WoQMri81L._AC_UL320_.jpg

Saved image: images/Baby/The_Honest_Company.jpg
Product: The Honest Company
Image URL: https://m.media-amazon.com/images/I/81W3iWS7ptL._AC_UL320_.jpg

Saved image: images/Baby/Baby_Montessori_Sens.jpg
Product: Baby Montessori Sensory Toys for 0-6 6-12 Months, Food Grade Teething Toys for Babies 0 3 6 9 12 18 Months, Newborn Infant Learning Developmental Toys Gifts for 1 2 Year Old Boys Girls
Image URL: https://m.media-amazon.com/images/I/61hMVdD84nL._AC_UL320_.jpg

Saved image: images/Baby/HUGGIES.jpg
Product: HUGGIES
Image URL: https://m.media-amazon.com/images/I/71NCICgdEcL._AC_UL320_.jpg

Scraping category: Watches
Status Code: 503
Error: Unable to access https://www.amazon.com/s?k=wat