# Building a Simple Web Scraper

## Fetching the Web Page

### Import Libraries:

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


### Fetch the Web Page:

In [2]:
# Target page to scrape.
url = 'https://google.com'

# Always pass a timeout: without one, requests waits indefinitely on a
# server that never answers and this cell would hang the kernel.
# A timeout raises requests.exceptions.Timeout instead.
response = requests.get(url, timeout=10)

# Report the outcome of the request based on the HTTP status code.
if response.status_code == 200:
    print("Page fetched successfully")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Page fetched successfully


### Parse the HTML Content:

In [3]:
# Parse the page only when the request succeeded; on any other status
# nothing is parsed and `soup` is not (re)assigned by this cell.
fetch_succeeded = response.status_code == 200
if fetch_succeeded:
    soup = BeautifulSoup(response.content, 'html.parser')
    # Render the parsed tree as indented, human-readable markup.
    pretty_html = soup.prettify()
    print(pretty_html)


<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-IN">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="Dkkl3NeQcr-hdl-DOs87YA">
   (function(){var _g={kEI:'I95yZrLeN-_m2roPtoCvoA4',kEXPI:'0,3700252,1103,26,8,58,2,496821,10216,31559,2872,2891,8349,3405,61296,34266,60058,29268,6621,49751,2,16737,23024,6700,41942,54828,2912,2,2,1,26632,8155,8861,14489,22436,9779,62658,33565,23033,16580,3030,15816,1804,7734,18674,22306,40385,1883,14094,5211785,890,623,39,5991769,2840246,60,5,106,16,23,9,7,2,2,12,7,2,66,27981572,16672,43887,3,318,4,1281,3,2124363,23029351,6870,2,1291,10336,2709,8027,8079,2,558,28025,36870,10511,2370,4832,1575,13845,12953,2212,7981,200,390,5539,4879,7068,2839,26880,1835,10085,7766,3821,2095,2703,1954,155,2,397,2085,1252,12252,7735,738,1,4534,3,1322,2,2540,761,206,3632,20

## Extracting Data

### Extract Text Content:

In [4]:
# Locate every <p> element in the parsed page, then pull out its text.
paragraph_tags = soup.find_all('p')
text_data = [tag.get_text() for tag in paragraph_tags]
print("Text Data:", text_data)


Text Data: ['© 2024 - Privacy - Terms']


### Extract Hyperlinks:

In [6]:
# Collect the target of every hyperlink on the page.
# href=True restricts the match to <a> tags that actually carry an href
# attribute. A plain link.get('href') returns None for anchors without
# one, and those entries would be indistinguishable from the None values
# used to pad the lists when they are combined into a table.
link_data = [link['href'] for link in soup.find_all('a', href=True)]
print("Link Data:", link_data)


Link Data: ['https://www.google.com/imghp?hl=en&tab=wi', 'https://maps.google.co.in/maps?hl=en&tab=wl', 'https://play.google.com/?hl=en&tab=w8', 'https://www.youtube.com/?tab=w1', 'https://news.google.com/?tab=wn', 'https://mail.google.com/mail/?tab=wm', 'https://drive.google.com/?tab=wo', 'https://www.google.co.in/intl/en/about/products?tab=wh', 'http://www.google.co.in/history/optout?hl=en', '/preferences?hl=en', 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ', '/advanced_search?hl=en-IN&authuser=0', 'https://www.google.com/setprefs?sig=0_Pi2CIamSrEne3LEuj9OOboxYrz8%3D&hl=hi&source=homepage&sa=X&ved=0ahUKEwjy-93w4-eGAxVvs1YBHTbAC-QQ2ZgBCAY', 'https://www.google.com/setprefs?sig=0_Pi2CIamSrEne3LEuj9OOboxYrz8%3D&hl=bn&source=homepage&sa=X&ved=0ahUKEwjy-93w4-eGAxVvs1YBHTbAC-QQ2ZgBCAc', 'https://www.google.com/setprefs?sig=0_Pi2CIamSrEne3LEuj9OOboxYrz8%3D&hl=te&source=homepage&sa=X&ved=0ahUKEwjy-93w4-eGAxVvs1YBHTbAC-QQ2ZgBCAg', 'ht

### Extract Image Sources:

In [7]:
# Collect the source URL of every image on the page.
# src=True restricts the match to <img> tags that actually carry a src
# attribute. A plain image.get('src') returns None for images without
# one, and those entries would be indistinguishable from the None values
# used to pad the lists when they are combined into a table.
image_data = [image['src'] for image in soup.find_all('img', src=True)]
print("Image Data:", image_data)


Image Data: ['/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png']


## Storing Data in a Structured Format

### Store Extracted Data in a Pandas DataFrame:

In [14]:
# Extracted lists to combine, keyed by the column header each one gets.
columns = {
    'Text': text_data,
    'Links': link_data,
    'Images': image_data,
}

# Find the maximum length among the lists
max_length = max(len(values) for values in columns.values())

# Pad *copies* of the lists with None so every column has max_length
# entries. The original lists are left untouched, so they still match
# what the extraction cells printed and this cell can be re-run safely.
data = {
    header: list(values) + [None] * (max_length - len(values))
    for header, values in columns.items()
}

# Store data in a Pandas DataFrame.
# Note: values in the same row only share a list position; the three
# lists were extracted independently of one another.
df = pd.DataFrame(data)
print(df.head())  # Display the first few rows of the DataFrame


                       Text                                        Links  \
0  © 2024 - Privacy - Terms    https://www.google.com/imghp?hl=en&tab=wi   
1                      None  https://maps.google.co.in/maps?hl=en&tab=wl   
2                      None        https://play.google.com/?hl=en&tab=w8   
3                      None              https://www.youtube.com/?tab=w1   
4                      None              https://news.google.com/?tab=wn   

                                              Images  
0  /images/branding/googlelogo/1x/googlelogo_whit...  
1                                               None  
2                                               None  
3                                               None  
4                                               None  


### Save Data to CSV File:

In [15]:
# Write the table to disk; index=False leaves the positional row index
# out of the file so only the data columns are stored.
csv_path = 'scraped_data.csv'
df.to_csv(csv_path, index=False)
print("Data saved to scraped_data.csv")


Data saved to scraped_data.csv


### Accessing the CSV file

In [16]:
import pandas as pd

# Load the previously saved scrape results back from disk.
df = pd.read_csv('scraped_data.csv')

# Preview the first few rows of the reloaded table.
preview = df.head()
print(preview)

# Example follow-up analysis: keep only the rows that have a 'Text' value
# (rows whose 'Text' is missing are dropped).
filtered_df = df.dropna(subset=['Text'])
print(filtered_df.head())


                       Text                                        Links  \
0  © 2024 - Privacy - Terms    https://www.google.com/imghp?hl=en&tab=wi   
1                       NaN  https://maps.google.co.in/maps?hl=en&tab=wl   
2                       NaN        https://play.google.com/?hl=en&tab=w8   
3                       NaN              https://www.youtube.com/?tab=w1   
4                       NaN              https://news.google.com/?tab=wn   

                                              Images  
0  /images/branding/googlelogo/1x/googlelogo_whit...  
1                                                NaN  
2                                                NaN  
3                                                NaN  
4                                                NaN  
                       Text                                      Links  \
0  © 2024 - Privacy - Terms  https://www.google.com/imghp?hl=en&tab=wi   

                                              Images  
0  /image