# Creating a BeautifulSoup object and fetching the page data

In [1]:
# importing libraries
import requests
from bs4 import BeautifulSoup

In [2]:
# Address of the page to scrape; the request cell below downloads exactly this URL
url = "https://christuniversity.in"

In [3]:
# Download the page. Without a timeout, requests waits indefinitely on an
# unresponsive server, which would stall the notebook on "Run All".
response = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx reply so an error page is never
# parsed further down as if it were the real site content.
response.raise_for_status()

In [4]:
# Parse the downloaded HTML text with Python's built-in 'html.parser' backend
soup = BeautifulSoup(markup=response.text, features='html.parser')

# Extracting the links and images with the soup object and storing them with the pandas library

In [5]:
# Collect every anchor ('a') tag found in the parsed page, report how many
# there are, then list the href target of each one
links = soup.find_all(name='a')
print(f"Number of links: {len(links)}")
for anchor in links:
    href = anchor.get('href')
    print(href)

Number of links: 258
https://christuniversity.in/
http://alumni.christuniversity.in/
https://christuniversity.in/careers
https://christuniversity.in/center/C/IQAC
https://christuniversity.in/center/C/International-Students
https://christuniversity.in/center
https://christuniversity.in/naac-menu/main%20campus/general%20departments/accreditation/ODUzNQ==
https://christuniversity-in.translate.goog/?_x_tr_sl=auto&_x_tr_tl=kn&_x_tr_hl=en&_x_tr_pto=wapp
#
#
https://christuniversity.in/history
https://christuniversity.in/logo-and-anthem
https://christuniversity.in/founder
https://christuniversity.in/vision
https://christuniversity.in/vc
https://christuniversity.in/graduate-attributes
https://christuniversity.in/administration
https://christuniversity.in/governance
https://christuniversity.in/disclosures
https://christuniversity.in/articles-by-faculty-in-media
https://christuniversity.in/recognitions-and-awards
https://christuniversity.in/nep-at-christ-(deemed-to-be-university)
https://christu

In [6]:
# Collect every image ('img') tag found in the parsed page, report how many
# there are, then list the src of each one
images = soup.find_all(name='img')
print(f"Number of images: {len(images)}")
for img_tag in images:
    src = img_tag.get('src')
    print(src)

Number of images: 169
https://christuniversity.in/images/logo.jpg
https://christuniversity.in/images/kannada.jpg
https://christuniversity.in/uploads/campus/medium/1546275337_2020-12-05_12-14-28.jpg
https://christuniversity.in/uploads/campus/medium/1656383149_2020-12-05_12-15-03.jpg
https://christuniversity.in/uploads/campus/medium/1850061447_2020-12-05_12-15-59.jpg
https://christuniversity.in/uploads/campus/medium/1453155785_2023-08-17_12-25-20.jpg
https://christuniversity.in/uploads/campus/medium/524190754_2020-12-05_12-16-11.jpg
https://christuniversity.in/uploads/campus/medium/1370604313_2023-06-15_11-54-29.jpg
https://christuniversity.in/uploads/studentlife/thumb/campus-life_20240516121709..jpg
https://christuniversity.in/uploads/studentlife/thumb/img2_20230327020428..jpg
https://christuniversity.in/uploads/studentlife/thumb/library_20240516122412..jpg
https://christuniversity.in/uploads/studentlife/thumb/ncc_20240516115953..jpg
https://christuniversity.in/uploads/studentlife/thumb

In [7]:
# importing pandas library for storing the output in a tabular form
import pandas as pd

In [8]:
# Start from an empty two-column table: the kind of tag and the URL it points to
df = pd.DataFrame(data=None, columns=['Link Type', 'URL'])

In [9]:
# Adding the values into the dataframe.
# The rows are gathered as plain dicts first and the frame is built once:
# appending row by row copies the whole frame on every iteration, and
# `_append` is a private pandas method that is not part of the public API.
records = [{'Link Type': 'Link', 'URL': link.get('href')} for link in links]
records += [{'Link Type': 'Image', 'URL': image.get('src')} for image in images]

# `columns=` keeps both columns present even when the page has no tags at all
df = pd.DataFrame(records, columns=['Link Type', 'URL'])

print(df)

    Link Type                                                URL
0        Link                       https://christuniversity.in/
1        Link                 http://alumni.christuniversity.in/
2        Link                https://christuniversity.in/careers
3        Link          https://christuniversity.in/center/C/IQAC
4        Link  https://christuniversity.in/center/C/Internati...
..        ...                                                ...
422     Image  https://christuniversity.in/images/social-icon...
423     Image  https://christuniversity.in/images/social-icon...
424     Image  https://christuniversity.in/images/social-icon...
425     Image  https://christuniversity.in/images/social-icon...
426     Image  https://christuniversity.in/images/social-icon...

[427 rows x 2 columns]


In [10]:
# Discard every row that holds a missing value (tags without an href/src)
df = df.dropna()

In [11]:
# Keep only the rows whose URL is an absolute https:// address.
# `na=False` makes a missing URL count as "no match"; without it the mask
# contains NaN and the boolean indexing below raises, so the cell would only
# work when the null-removal cell has been run first.
# NOTE(review): placeholders such as '#' are removed here, but so are plain
# http:// URLs - widen the prefix check if those should be kept.
is_https = df['URL'].str.startswith("https://", na=False)
df = df[is_https]
print(df)

    Link Type                                                URL
0        Link                       https://christuniversity.in/
2        Link                https://christuniversity.in/careers
3        Link          https://christuniversity.in/center/C/IQAC
4        Link  https://christuniversity.in/center/C/Internati...
5        Link                 https://christuniversity.in/center
..        ...                                                ...
422     Image  https://christuniversity.in/images/social-icon...
423     Image  https://christuniversity.in/images/social-icon...
424     Image  https://christuniversity.in/images/social-icon...
425     Image  https://christuniversity.in/images/social-icon...
426     Image  https://christuniversity.in/images/social-icon...

[337 rows x 2 columns]


In [12]:
# Count how many rows of each type (Link / Image) remain before de-duplication
# and store the tally in a CSV file
original_counts = df["Link Type"].value_counts()
original_counts.to_csv("Original_Links_Count.csv")

In [13]:
# Remove rows that repeat a URL already present in the dataframe
# (rows are compared on the URL column only)
df = df.drop_duplicates(subset=["URL"])
print(df)

    Link Type                                                URL
0        Link                       https://christuniversity.in/
2        Link                https://christuniversity.in/careers
3        Link          https://christuniversity.in/center/C/IQAC
4        Link  https://christuniversity.in/center/C/Internati...
5        Link                 https://christuniversity.in/center
..        ...                                                ...
422     Image  https://christuniversity.in/images/social-icon...
423     Image  https://christuniversity.in/images/social-icon...
424     Image  https://christuniversity.in/images/social-icon...
425     Image  https://christuniversity.in/images/social-icon...
426     Image  https://christuniversity.in/images/social-icon...

[301 rows x 2 columns]


In [14]:
# Count the rows of each type again, now that duplicate URLs are gone,
# and store the tally in a second CSV file
updated_counts = df["Link Type"].value_counts()
updated_counts.to_csv("Update_Links_Count.csv")

In [15]:
# Write the cleaned, de-duplicated table to disk; the row index is left out
df.to_csv(path_or_buf='Links_Output.csv', index=False)