# Web Scraping Project

Website Name : College Dunia \
Website to Scrape: https://collegedunia.com/india-colleges?custom_params=%5Bview%3Atable%5D


Resources : 
1. Python 3.8
2. Selenium v10

In [5]:
# Define Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import time
import csv
import logging
import os

Logger Initialization

In [8]:
# Logger registered under the name 'selenium'; access_url() and the Spark
# load cell report their errors through it.
logger = logging.getLogger('selenium')

Install and access chrome driver

In [9]:
# Module-level Chrome WebDriver shared by access_url() and scrape_data();
# scrape_data() closes it once the scrape has finished.
driver = webdriver.Chrome()

Access Website URL

In [10]:
def access_url():
    """Load the College Dunia listing page in the shared ``driver``.

    Any exception raised while loading the page is written to ``logger`` as
    an error and is not re-raised, so the caller continues either way.
    """
    url = 'https://collegedunia.com/india-colleges?custom_params=%5Bview%3Atable%5D'
    try:
        driver.get(url)
    except Exception as error:
        logger.error(str(error))

Scraping college records and writing them to a CSV file as they are fetched

In [11]:

# Checkpoint file holding the number of table rows already processed.
record_file_path = "D:\\record_count.txt"


def get_record_count():
    """Return the number of rows a previous run already processed.

    The count is read from ``record_file_path``, the checkpoint file written
    by ``update_record_count``.

    Returns:
        int: The stored count, or 0 when the checkpoint file is missing or
        empty, so that the scrape starts from the beginning.

    Raises:
        ValueError: If the file contains text that is not an integer.
    """
    # No checkpoint yet: start from the beginning.
    if not os.path.exists(record_file_path):
        return 0

    with open(record_file_path, 'r') as file:
        stored = file.read().strip()

    # An interrupted write can leave the file empty; int('') would raise, so
    # treat an empty checkpoint the same as a missing one.
    return int(stored) if stored else 0

def update_record_count(count):
    """Persist ``count`` as the resume checkpoint in ``record_file_path``.

    The value is first written to a sibling ``.tmp`` file and then moved over
    the checkpoint with ``os.replace``. Opening the checkpoint itself in
    ``'w'`` mode would truncate it before the new value is written, so an
    interruption at that moment would lose the previous count.

    Args:
        count: Number of rows processed so far; stored as ``str(count)``.
    """
    temp_path = record_file_path + ".tmp"
    with open(temp_path, 'w') as file:
        file.write(str(count))
    # Swap the finished file into place in a single rename step.
    os.replace(temp_path, record_file_path)

def scrape_data():
    """Scrape the college table into ``college_dunia.csv`` and time the run.

    Opens the listing page via ``access_url``, reads the column headers from
    the page's ``<thead>``, then repeatedly scrolls to the end-of-list marker
    and appends each not-yet-seen ``<tbody>`` row whose first cell contains
    ``#`` to the CSV. After every pass the CSV is flushed and the number of
    processed rows is saved with ``update_record_count`` so that a later run
    can skip them. The shared ``driver`` is closed whether or not the scrape
    succeeds.

    Returns:
        float | None: Elapsed time in minutes, or ``None`` when any step
        failed (the error is printed rather than raised).
    """
    try:
        start_time = time()
        try:
            access_url()
            header_section = driver.find_element(By.TAG_NAME, value="thead")
            entries = header_section.find_elements(By.TAG_NAME, "tr")

            headers = [th.text for th in entries[0].find_elements(By.TAG_NAME, "th")]
            # Store fetched records directly in the csv file.
            with open('college_dunia.csv', mode='a', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                # A count of 0 means no earlier run left a checkpoint, so the
                # header row has not been written yet.
                record_count = get_record_count()
                if record_count == 0:
                    writer.writerow(headers)
                stored_records = set()
                final_element = "jsx-2796823646.jsx-1342907234.endOfContainer"
                # NOTE(review): final_element is a dotted class selector; the
                # loop stops only once this exact dotted text shows up in
                # page_source. Confirm that it ever does -- otherwise the loop
                # can only exit through an exception.
                while final_element not in driver.page_source:
                    driver.execute_script(
                        "arguments[0].scrollIntoView();",
                        driver.find_element(By.CLASS_NAME, final_element),
                    )
                    # NOTE(review): implicitly_wait presumably only sets the
                    # element-lookup timeout and does not pause for newly
                    # loaded rows -- confirm against the Selenium docs.
                    driver.implicitly_wait(2)

                    # Using XPath to get all <tr> elements in the <tbody>.
                    entries = driver.find_elements(By.XPATH, "//tbody/tr")

                    # Rows before record_count were handled on an earlier pass
                    # (or an earlier run), so only look at the new tail.
                    for entry in entries[record_count:]:
                        data_rows = [
                            td.text.strip().replace("\n", " ")
                            for td in entry.find_elements(By.XPATH, "./td")
                        ]
                        data_rows = [row for row in data_rows if row]
                        if data_rows:
                            record = '\t'.join(data_rows)
                            if record not in stored_records:
                                stored_records.add(record)
                                if '#' in data_rows[0]:
                                    writer.writerow(data_rows)

                        record_count += 1

                    # Flush first so the checkpoint never runs ahead of the
                    # rows that are actually on disk.
                    csvfile.flush()
                    update_record_count(record_count)

            # Taken after the loop so it is bound however the loop ends.
            stop_time = time()
        finally:
            # Close the browser window on failure as well as on success.
            driver.close()

        print("Data Scrapped in college_dunia.csv file")
        total_elapsed_time = (stop_time - start_time) / 60
        return total_elapsed_time
    except Exception as e:
        print("Error occurred: ", str(e))
        return None
    


In [None]:
# Run the scraper; scrape_data() returns the elapsed minutes, or None after
# printing the error when the scrape failed.
total_elapsed_time = scrape_data()
if total_elapsed_time is None:
    # Do not report success (or a "None" duration) for a failed run.
    print("Web scraping did not complete; see the error reported above")
else:
    print("Web scrapping sucessfully completed")
    print(f"Total scrape Time taken:{total_elapsed_time}")

Storing the data in Hadoop (HDFS) in a distributed manner

In [1]:
import os
# Set SPARK_LOCAL_IP in this process's environment before the Spark session
# in the next cell is created.
# NOTE(review): the address is hard-coded -- presumably this machine's own
# interface IP; confirm before running on another host.
os.environ["SPARK_LOCAL_IP"] = "10.0.2.15"

In [3]:
import logging

from pyspark.sql import SparkSession

# The except handler below needs `logger`, which this cell did not define.
# NOTE(review): the In [1]/In [3] execution counts suggest this part ran in a
# fresh session where the scraping cells had not been executed -- confirm.
# getLogger returns the same logger object for the same name, so this is
# harmless when the scraping cells did run first.
logger = logging.getLogger('selenium')

try:
    # Create (or reuse) a local Spark session.
    spark = SparkSession.builder.master('local').appName("CollegeDuniaRecords").getOrCreate()

    input_file_path = '/home/hdoop/Documents/python/python-repo/WebScraping/college_dunia.csv'
    # Read the scraped CSV, taking column names from the first row and
    # letting Spark infer the column types.
    df = spark.read.format("csv").options(inferSchema="True", sep=",", header="True").load(input_file_path)
    # Write the data to the Hadoop filesystem with a header row. The former
    # `inferschema` argument was dropped: it is a reader option and is not
    # expected to affect a write.
    df.write.format("csv").save("hdfs://localhost:9000/web_scrape_data/college_dunia_records.csv", header=True)

    print("Web scraped data Sucessfully Loaded")

except Exception as e:
    logger.error(str(e))

                                                                                