###### ===================================================================================================================

# Web Scraping & Dumping File

###### ===================================================================================================================

### Run Command

In [1]:
%run "C:\\Users\DELL\\Desktop\\Projects\\Wealth Management System\\Python\\General Files\\Libraries.ipynb"



In [2]:
%run "C:\\Users\DELL\\Desktop\\Projects\\Wealth Management System\\Python\\General Files\\SQL_Connection_File.ipynb"



### Class: Web Scraping

In [3]:
class Web_Scraping_Data:
    """
    Scrapes mutual fund stock holdings with Selenium and appends them to an SQL table.

    The workflow is: collect category links from the base page, collect fund
    links from each category page, turn every fund link into its
    "portfolio-details" link, scrape the holdings table of each one and hand
    the resulting DataFrame to the SQL connector.
    """

    def __init__(self, base_link, driver_path, sql_connector, table_name):
        """
        Initializes the Web_Scraping_Data class.

        Args:
            base_link (str): The base URL for the mutual fund data.
            driver_path (str): The path to the Chrome WebDriver executable.
            sql_connector: An SQL connector object for database operations.
                It must provide Append_SQL_Table(table_name, date, dataframe).
            table_name (str): The name of the SQL table to insert data into.
        """
        self.url = base_link
        self.driver_path = driver_path
        self.sql_connector = sql_connector
        self.table_name = table_name

        self.driver = None  # WebDriver instance; created on demand, None when closed
        # NOTE(review): nothing in this class assigns current_date, so None is
        # what gets passed to Append_SQL_Table unless a caller sets it first.
        self.current_date = None

    def Initialize_Driver(self):
        """
        Initializes the WebDriver instance and stores it on self.driver.
        """
        service = Service(self.driver_path)
        options = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(service=service, options=options)

    def Close_Driver(self):
        """
        Closes the WebDriver instance, if one is open.

        The reference is always reset to None so that a driver that has
        already been quit is never reused or quit a second time.
        """
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def Fetch_Links(self):
        """
        Fetches the equity category links and the links found on each category page.

        Returns:
            equity_mutual_fund_links (list): A list of equity mutual fund links.
            fund_links (list): A list of fund links (may contain duplicates and
                unrelated links; the caller is expected to filter them).
            Two empty lists are returned when anything goes wrong, so callers
            can always unpack the result.
        """
        try:
            # Initialize the WebDriver
            self.Initialize_Driver()
            driver = self.driver

            # Navigate to the equity funds page
            driver.get(self.url)

            # Expand the "By Market Capitalization" section
            market_cap_heading = driver.find_element(By.XPATH, '//strong[contains(text(), "By Market Capitalization")]')
            market_cap_heading.click()
            market_cap_elements = driver.find_elements(By.XPATH, '//a[contains(@href, "/equity/") and not(contains(@href, "/portfolio-details/"))]')

            # Read every href before navigating away; the elements go stale afterwards
            equity_mutual_fund_links = [element.get_attribute("href") for element in market_cap_elements]

            fund_links = []
            for link in equity_mutual_fund_links:
                driver.get(link)

                # Fund links listed under this category
                fund_elements = driver.find_elements(By.XPATH, '//div[@class="fundListing.performing-data"]/a[contains(@href, "/mutual-funds/")]')
                fund_links.extend(fund_element.get_attribute("href") for fund_element in fund_elements)

                # Every other link on the page that has an href
                for anchor in driver.find_elements(By.TAG_NAME, "a"):
                    href = anchor.get_attribute("href")
                    if href:
                        fund_links.append(href)

            return equity_mutual_fund_links, fund_links

        except Exception as e:
            # Report the problem and return an unpackable, empty result
            print(f"An error occurred while fetching the Mutual Fund Data: {e}")
            return [], []

        finally:
            self.Close_Driver()

    def Scroll(self, driver, element):
        """
        Scrolls the web page to a specific element.

        Args:
            driver: The WebDriver instance.
            element: The element to scroll to.
        """
        driver.execute_script("arguments[0].scrollIntoView();", element)

    def Wait(self, driver, by, value, timeout=10):
        """
        Waits for an element to be present on the web page.

        Args:
            driver: The WebDriver instance.
            by: The method to locate the element (e.g., By.XPATH).
            value: The locator value for the element.
            timeout (int): The maximum time to wait for the element (default is 10 seconds).

        Returns:
            The located element.
        """
        return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))

    @staticmethod
    def _Clean_Numeric_Column(column):
        """
        Strips commas, "%", "Cr" and "--" from a text column and converts it to numbers.

        Values that cannot be converted become NaN.
        """
        for token in (',', '%', 'Cr', '--'):
            column = column.str.replace(token, '', regex=False)
        return pd.to_numeric(column, errors='coerce')

    @staticmethod
    def _Portfolio_Details_Link(link):
        """
        Inserts "portfolio-details/" before the last path segment of a fund link.

        Works for a trailing fund id of any length; a link that already points
        to a portfolio-details page is returned unchanged.
        """
        if "/portfolio-details/" in link:
            return link
        base, _, fund_id = link.rpartition("/")
        return f"{base}/portfolio-details/{fund_id}"

    def Scrape_Data(self, link):
        """
        Scrapes and processes data from a mutual fund portfolio details page.

        Args:
            link (str): The URL of the portfolio details page.

        Returns:
            pd.DataFrame: A DataFrame with the columns StockName, Sector,
            PercentageAUM, HoldingValue and FundName. An empty DataFrame is
            returned when the URL is not reachable, the holdings table is
            missing or empty, or any other error occurs.
        """
        try:
            # Check the URL first: it is far cheaper than starting a browser.
            # The timeout keeps one unresponsive page from stalling the whole run.
            response = requests.get(link, timeout=30)
            if not 200 <= response.status_code < 300:
                return pd.DataFrame()

            # Initialize the WebDriver
            self.Initialize_Driver()
            driver = self.driver

            # Visit the portfolio details page
            driver.get(link)

            # Find the Fund Name from the URL (third segment from the end)
            fund_name = link.split("/")[-3].replace("-", " ")

            # Wait for the "Complete Current Stock Holdings" heading to appear.
            # A timeout raises and is handled by the except block below.
            self.Wait(driver, By.XPATH, '//div[@class="heading-4" and contains(text(), "Complete Current Stock Holdings")]')

            # Find the parent div containing the table using the full XPath
            parent_div = self.Wait(driver, By.XPATH, '/html/body/div/div[8]/div[3]/div/div[6]')

            # Scroll to the parent div containing the table
            self.Scroll(driver, parent_div)

            # Find the table within the parent div
            stock_table = parent_div.find_element(By.CLASS_NAME, 'table')

            # Extract the first four cells of every data row
            data = []
            for row in stock_table.find_elements(By.TAG_NAME, 'tr')[1:]:  # Skip the header row
                cols = row.find_elements(By.TAG_NAME, 'td')
                if len(cols) < 4:
                    # Not a holdings row (e.g. a nested header or spacer row)
                    continue
                data.append([col.text.strip() for col in cols[:4]])

            if not data:
                return pd.DataFrame()

            dataframe = pd.DataFrame(data, columns=['StockName', 'Sector', 'PercentageAUM', 'HoldingValue'])

            # Clean and convert the numeric columns
            columns_to_convert = ['PercentageAUM', 'HoldingValue']
            dataframe[columns_to_convert] = dataframe[columns_to_convert].apply(self._Clean_Numeric_Column)

            # Add the Fund Name column
            dataframe['FundName'] = fund_name

            return dataframe

        except Exception:
            # Any failure means this link has no usable holdings table
            print("Link with inadequate data: ", link)
            return pd.DataFrame()

        finally:
            self.Close_Driver()  # Close the WebDriver

    def Mutual_Fund_Data(self):
        """
        Executes the mutual fund data scraping process.

        Fetches the fund links, keeps the direct-growth mutual fund links,
        scrapes the portfolio details page of each distinct fund once and
        appends every non-empty result to the SQL table.
        """
        try:
            # Fetch equity mutual fund links and fund links
            _, fund_links = self.Fetch_Links()

            # Keep only links that contain BOTH "/mutual-funds/" and "-direct-growth/"
            fund_links = [link for link in fund_links if "/mutual-funds/" in link and "-direct-growth/" in link]

            # Build the portfolio details links; dict.fromkeys removes duplicates
            # while keeping the original order, so each fund is scraped once
            portfolio_details_fund_links = list(dict.fromkeys(self._Portfolio_Details_Link(link) for link in fund_links))

            if not portfolio_details_fund_links:
                print("No mutual fund links were found; nothing to dump.")
                return

            for link in portfolio_details_fund_links:
                scraped_dataframe = self.Scrape_Data(link)

                # Nothing to dump for links without usable data
                if scraped_dataframe.empty:
                    continue

                # Execute the data dumping process
                self.sql_connector.Append_SQL_Table(self.table_name, self.current_date, scraped_dataframe)

            print("Executed Successfully!")

        except Exception as e:
            # Handle exceptions here, or at least log them for debugging
            print(f"An error occurred during the dumping of the Mutual Fund Data: {e}")

### Execution

In [None]:
# Record the start time so the total run time can be reported at the end
t_start = time.time()

# Path to the local Chrome WebDriver executable
Driver_Path = "C:\\Users\\DELL\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

# Base page from which the equity fund links are collected
Base_Link = "https://www.etmoney.com/mutual-funds/equity"

# Name of the SQL table the scraped data is appended to
Mutual_Fund_Data_Table_Name = "MutualFundData"

# Initialize the Web_Scraping_Data class instance
# (SQL_Connector is not defined in this cell; presumably it comes from the
# SQL_Connection_File notebook run above — confirm)
Dump = Web_Scraping_Data(Base_Link, Driver_Path, SQL_Connector, Mutual_Fund_Data_Table_Name)

# Execute the scraping and data dumping
Dump.Mutual_Fund_Data()

t_end = time.time()

# Elapsed time is converted from seconds to hours and rounded to a whole number
print("\nTime taken to dump the Web Scraped Files: ", round((t_end - t_start) / 3600, 0), " Hours")

Link with inadequate data:  https://www.etmoney.com/mutual-funds/bank-of-india-large-and-mid-cap-equity-fund-direct-growth/portfolio-details/15834
Link with inadequate data:  https://www.etmoney.com/mutual-funds/sundaram-large-and-mid-cap-fund-direct-growth/portfolio-details/15653
Link with inadequate data:  https://www.etmoney.com/mutual-funds/canara-robeco-emerging-equities-fund-direct-growth/portfolio-details/16144
Link with inadequate data:  https://www.etmoney.com/mutual-funds/lic-mf-large-and-mid-cap-fund-direct-growth/portfolio-details/28923
Link with inadequate data:  https://www.etmoney.com/mutual-funds/invesco-india-growth-opportunities-fund-direct-growth/portfolio-details/16333
Link with inadequate data:  https://www.etmoney.com/mutual-funds/franklin-india-equity-advantage-fund-direct-growth/portfolio-details/15553
Link with inadequate data:  https://www.etmoney.com/mutual-funds/aditya-birla-sun-life-equity-advantage-fund-direct-growth/portfolio-details/15264
Link with inade