# 🗞️ News Classifier
### - ___Building an Automated News Classification System with NLP Techniques___.
-------------------------------------------------------------------------------------------------------------

## Step 3: Extracting news content from the headline URLs by web scraping.
 - ### **"Selenium"** is used for scraping data from the web page.

In [5]:
#Importing Necessary Libraries
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException,NoSuchWindowException
from tqdm import tqdm
import math

Connecting to the MySQL database to pull and store the data:

In [6]:
#Importing the mysql library to connect mysql database with python script
import mysql.connector

In [7]:
#Connecting with MYSQL database
# NOTE(review): the credentials below are hard-coded in the notebook; consider
# loading them from the environment or a config file that is not committed.
try:
    my_db = mysql.connector.connect(
        host="localhost",
        user="root",
        password="123",
        database="news_classifier")
except mysql.connector.Error as err:
    print("Error connecting to MySQL:", err)
    # Re-raise: without a connection, creating the cursor below would only
    # fail later with an unrelated NameError on `my_db`.
    raise
else:
    print("Connected to MySQL database successfully.")
#cursor object (buffered, as in the original setup)
my_cursor = my_db.cursor(buffered=True)

Connected to MySQL database successfully.


 - ### Loading the headline URL data from the MySQL database

In [8]:
# Tables holding the headline URLs, one table per news category.
category_table_list = [
    "business_headlines_url",
    "education_headlines_url",
    "sports_headlines_url",
    "technology_headlines_url",
    "entertainment_headlines_url",
]
#lists to store the urls
business_url_list = []
education_url_list = []
sports_url_list = []
technology_url_list = []
entertainment_url_list = []
# Maps each table name to the list that collects its URLs, replacing the
# repeated per-row `if table == ...` checks.
url_lists_by_table = {
    "business_headlines_url": business_url_list,
    "education_headlines_url": education_url_list,
    "sports_headlines_url": sports_url_list,
    "technology_headlines_url": technology_url_list,
    "entertainment_headlines_url": entertainment_url_list,
}
#--------------------------------------------------------------------------------------
#Querying the database to retrieve the required data
# Every table is queried: the previous version had a stray `break` that left
# the loop after the first table, so only the business URLs were loaded.
for table in category_table_list:
    # Table names cannot be bound as query parameters; interpolation is
    # acceptable here because they come from the fixed list above.
    select_query = f"SELECT * FROM {table}"
    my_cursor.execute(select_query)
    # The second column of each row is stored as the URL
    # (presumably the first column is a row id -- TODO confirm against the schema).
    url_lists_by_table[table].extend(row[1] for row in my_cursor.fetchall())
#----------------------------------------------------------------------------------------
#Creating dataframes of individual categories, each with a single "url" column

#business
business_headlines_url_df = pd.DataFrame(business_url_list, columns=["url"])
#education
education_headlines_url_df = pd.DataFrame(education_url_list, columns=["url"])
#sports
sports_headlines_url_df = pd.DataFrame(sports_url_list, columns=["url"])
#technology
technology_headlines_url_df = pd.DataFrame(technology_url_list, columns=["url"])
#entertainment
entertainment_headlines_url_df = pd.DataFrame(entertainment_url_list, columns=["url"])

Creating a table in the "news_classifier" database to store the scraped data:

In [None]:
#--------------------------------------------------------------------------------------------------------------
#Creating a table to store the data
# IF NOT EXISTS makes this cell safe to re-run: an existing "content" table is left untouched.
create_table_query = """CREATE TABLE IF NOT EXISTS content(
    id INT AUTO_INCREMENT PRIMARY KEY,
        headline TEXT,
        description TEXT,
        content TEXT,
        url TEXT,
        category VARCHAR(50)
    )
    """
my_cursor.execute(create_table_query)
my_db.commit()
# The previous status message was missing its verb.
print('"content" table created successfully.')

Data Extraction by web scraping :

In [None]:
# Module-level accumulators for the scraped data: five independent, initially
# empty lists (headline, description, content, url, category).
(
    headlines_list,
    description_list,
    content_list,
    url_list,
    category_list,
) = ([] for _ in range(5))

In [None]:
#Function for extracting the content from headlines url
def _scrape_article(driver, page_url):
    """Load ``page_url`` and return its ``(headline, description, content)`` texts.

    Nothing is written to the shared result lists here, so a failure part-way
    through a page cannot leave those lists with different lengths.

    Raises:
        TimeoutException: the page load or the wait for the headline timed out.
        NoSuchElementException: the content or description element is missing.
    """
    driver.get(page_url)
    # Fixed pause after loading, kept from the original flow.
    time.sleep(3)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Each text is read immediately after its element is located.
    content_text = driver.find_element(By.CSS_SELECTOR, "#pcl-full-content ").text
    headline_text = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div:nth-child(1) > div > h1"))
    ).text
    description_text = driver.find_element(By.CSS_SELECTOR, "div:nth-child(1) > div > h2").text
    return headline_text, description_text, content_text


def content_extraction(headlines_df, starting_index, ending_index, category, max_retries=3):
    """Scrape headline, description and content for a slice of headline URLs.

    Results are appended to the module-level lists ``headlines_list``,
    ``description_list``, ``content_list``, ``url_list`` and ``category_list``.
    All five are appended together, so they stay index-aligned.

    Args:
        headlines_df: DataFrame whose first column holds the article URLs.
        starting_index: Position of the first row to process (inclusive).
        ending_index: Position of the last row to process (exclusive).
        category: Label stored alongside every article scraped in this call.
        max_retries: How many times a URL is attempted when it times out
            before it is skipped. Replaces the previous unbounded recursive
            retry, which ended in ``RecursionError`` on a permanently slow page.

    Outcome per URL:
        * All elements found: the scraped texts are recorded.
        * Content/description element missing but a ``body > p`` element is
          present: a row with ``None`` texts is recorded for the URL.
        * Neither: the URL is skipped and nothing is recorded.
    """
    #-------------------------------------------------------------------------------
    #Using selenium's google chrome web driver
    chrome_options = webdriver.ChromeOptions()
    #Turn off the images,gifs,videos to avoid increased loading time, because this information is not needed
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.gifs": 2,
        "profile.managed_default_content_settings.videos": 2,
    }
    chrome_options.add_experimental_option("prefs", prefs)
    #------------------------------------------------------------------------------
    driver = webdriver.Chrome(options=chrome_options)
    print(starting_index, "--", ending_index)
    try:
        driver.set_page_load_timeout(12)
        selected_rows = headlines_df.iloc[starting_index:ending_index, :]
        progress = tqdm(selected_rows.iterrows(), desc="Processing", total=len(selected_rows))
        # current_index follows the row position, so the printed progress
        # stays accurate even for rows that end up skipped.
        for current_index, (_, row) in enumerate(progress, start=starting_index):
            page_url = row.iloc[0]
            for _attempt in range(max_retries):
                try:
                    headline, description, content = _scrape_article(driver, page_url)
                except (TimeoutError, TimeoutException):
                    print("Exception,reloading...")
                    continue
                except NoSuchElementException:
                    try:
                        # Probe for a bare paragraph directly under <body>.
                        driver.find_element(By.CSS_SELECTOR, "  body > p")
                    except Exception:
                        # Deliberate best effort: any failure of the probe
                        # means the URL is skipped without recording a row.
                        print("Iterated_Upto :", current_index)
                    else:
                        headlines_list.append(None)
                        description_list.append(None)
                        content_list.append(None)
                        url_list.append(page_url)
                        category_list.append(category)
                        print("no_content,Iterated_Upto :", current_index)
                    break
                else:
                    headlines_list.append(headline)
                    description_list.append(description)
                    content_list.append(content)
                    url_list.append(page_url)
                    category_list.append(category)
                    #----------------------------------------
                    print("Iterated_Upto :", current_index)
                    break
            else:
                # Every attempt timed out; move on instead of retrying forever.
                print("Skipped after", max_retries, "timed-out attempts :", page_url)
    finally:
        # Always close the browser, otherwise each call leaks a Chrome process.
        driver.quit()
                


In [None]:
#Function to upload the data into MYSQL database
def upload_to_mysql():
    """Insert the scraped articles held in the module-level lists into ``content``.

    Reads ``headlines_list``, ``description_list``, ``content_list``,
    ``url_list`` and ``category_list`` in lockstep and inserts one row per
    position through the shared ``my_cursor``. The lists are zipped, so
    iteration stops at the shortest list.

    All rows are committed together once the loop finishes, instead of
    issuing one commit per row.
    """
    # The statement never changes, so it is built once outside the loop.
    insert_query = "INSERT INTO content(headline, description, content, url, category) VALUES (%s, %s, %s, %s, %s) "
    for values in zip(headlines_list, description_list, content_list, url_list, category_list):
        # Parameterized execute: the connector handles quoting of the scraped text.
        my_cursor.execute(insert_query, values)
    my_db.commit()

In [None]:
#list of dataframes, in the same order as the category labels below
df_list = [
    business_headlines_url_df,
    education_headlines_url_df,
    sports_headlines_url_df,
    technology_headlines_url_df,
    entertainment_headlines_url_df,
]
# NOTE(review): "bussiness" is misspelled, but it is the label written to the
# database; it is left unchanged here because other code may depend on it.
category_name_list = ["bussiness", "education", "sports", "technology", "entertainment"]
#-------------------------------------------------------------------------------------------------------------------
for df, category in zip(df_list, category_name_list):
    headlines_df = df
    starting_index = 0
    ending_index = len(df)
    #---------------------------------------------------------------------------
    #Calling the function to extract the content by headlines url
    content_extraction(headlines_df, starting_index, ending_index, category)
    #---------------------------------------------------------------------------
    #Calling the function to store the extracted information in MYSQL Database.
    upload_to_mysql()
    #---------------------------------------------------------------------------
    #Empty every result list after uploading so the next category starts clean.
    # content_list was previously not cleared, so later categories were
    # uploaded with content left over from earlier ones.
    headlines_list.clear()
    description_list.clear()
    content_list.clear()
    url_list.clear()
    category_list.clear()
    print(category, "data stored to database sucessfully.")
    