In [10]:
import warnings
warnings.filterwarnings(action='ignore')
import asyncio
from urllib.parse import urljoin

import joblib
import nest_asyncio
import pandas as pd
from pyppeteer import launch
from requests_html import AsyncHTMLSession

# Function to scrape news articles from Hindustan Times
async def scrape_hindustan_times():
    """Scrape headline text for every section linked from the Hindustan Times home page.

    The home page is fetched and rendered (JavaScript included) through
    ``requests_html``. Each ``<li>`` inside a ``.leftFixedNav`` container is
    treated as one section, and every section page is scraped with
    ``extract_articles_from_section``.

    Returns:
        pandas.DataFrame: one row per headline with the columns ``article``
        (headline text) and ``section`` (the navigation label it was found
        under). The frame is empty when no section yields any headline.

    Raises:
        Whatever the underlying HTTP request or page rendering raises; the
        session is closed before the exception propagates.
    """
    url = 'https://www.hindustantimes.com/'
    session = AsyncHTMLSession()
    articles_by_section = []

    try:
        r = await session.get(url)

        # Render with JavaScript and page down five times, one second apart, so
        # the scrolling happens on the very page that is parsed below.
        # Previously a second, independent pyppeteer browser did the scrolling,
        # but its page was never read, so it had no effect on the result and
        # was left running whenever a later step raised.
        await r.html.arender(scrolldown=5, sleep=1)

        # '.leftFixedNav' is the container holding the section navigation;
        # each 'li' inside it is one section entry.
        for section_container in r.html.find('.leftFixedNav'):
            for section in section_container.find('li'):
                anchor = section.find('a', first=True)
                # Entries without a link cannot be followed to a section page.
                if anchor is None or not anchor.attrs.get('href'):
                    continue

                # hrefs may be site-relative; resolve them against the home page.
                section_link = urljoin(url, anchor.attrs['href'])
                # Skip non-web targets such as 'javascript:' or 'mailto:' links.
                if not section_link.startswith(('http://', 'https://')):
                    continue

                section_name = anchor.text.strip()
                section_articles = await extract_articles_from_section(session, section_link)
                articles_by_section.extend(
                    (article_title, section_name) for article_title in section_articles
                )
    finally:
        # Release the session even when a request or render fails part-way.
        await session.close()

    # Create a DataFrame from the extracted (headline, section) pairs
    return pd.DataFrame(articles_by_section, columns=['article', 'section'])

# Function to extract articles from a section page
async def extract_articles_from_section(session, section_link):
    """Return the headline texts found on one section page.

    Args:
        session: object whose awaitable ``get(url)`` returns a response with an
            ``html`` attribute (an ``AsyncHTMLSession`` in this notebook).
        section_link: URL of the section page to fetch.

    Returns:
        list: the ``text`` of every element matching ``h3 a`` on the rendered
        page, in the order ``find`` returns them.
    """
    response = await session.get(section_link)
    page = response.html

    # Run the page's JavaScript before querying it.
    await page.arender()

    # Headlines are taken to be the links nested inside <h3> tags.
    titles = []
    for headline_link in page.find('h3 a'):
        titles.append(headline_link.text)
    return titles

# Patch asyncio so asyncio.run() below can be called from inside the Jupyter notebook;
# this must happen before the asyncio.run() call.
nest_asyncio.apply()

# Run the scraper coroutine to completion; df_result is the DataFrame it returns
# (columns 'article' and 'section') and is reused by the following cells.
df_result = asyncio.run(scrape_hindustan_times())

Future exception was never retrieved
future: <Future finished exception=NetworkError('Protocol error (Target.sendMessageToTarget): No session with given id')>
pyppeteer.errors.NetworkError: Protocol error (Target.sendMessageToTarget): No session with given id
Future exception was never retrieved
future: <Future finished exception=NetworkError('Protocol error (Target.sendMessageToTarget): No session with given id')>
pyppeteer.errors.NetworkError: Protocol error (Target.sendMessageToTarget): No session with given id
Future exception was never retrieved
future: <Future finished exception=NetworkError('Protocol error (Target.sendMessageToTarget): No session with given id')>
pyppeteer.errors.NetworkError: Protocol error (Target.sendMessageToTarget): No session with given id
Future exception was never retrieved
future: <Future finished exception=NetworkError('Protocol error (Target.sendMessageToTarget): No session with given id')>
pyppeteer.errors.NetworkError: Protocol error (Target.sendMes

In [11]:
# Store the scraped headlines in scraped_data.csv; index=False keeps the
# DataFrame's row index out of the file.
df_result.to_csv('scraped_data.csv', index=False)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a headline -> section text classifier.
# Relies on df_result (DataFrame with 'article' and 'section' columns) and on
# joblib, both provided by the scraping cell earlier in the notebook.

# Split the data into training and testing sets (80/20, fixed seed for repeatability)
train_data, test_data = train_test_split(df_result, test_size=0.2, random_state=42)

# Feature extraction using TfidfVectorizer.
# The vocabulary is fitted on the training headlines only and then reused,
# unchanged, to transform the test headlines.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_data['article'])
X_test = vectorizer.transform(test_data['article'])

# Create and train the text classification model (Random Forest Classifier)
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, train_data['section'])

# Save the trained model together with its fitted vectorizer.
# The classifier only understands the TF-IDF columns produced by this exact
# vectorizer, so a saved model without it cannot be applied to new headlines.
joblib.dump(classifier, 'text_classifier_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Predictions on the test dataset
predictions = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(test_data['section'], predictions)
classification_rep = classification_report(test_data['section'], predictions)

print(f'Accuracy: {accuracy:.2f}')
print('\nClassification Report:\n', classification_rep)

# Create a DataFrame pairing each test row's true section with the predicted one
eval_df = pd.DataFrame({
    'Actual': test_data['section'],
    'Predicted': predictions
})

# Save the evaluation report to a CSV file
eval_df.to_csv('evaluation_report.csv', index=False)

Accuracy: 0.24

Classification Report:
                precision    recall  f1-score   support

    Astrology       1.00      0.75      0.86         4
       Cities       1.00      0.50      0.67         6
      Cricket       0.67      0.25      0.36         8
   Editorials       1.00      0.10      0.18        10
    Education       0.14      0.88      0.25         8
Entertainment       0.00      0.00      0.00         5
         Home       0.00      0.00      0.00         8
        India       0.25      0.20      0.22         5
  Latest News       0.00      0.00      0.00         8
    Lifestyle       0.00      0.00      0.00         4
        World       0.00      0.00      0.00         5

     accuracy                           0.24        71
    macro avg       0.37      0.24      0.23        71
 weighted avg       0.39      0.24      0.21        71

