In [1]:
from SportsScrapper import BCCI_Scrapper, ICC_Scrapper, Indian_Athletes_Scrapper
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from textblob import TextBlob
import numpy as np

# Sentiment pipeline built without naming a model, so transformers falls back
# to its default checkpoint (the cell output below reports
# distilbert-base-uncased-finetuned-sst-2-english). Used by detect_influence.
sentiment_pipeline = pipeline('sentiment-analysis')
# Text classifier that detect_influence queries for a 'POLITICS' label.
# NOTE(review): this checkpoint name ends in "mnli"; presumably it emits NLI
# labels rather than topic labels -- confirm it can ever return 'POLITICS'.
political_influence_model = pipeline(
    'text-classification', model='typeform/distilbert-base-uncased-mnli')
# Tokenizer and encoder read as module-level globals by bert_embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


def bert_embedding(text):
    """Return a mean-pooled BERT embedding of ``text`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), run through the module-level ``model``, and the last
    hidden state is averaged over dimension 1 before being detached and
    converted with ``.numpy()``.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    )
    hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.detach().numpy()


def search_official(query, player_type, player_platform, search_type):
    """Collect official player records from the sports scrappers.

    Args:
        query: Player name handed to every scrapper.
        player_type: Forwarded to the BCCI scrapper only.
        player_platform: Forwarded to the BCCI scrapper only.
        search_type: ``'bcci'``, ``'icc'`` or ``'indian_athletes'`` to query a
            single source; any other value queries all three, in that order.

    Returns:
        list: The concatenated ``'Response'`` entries of the queried sources.
        A source whose reply carries no list under ``'Response'`` (for
        example an error payload) contributes nothing instead of raising
        ``KeyError``.
    """
    def _from_bcci():
        # Note the argument order expected by the BCCI scrapper:
        # platform first, then player type.
        return BCCI_Scrapper().get_player_data(
            query, player_platform, player_type)

    def _from_icc():
        return ICC_Scrapper().get_player_data(query)

    def _from_indian_athletes():
        return Indian_Athletes_Scrapper().get_player_data(query)

    # Insertion order doubles as the query order for the "all sources" case.
    sources = {
        'bcci': _from_bcci,
        'icc': _from_icc,
        'indian_athletes': _from_indian_athletes,
    }
    selected = [fetch for key, fetch in sources.items() if key == search_type]
    if not selected:
        selected = list(sources.values())

    data = []
    for fetch in selected:
        response = fetch()
        items = response.get('Response') if isinstance(response, dict) else None
        if isinstance(items, (list, tuple)):
            data.extend(items)
    return data

def search_unofficial(query):
    """Fetch news articles about ``query`` from the NewsAPI ``everything`` endpoint.

    The query is lower-cased and its spaces are replaced by hyphens before it
    is sent. The API key is read from the ``NEWSAPI_KEY`` environment variable
    so that no credential lives in the notebook.

    Args:
        query: Free-text search string.

    Returns:
        list: The ``articles`` list of the JSON reply, or ``[]`` when the
        reply has no articles.

    Raises:
        RuntimeError: If ``NEWSAPI_KEY`` is not set.
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
    """
    import os

    api_key = os.environ.get('NEWSAPI_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the NEWSAPI_KEY environment variable to query NewsAPI.')

    query = query.lower().replace(' ', '-')
    url = 'https://newsapi.org/v2/everything?'
    parameters = {
        'q': query,
        'apiKey': api_key,
        'sources': 'the-times-of-india,the-hindu,hindustan-times,the-indian-express,news18,ndtv,india-today,zee-news,abp-news,india-tv,republic-world,the-quint,the-wire,scroll,the-print',
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=parameters, timeout=30)
    # Surface HTTP errors here rather than as a confusing KeyError below.
    response.raise_for_status()
    payload = response.json()
    return payload.get('articles') or []


# Outlet names whose articles receive a fixed +0.4 credibility bonus.
_TRUSTED_SOURCE_NAMES = frozenset([
    'The Times of India', 'The Hindu', 'Hindustan Times', 'The Indian Express',
    'News18', 'NDTV', 'India Today', 'Zee News', 'ABP News', 'India TV',
    'Republic World', 'The Quint', 'The Wire', 'Scroll', 'The Print',
])


def assess_truth(unofficial_data, official_data):
    """Score each unofficial article by its similarity to the official records.

    For every article, the title and description are compared against each
    official record's title and player name using TF-IDF cosine similarity
    (the vectorizer is refitted for every pair). The best similarity is the
    base score; articles from a trusted outlet get +0.4, capped at 1.

    Args:
        unofficial_data: Iterable of article dicts with ``title``,
            ``description`` and ``source['name']``.
        official_data: Iterable of official record dicts with ``title`` and
            ``player_name``.

    Returns:
        list: One score per article, in input order. With no official records
        the base score is 0.0 instead of raising ``ValueError`` from ``max``.
    """
    truth_values = []
    vectorizer = TfidfVectorizer()
    for article in unofficial_data:
        # Missing or null fields become empty text rather than the word "None".
        unofficial_text = (
            f"{article.get('title') or ''} {article.get('description') or ''}")
        similarity_scores = []

        for official_article in official_data:
            official_text = (
                f"{official_article.get('title') or ''} "
                f"{official_article.get('player_name') or ''}")
            try:
                vectors = vectorizer.fit_transform(
                    [official_text, unofficial_text])
            except ValueError:
                # Neither text yielded a usable token, so nothing can match.
                similarity_scores.append(0.0)
                continue
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
            similarity_scores.append(similarity)

        truth_value = max(similarity_scores, default=0.0)
        source = article.get('source') or {}
        if source.get('name') in _TRUSTED_SOURCE_NAMES:
            truth_value = min(truth_value + 0.4, 1)
        truth_values.append(truth_value)
    return truth_values


def detect_influence(article):
    """Flag whether an article's content looks politically or emotionally loaded.

    Args:
        article: Article dict; only ``content`` is read.

    Returns:
        tuple: ``(political_influence, emotional_influence)`` booleans.
        Articles with missing, null or blank content yield ``(False, False)``
        without calling either model.
    """
    content = article.get('content') or ''
    if not content.strip():
        # Nothing to classify; the pipelines would fail on None.
        return False, False

    sentiment_result = sentiment_pipeline(content)
    # NOTE(review): if the sentiment model only ever emits NEGATIVE/POSITIVE,
    # this flag is always True -- confirm whether a score threshold was meant.
    emotional_influence = sentiment_result[0]['label'] in [
        'NEGATIVE', 'POSITIVE']

    political_result = political_influence_model(content)
    # NOTE(review): verify the configured classifier can produce a 'POLITICS'
    # label at all; otherwise this flag is always False.
    political_influence = any(
        label['label'] == 'POLITICS' and label['score'] > 0.5
        for label in political_result)
    return political_influence, emotional_influence


def cluster_articles(articles):
    """Group articles into at most five K-Means clusters of their TF-IDF content.

    Args:
        articles: Sequence of article dicts; only ``content`` is read.

    Returns:
        dict: Always has the keys ``0`` to ``4``, each mapping to the list of
        articles assigned to that cluster. When fewer than five articles are
        given, only that many clusters are fitted and the remaining keys map
        to empty lists; with no articles every list is empty.
    """
    true_k = 5
    cluster_dict = {i: [] for i in range(true_k)}
    if not articles:
        return cluster_dict

    # Null content is treated as empty text so the vectorizer does not crash.
    contents = [article.get('content') or '' for article in articles]
    X = TfidfVectorizer(stop_words='english').fit_transform(contents)

    # K-Means cannot fit more clusters than there are samples.
    n_clusters = min(true_k, len(articles))
    # Named `kmeans` so it does not shadow the module-level BERT `model`.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)

    for article, label in zip(articles, kmeans.labels_):
        cluster_dict[int(label)].append(article)

    return cluster_dict


def sentiment_analysis(article):
    """Compute TextBlob polarity and subjectivity for an article's content.

    Args:
        article: Article dict; only ``content`` is read. Missing or null
            content is analysed as empty text instead of raising.

    Returns:
        dict: ``polarity`` and ``subjectivity`` as reported by TextBlob, plus
        ``polarity_label`` (positive / negative / neutral by the sign of the
        polarity) and ``subjectivity_label`` (subjective when the
        subjectivity is at least 0.5, objective otherwise).
    """
    analysis = TextBlob(article.get('content') or '')
    polarity = analysis.sentiment.polarity
    subjectivity = analysis.sentiment.subjectivity

    if polarity > 0:
        sentiment = 'Positive, which means the text is expressing positive emotions or opinions'
    elif polarity < 0:
        sentiment = 'Negative, which means the text is expressing negative emotions or opinions'
    else:
        sentiment = 'Neutral, which means the text is neither positive nor negative'

    if subjectivity >= 0.5:
        objectivity = 'Subjective, which means the text is based on opinions or beliefs'
    else:
        objectivity = 'Objective, which means the text is based on facts or evidence'

    return {
        'polarity': polarity,
        'polarity_label': sentiment,
        'subjectivity': subjectivity,
        'subjectivity_label': objectivity
    }


def relevance_score(article, query):
    """Measure how closely an article matches ``query`` in two ways.

    The article's title, description and content are joined with single
    spaces and compared to the query by TF-IDF cosine similarity and by the
    cosine similarity of their ``bert_embedding`` vectors.

    Returns:
        dict: ``{'tfidf_similarity': ..., 'bert_similarity': ...}``.
    """
    combined_text = (
        f"{article['title']} {article['description']} {article['content']}"
    )

    # Lexical overlap: fit TF-IDF on just the (query, article) pair.
    tfidf_matrix = TfidfVectorizer().fit_transform([query, combined_text])
    lexical = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    # Semantic overlap: compare the pooled BERT embeddings.
    query_vec = bert_embedding(query)
    article_vec = bert_embedding(combined_text)
    semantic = cosine_similarity(query_vec, article_vec)[0][0]

    return {
        'tfidf_similarity': lexical,
        'bert_similarity': semantic
    }


def search(query, player_type, player_platform, type):
    """Run the whole pipeline for one query and return the annotated results.

    Official records and news articles are fetched, then every article is
    annotated in place with a truth value, influence flags, sentiment fields
    and relevance scores. The articles are returned ordered by descending
    ``truth_value``.

    Returns:
        dict: ``official_data`` (the official records), ``unofficial_data``
        (the annotated, sorted articles) and ``clusters`` (the result of
        ``cluster_articles`` on the fetched articles).
    """
    official_data = search_official(query, player_type, player_platform, type)
    unofficial_data = search_unofficial(query)

    # Per-article analyses, each list aligned with `unofficial_data`.
    truth_values = assess_truth(unofficial_data, official_data)
    influences = [detect_influence(item) for item in unofficial_data]
    clustered_articles = cluster_articles(unofficial_data)
    sentiments = [sentiment_analysis(item) for item in unofficial_data]
    relevance_scores = [relevance_score(item, query)
                        for item in unofficial_data]

    for position, article in enumerate(unofficial_data):
        article['truth_value'] = truth_values[position]

        political, emotional = influences[position]
        article['political_influence'] = political
        article['emotional_influence'] = emotional

        mood = sentiments[position]
        article['sentiment_polarity'] = mood['polarity']
        article['sentiment_polarity_label'] = mood['polarity_label']
        article['sentiment_subjectivity'] = mood['subjectivity']
        article['sentiment_subjectivity_label'] = mood['subjectivity_label']

        relevance = relevance_scores[position]
        article['relevance_score_tfidf'] = relevance['tfidf_similarity']
        article['relevance_score_bert'] = relevance['bert_similarity']

    # Stable sort on a copy: highest truth value first, ties keep fetch order.
    ranked = list(unofficial_data)
    ranked.sort(key=lambda item: item['truth_value'], reverse=True)

    return {
        'official_data': official_data,
        'unofficial_data': ranked,
        'clusters': clustered_articles,
    }

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` f

In [2]:
import json
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that also serializes NumPy scalars and arrays.

    ``json`` rejects NumPy types such as ``float32`` or ``int64``. Any NumPy
    scalar is converted to the matching Python value and any ``ndarray`` to a
    nested list; every other unsupported object still raises ``TypeError``
    through the base class.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj`` or defer to the base class."""
        if isinstance(obj, np.generic):
            # Covers float32 (the original case) plus the other scalar types.
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [3]:
# Run the full pipeline for one player against the BCCI source and
# pretty-print the result. CustomEncoder is needed because the scores include
# NumPy values (such as float32) that the plain json encoder rejects.
result = search('virat kohli' , 'men', 'international', 'bcci')
print(json.dumps(result, cls=CustomEncoder, indent=2))

{
  "official_data": [
    {
      "title": "Terrific Running Catch ft. Virat Kohli",
      "date": "17th Jan, 2024",
      "views": 20300,
      "platform": "international",
      "type": "men",
      "player_name": "virat kohli",
      "image_url": "https://bcciplayerimages.s3.ap-south-1.amazonaws.com/resizedimageskirti/3588749423001/379f20f7-9221-4dd7-892e-84fdbe579024/2d8f8913-98dc-473b-a5ec-767bfdf1ddc7/1280x720/match/image_compress.jpeg",
      "link": "https://www.bcci.tv/bccilink/videos/3sb2iQND",
      "sport": "Cricket"
    },
    {
      "title": "IND vs AFG 2024, 3RD T20I: Virat Kohli Wicket",
      "date": "17th Jan, 2024",
      "views": 5500,
      "platform": "international",
      "type": "men",
      "player_name": "virat kohli",
      "image_url": "https://bcciplayerimages.s3.ap-south-1.amazonaws.com/resizedimageskirti/3588749423001/24ee868a-1376-4441-a432-30bd036586a4/4c00e264-d9ba-4762-a725-9f1c9216c27f/1280x720/match/image_compress.jpeg",
      "link": "https://ww

In [65]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os


class Investopedia_Scrapper:
    """Scrape Investopedia search results together with the linked articles.

    ``get_query_data`` is the entry point: it runs a search, opens each
    result's article page and returns one record per result holding the
    title, description, byline details, body text and key points.
    """

    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        """Create an HTTP session that sends a desktop-browser User-Agent."""
        self.url = "https://www.investopedia.com/search?"
        self.query = None
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def get_query_data(self, query):
        """Search Investopedia for ``query`` and return the scraped records.

        Args:
            query: Free-text search string.

        Returns:
            dict: ``{"Response": [record, ...]}`` on success, otherwise
            ``{"Error": message}`` -- for blank input, for a failed request
            (the request error text is passed through unchanged instead of
            being wrapped under ``"Response"``), or when nothing was found.
        """
        from urllib.parse import quote_plus

        if not query or not query.strip():
            return {"Error": "Invalid input"}

        # quote_plus turns spaces into "+" and also escapes characters such
        # as "&" or "#" that would otherwise corrupt the query string.
        processed_query = quote_plus(query.strip())
        self.query = processed_query

        url = self.url + "q=" + processed_query
        response = self.get_data(processed_query, url)

        if isinstance(response, dict):
            # get_data signals failure with an {"Error": ...} dict.
            return response
        if response:
            return {"Response": response}
        return {"Error": "No data found"}

    @staticmethod
    def _descend(node, *steps):
        """Follow nested ``find`` lookups; return None as soon as one misses.

        Each step is a ``(tag_name, css_class)`` pair; a ``css_class`` of
        None searches by tag name only.
        """
        for tag_name, css_class in steps:
            if node is None:
                return None
            if css_class is None:
                node = node.find(tag_name)
            else:
                node = node.find(tag_name, class_=css_class)
        return node

    def _person_from_group(self, group, item_class):
        """Return ``(name, profile_link)`` from a byline group, or two empty strings."""
        anchor = self._descend(
            group, ('div', item_class), ('div', None), ('a', None))
        if anchor is None:
            return "", ""
        return anchor.text, anchor.get('href', "")

    def _parse_article(self, home_soup):
        """Extract body text, key points and byline details from an article page.

        Every field defaults to an empty string when its markup is absent.
        """
        details = {
            "date": "", "author": "", "author_link": "",
            "reviewer": "", "reviewer_link": "",
            "fact_checker": "", "fact_checker_link": "",
            "content": "", "keypoints": "",
        }

        content_holder = self._descend(
            home_soup,
            ('div', 'loc article-content'),
            ('div', 'comp article-body mntl-block'),
            ('div', 'comp mntl-sc-page mntl-block article-body-content'))
        if content_holder is not None:
            details["content"] = "".join(
                p.text + " " for p in content_holder.find_all('p'))
            bullet_list = self._descend(
                content_holder,
                ('div', 'comp mntl-sc-block finance-sc-block-callout mntl-block'),
                ('div', 'comp mntl-sc-block mntl-sc-block-callout mntl-block theme-whatyouneedtoknow'),
                ('div', 'comp mntl-sc-block-callout-body mntl-text-block'),
                ('ul', None))
            if bullet_list is not None:
                # Always a string, even when the list has no items.
                details["keypoints"] = "".join(
                    "* " + item.text + " " for item in bullet_list.find_all('li'))

        meta_holder = self._descend(
            home_soup,
            ('div', 'loc article-pre-content'),
            ('header', 'comp article-header mntl-block right-rail__offset js-toc-appear'),
            ('div', 'comp article-meta mntl-block'),
            ('div', 'comp finance-bylines mntl-bylines'))
        if meta_holder is None:
            return details

        author_group = meta_holder.find(
            'div', class_='comp mntl-bylines__group mntl-block mntl-bylines__group--author')
        if author_group is not None:
            details["author"], details["author_link"] = self._person_from_group(
                author_group,
                'comp mntl-bylines__item mntl-attribution__item mntl-attribution__item--has-date')
            date_holder = author_group.find(
                'div', class_='mntl-attribution__item-date')
            if date_holder is not None:
                details["date"] = date_holder.text

        reviewer_group = meta_holder.find(
            'div', class_='comp mntl-bylines__group mntl-block mntl-bylines__group--finance_reviewer')
        details["reviewer"], details["reviewer_link"] = self._person_from_group(
            reviewer_group, 'comp mntl-bylines__item mntl-attribution__item')

        fact_checker_group = meta_holder.find(
            'div', class_='comp mntl-bylines__group mntl-block mntl-bylines__group--fact_checker')
        details["fact_checker"], details["fact_checker_link"] = self._person_from_group(
            fact_checker_group, 'comp mntl-bylines__item mntl-attribution__item')

        return details

    def get_data(self, query, url):
        """Fetch the search page at ``url`` and scrape every result it lists.

        Args:
            query: Stored verbatim in each record's ``"query"`` field.
            url: Fully built search URL.

        Returns:
            list | dict: A non-empty list of record dicts on success. An
            ``{"Error": message}`` dict when the search request or any
            article request fails, or when no results are found.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            return {"Error": str(e)}

        soup = BeautifulSoup(response.text, 'html.parser')
        results_holder = soup.find(
            'div', class_='comp search-results__results mntl-block')
        if results_holder is None:
            return {"Error": "No data found"}

        data = []
        cards = results_holder.find_all(
            'div', class_='comp search-results__list mntl-block')
        for card in cards:
            link_holder = card.find('a')
            link = link_holder.get('href') if link_holder is not None else None
            if not link:
                # Without a link there is no article to open; skip the card.
                continue

            title_holder = link_holder.find('h3')
            title = title_holder.text if title_holder is not None else ""
            desc_holder = card.find(
                'div', class_='comp search-results__description mntl-text-block')
            desc = desc_holder.text if desc_holder is not None else ""

            try:
                home_page = self.session.get(link, timeout=self.REQUEST_TIMEOUT)
                home_page.raise_for_status()
            except requests.RequestException as e:
                # A failed article request aborts the whole scrape.
                return {"Error": str(e)}

            details = self._parse_article(
                BeautifulSoup(home_page.text, 'html.parser'))

            data.append({
                "title": title,
                "description": desc,
                "date": details["date"],
                "author": details["author"],
                "author_link": details["author_link"],
                "reviewer": details["reviewer"],
                "reviewer_link": details["reviewer_link"],
                "fact_checker": details["fact_checker"],
                "fact_checker_link": details["fact_checker_link"],
                "content": details["content"],
                "link": link,
                "keypoints": details["keypoints"],
                "query": query,
                "category": "Finance"
            })

        if data:
            return data
        return {"Error": "No data found"}

In [66]:
# Smoke-test the scraper with a single query and show the raw return value.
query = 'Mukesh Ambani'
scrapper = Investopedia_Scrapper()
response = scrapper.get_query_data(query)
print(response)

Here
Here
Here
{'Response': [{'title': 'Who Is Mukesh Ambani? ', 'description': 'Mukesh Ambani became one of the richest people in the world by inheriting Reliance Industries and helping to turn it into a market leader across several industries.', 'date': 'Updated March 18, 2024', 'author': 'Ravi Srikant', 'author_link': 'https://www.investopedia.com/contributors/53585/', 'reviewer': 'Gordon Scott', 'reviewer_link': 'https://www.investopedia.com/contributors/82594/', 'fact_checker': '', 'fact_checker_link': '', 'content': " The richest man in India and, for a brief time in 2008, a contender for the title of the richest man in the world, Mukesh Ambani is the chair and managing director of Reliance Industries, a sprawling Indian conglomerate with interests in refining, oil and gas, petrochemicals, telecoms, retail, and media.\n  Ambani had a net worth of $113.9 billion as of March 2024. Most of this comes from his 42% stake in Reliance Industries, which claims to be the largest oil refin

In [67]:
import json 

# Pretty-print the scraper response as indented JSON for easier reading.

print(json.dumps(response, indent=2))


{
  "Response": [
    {
      "title": "Who Is Mukesh Ambani? ",
      "description": "Mukesh Ambani became one of the richest people in the world by inheriting Reliance Industries and helping to turn it into a market leader across several industries.",
      "date": "Updated March 18, 2024",
      "author": "Ravi Srikant",
      "author_link": "https://www.investopedia.com/contributors/53585/",
      "reviewer": "Gordon Scott",
      "reviewer_link": "https://www.investopedia.com/contributors/82594/",
      "fact_checker": "",
      "fact_checker_link": "",
      "content": " The richest man in India and, for a brief time in 2008, a contender for the title of the richest man in the world, Mukesh Ambani is the chair and managing director of Reliance Industries, a sprawling Indian conglomerate with interests in refining, oil and gas, petrochemicals, telecoms, retail, and media.\n  Ambani had a net worth of $113.9 billion as of March 2024. Most of this comes from his 42% stake in Relianc

In [1]:
import os
import requests
import json
from urllib.parse import quote

# The Perigon API key is read from the environment so it is never stored in
# the notebook. A key that was previously committed here should be revoked.
api_key = os.environ['PERIGON_API_KEY']
base_url = f'https://api.goperigon.com/v1/all?apiKey={api_key}&q='
query = 'BMC Introduces Comprehensive Policy After Ghatkopar Hoarding Collapse Tragedy'

# Percent-encode the headline so characters such as "&" or "#" cannot break
# the query string; the timeout keeps a stalled request from hanging the cell.
response = requests.get(base_url + quote(query, safe=''), timeout=30)
# Fail loudly on an HTTP error instead of printing an error payload as data.
response.raise_for_status()
data = response.json()
print(json.dumps(data, indent=2))

{
  "status": 200,
  "numResults": 10000,
  "articles": [
    {
      "url": "https://www.freepressjournal.in/mumbai/mumbai-bmc-introduces-comprehensive-policy-after-ghatkopar-hoarding-collapse-tragedy",
      "authorsByline": "SHEFALI PARAB-PANDIT",
      "articleId": "aab81e49ae654c1a9bdbc291a5b756ba",
      "clusterId": "a6c2a099e8e54101ad2415d8fd9bb221",
      "source": {
        "domain": "freepressjournal.in",
        "location": null
      },
      "imageUrl": "https://media.assettype.com/freepressjournal/2024-05/051d5e01-d5bc-43ed-a5ff-0ce2aee7e9ae/Untitled_design__2_.jpg",
      "country": "in",
      "language": "en",
      "pubDate": "2024-06-14T18:37:11+00:00",
      "addDate": "2024-06-14T18:43:26.546943+00:00",
      "refreshDate": "2024-06-14T18:43:26.546945+00:00",
      "score": 381.59982,
      "title": "Mumbai: BMC Introduces Comprehensive Policy After Ghatkopar Hoarding Collapse Tragedy",
      "description": "The BMC's new draft policy for hoardings proposes to fix