In [8]:
import requests
from bs4 import BeautifulSoup
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
import matplotlib.pyplot as plt
import pickle
from textblob import TextBlob
import langid
from sqlalchemy import create_engine
import easygui as gui
import seaborn as sns

# Vocabulary size handed to the Keras Tokenizer (num_words) and the Embedding layer (input_dim).
max_words = 10000
# Length every tokenized title is padded or truncated to before it reaches the model.
max_seq_length = 100

# MySQL connection settings used to build the SQLAlchemy engine below.
db_username = "root"
db_password = ""
db_host = "localhost"
db_port = "3306"
db_name = "Title"

# Module-level engine (pymysql driver) used by the functions below that query the `heading` table.
engine = create_engine(f"mysql+pymysql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}")

def custom_tokenizer(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return the result.

    You can replace the body with your own tokenization logic.
    """
    # Imported here because the file never imports ``word_tokenize`` at module
    # level, so the original call failed with NameError.
    # NOTE(review): NLTK's tokenizer data presumably has to be downloaded once
    # (nltk.download) before this works - confirm on the target machine.
    from nltk.tokenize import word_tokenize

    return word_tokenize(text)

def custom_sequence(tokenizer, texts, max_words, max_seq_length):
    """Turn ``texts`` into fixed-length integer sequences.

    ``tokenizer.texts_to_sequences`` converts the texts, and the result is
    padded and truncated at the end ('post') to ``max_seq_length``.
    ``max_words`` is accepted for signature compatibility but is not used here.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        truncating='post',
        padding='post',
        maxlen=max_seq_length,
    )

def scrape_data_from_tags(csv_path=r'D:\Aunik\Research\New\Scraped_new.csv',
                          tags_path=r'D:\Aunik\Research\New\tags.json',
                          total_limit=990,
                          request_timeout=30):
    """Scrape headline text from the pages in the tags file and append new titles to a CSV.

    The tags file is a JSON object whose values are objects with a ``link``
    entry. For every link the page is downloaded and the text of its ``h3``
    elements is collected (falling back to ``h2``, ``h4``, ``h5`` when the page
    has none). Titles already in the CSV, or already seen during this run, are
    skipped.

    Args:
        csv_path: CSV file with a ``Title`` column that new titles are appended to.
        tags_path: JSON file describing the pages to scrape.
        total_limit: Maximum number of new titles collected in one run.
        request_timeout: Seconds to wait for each HTTP request.

    Returns:
        None. Progress and the final count are printed.
    """
    # NOTE(review): insert_data_into_database() reads its CSV from a G: drive
    # path by default while this function writes to a D: drive path - confirm
    # which location is intended.
    current_total = 0
    unique_titles = set()
    new_titles = []

    # Remember whether the file exists *before* writing: it decides whether a
    # header row is needed when appending.
    csv_exists = os.path.exists(csv_path)
    if csv_exists:
        existing_df = pd.read_csv(csv_path)
        # Drop NaN so missing cells do not end up in the set of known titles.
        unique_titles.update(existing_df['Title'].dropna().astype(str))

    with open(tags_path, encoding='utf-8') as data_file:
        tags = json.load(data_file)

    for tag, value in tags.items():
        # The cap applies to the whole run, not per page; without this check
        # every remaining page would still be fetched and add one more title.
        if current_total >= total_limit:
            break

        link = value.get('link')
        if not link:
            continue

        print("Scraping headlines from", tag, "at", link)
        try:
            response = requests.get(link, timeout=request_timeout)
            # Do not harvest the headings of an HTTP error page as headlines.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {tag}: request failed ({exc})")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Prefer h3 headlines; fall back to h2, h4 and h5 in that order.
        headlines = []
        for level in ('h3', 'h2', 'h4', 'h5'):
            headlines = soup.find_all(level)
            if headlines:
                break

        for headline in headlines:
            title = headline.text.strip()
            # Skip blank headings and anything already stored or seen.
            if not title or title in unique_titles:
                continue
            new_titles.append(title)
            unique_titles.add(title)
            current_total += 1
            print(current_total, title)
            if current_total >= total_limit:
                break

    # Append only if there are new unique titles.
    if new_titles:
        df_to_insert = pd.DataFrame({'Title': new_titles})
        # Write the header only when creating the file; repeating it on every
        # append would insert a literal "Title" row into the data.
        df_to_insert.to_csv(csv_path, mode='a', header=not csv_exists, index=False)

    print(f"Total unique titles collected: {current_total}")

def translate_to_english(text):
    """Translate ``text`` to English with googletrans and return the translated string."""
    # Imported here because the file never imports ``Translator`` at module
    # level, so the original call failed with NameError. The notebook's pip
    # hint names the package as googletrans==4.0.0-rc1.
    from googletrans import Translator

    translator = Translator()
    translation = translator.translate(text, dest='en')
    return translation.text

# Language detection helper backed by langid
def detect_language(text):
    """Return the first element of ``langid.classify(text)``.

    Callers compare the result against a language code such as ``'en'``.
    """
    classification = langid.classify(text)
    return classification[0]

def insert_data_into_database(csv_path='G:\\research\\New\\Scraped_new.csv',
                              labeled_csv_path='G:\\research\\New\\labeled_data.csv'):
    """Label scraped titles with a sentiment class and append new ones to the ``heading`` table.

    Steps: read the ``Title`` column from ``csv_path``, translate titles that
    langid does not classify as English, score each title with TextBlob
    polarity, label polarity below 0.2 as "negative" (0) and everything else
    as "positive" (1), write the labelled frame to ``labeled_csv_path`` and
    append rows whose title is not already stored to the ``heading`` table.

    Args:
        csv_path: CSV file with a ``Title`` column.
        labeled_csv_path: Destination of the labelled CSV copy.

    Returns:
        None. A status message is printed.
    """
    # Local import: the file only imports ``create_engine`` from sqlalchemy.
    from sqlalchemy import inspect

    # NOTE(review): the default path points at a G: drive while
    # scrape_data_from_tags() writes to a D: drive by default - confirm.
    df = pd.read_csv(csv_path)

    # Missing cells would crash langid/TextBlob, and CSVs appended with
    # header=True contain literal "Title" rows; drop both, plus blanks.
    df = df.dropna(subset=['Title']).copy()
    df['Title'] = df['Title'].astype(str).str.strip()
    df = df[(df['Title'] != '') & (df['Title'] != 'Title')].copy()

    # Translate non-English texts to English.
    df['Title'] = df['Title'].apply(
        lambda x: translate_to_english(x) if detect_language(x) != 'en' else x
    )

    def sentiment(text):
        """Return the TextBlob polarity of ``text``."""
        return TextBlob(text).sentiment.polarity

    df['sentiment'] = df['Title'].apply(sentiment)
    df['sentiment_class'] = np.where(df['sentiment'] < 0.2, "negative", "positive")
    # Fixed mapping instead of LabelEncoder: when a batch contains only one
    # class, LabelEncoder would give that class code 0 whatever it is, which
    # conflicts with rows stored by earlier runs.
    df['label_encoded'] = np.where(df['sentiment_class'] == "positive", 1, 0)
    df.to_csv(labeled_csv_path)

    # Keep only the columns stored in the database, without duplicates.
    df_to_insert = df[['Title', 'sentiment_class', 'label_encoded']].drop_duplicates()

    # Reuse the module-level engine rather than rebuilding the same connection
    # URL here. Skip titles already stored so re-running this step does not
    # append the same rows again.
    if inspect(engine).has_table('heading'):
        stored = pd.read_sql("SELECT * FROM heading", con=engine)
        stored.columns = stored.columns.str.lower()
        if 'title' in stored.columns:
            df_to_insert = df_to_insert[~df_to_insert['Title'].isin(stored['title'])]

    if df_to_insert.empty:
        print("No new rows to insert.")
        return

    df_to_insert.to_sql(name='heading', con=engine, if_exists='append', index=False)

    print("Data inserted successfully.")

def train_model(model_path='G:\\research\\New\\sentiment_model.h5',
                tokenizer_file_path='G:\\research\\New\\tokenizer.pkl'):
    """Train the LSTM sentiment classifier on the ``heading`` table and save it.

    The titles are tokenized with a Keras ``Tokenizer`` limited to
    ``max_words`` words, padded/truncated to ``max_seq_length``, and fed to an
    Embedding -> LSTM -> Dense(sigmoid) model trained for 10 epochs. The model
    is saved to ``model_path`` and the fitted tokenizer is pickled to
    ``tokenizer_file_path``.

    Args:
        model_path: Destination file for the trained Keras model.
        tokenizer_file_path: Destination file for the pickled tokenizer.

    Returns:
        None. Status messages are printed.
    """
    df = pd.read_sql("SELECT * FROM heading", con=engine)

    # insert_data_into_database() writes the column as 'Title'; normalise the
    # case so the lookups below work however the table spells it.
    df.columns = df.columns.str.lower()
    df = df.dropna(subset=['title', 'label_encoded'])
    if df.empty:
        print("No training data found in the 'heading' table.")
        return

    tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(df['title'])

    sequences = tokenizer.texts_to_sequences(df['title'])

    X = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')
    y = df['label_encoded']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    embedding_dim = 100
    lstm_units = 192

    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_seq_length))
    # NOTE(review): dropout=0.9 drops 90% of the LSTM inputs - confirm this is
    # intentional and not a typo for a smaller rate.
    model.add(LSTM(lstm_units, dropout=0.9, recurrent_dropout=0.3))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=64)

    model.save(model_path)

    # Always overwrite the tokenizer: the saved model is only meaningful with
    # the vocabulary it was trained on, so keeping an older pickle would pair
    # the new model with stale word indices.
    with open(tokenizer_file_path, 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)
    print("Tokenizer saved successfully.")

    print("Model trained and saved successfully.")

def test_model(model_path='G:\\research\\New\\sentiment_model.h5',
               tokenizer_file_path='G:\\research\\New\\tokenizer.pkl',
               threshold=0.5):
    """Prompt for a title and print the sentiment class predicted by the saved model.

    Args:
        model_path: File the trained Keras model is loaded from.
        tokenizer_file_path: Pickle file holding the tokenizer fitted in train_model().
        threshold: Sigmoid output at or above which the title is "positive".

    Returns:
        None. The predicted class is printed.
    """
    title = input("Enter the title to test the model: ")

    # The model's embedding indices are tied to the training vocabulary, so
    # the tokenizer saved by train_model() must be reused. A tokenizer fitted
    # on the input title alone assigns unrelated indices and makes the
    # prediction meaningless.
    if not os.path.exists(tokenizer_file_path):
        print("Tokenizer file not found. Train the model first.")
        return
    # NOTE: pickle can execute arbitrary code on load; only load a tokenizer
    # file this program wrote itself.
    with open(tokenizer_file_path, 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    # Load the trained model
    model = load_model(model_path)

    # Tokenize and pad the input title with the training tokenizer
    sequences = custom_sequence(tokenizer, [title], max_words, max_seq_length)

    # Predict sentiment
    prediction = model.predict(sequences)[0][0]

    # The model outputs a sigmoid value trained against 0/1 labels, so the
    # decision boundary is 0.5 by default. The 0.2 used previously is the
    # TextBlob polarity cut-off from the labelling step, not a probability.
    sentiment_class = "positive" if prediction >= threshold else "negative"

    print(f"Predicted Sentiment Class: {sentiment_class}")

def menu():
    """Show the easygui button box with the program's actions and return what it returns."""
    return gui.buttonbox(
        "Select an option:",
        title="Choose an option",
        choices=[
            "Scrape Data",
            "Insert into Database",
            "Train Model",
            "Test Model",
            "Visualize Sentiment Distribution",
            "End",
        ],
    )

def visualize_sentiment_distribution():
    """Plot a count of rows per ``sentiment_class`` from the ``heading`` table.

    Prints a message instead of plotting when the query returns no rows.
    """
    heading_rows = pd.read_sql("SELECT * FROM heading", con=engine)

    # Nothing to draw: report and leave.
    if heading_rows.empty:
        print("The DataFrame is empty. No data to visualize.")
        return

    # One bar per sentiment class, drawn in red and blue.
    plt.figure(figsize=(8, 6))
    sns.countplot(x='sentiment_class', data=heading_rows, palette=["#FF0000", "#0000FF"])
    plt.title('Sentiment Distribution')
    plt.xlabel('Sentiment Class')
    plt.ylabel('Count')
    plt.show()

# Menu label -> handler for every action except "End".
_MENU_ACTIONS = {
    "Scrape Data": scrape_data_from_tags,
    "Insert into Database": insert_data_into_database,
    "Train Model": train_model,
    "Test Model": test_model,
    "Visualize Sentiment Distribution": visualize_sentiment_distribution,
}

# Keep showing the menu until the user picks "End".
while True:
    user_choice = menu()

    if user_choice == "End":
        print("Ending the program.")
        break

    handler = _MENU_ACTIONS.get(user_choice)
    if handler is None:
        # Any value that is not a known label lands here.
        print("Invalid choice. Please choose from the provided options.")
        continue

    handler()


Scraping headlines from washingtonpost at https://www.washingtonpost.com/
1 Senate careens toward failed vote on Ukraine aid as GOP pushes for border measures
2 Special counsel alleges Trump ‘sent’ supporters on path to Jan. 6 violence
3 Israel has vowed to destroy Hamas. Yet the group remains largely intact.
4 Fires devastated their Maui town. Football helped bring them back together.
5 How to reset recommendations in Netflix, YouTube and Prime Video
6 Woman sentenced to fast-food job after hurling Chipotle bowl at worker
7 Has Balenciaga lost its luster?
8 The new pregnancy announcement is no announcement
9 ‘Bombing mishap’ by Nigeria military kills 85 civilians
10 Unexploded bombs, many U.S.-made, could make parts of Gaza uninhabitable
11 Scientists studied twins’ diets. Those who ate vegan saw fast results.
12 Michigan fan sues to keep ‘G0BLUE’ license plate after state reassigned it
13 Young conservatives want the Republican Party to make space for them
14 Senate Democrats press B

125 These two states have surprisingly high nursing home covid vaccination rates
126 To find what society needs, look on the sidelines of kids’ soccer
127 Enough with all the fatalism about a Trump dictatorship
128 Required viewing for fans of Trump’s budding dictatorship
129 Back-seat driver
130 A Trump dictatorship is increasingly inevitable. We should stop pretending.
131 Lily Gladstone and Erica Tremblay on their ‘love letter’ to Native communities
132 The folly of poll-dependent commentary
133 Why abortion is not the silver bullet that Democrats need in 2024
134 Why U.S. aid for Ukraine is a bargain
135 Politicians in both parties need to face up to the national debt
136 Why are we still so fascinated by Napoleon after more than 200 years?
137 For mental health at work, bosses can make it better — or worse
138 What Mehdi Hasan’s cancellation shows about MSNBC
139 The Global South hasn’t forgotten Kissinger
140 How two gun-friendly senators are turning the tide on gun safety
141 Mi

250 Renewables and EVs are soaring. It’s still not enough.
251 Farmers race to innovate as climate change threatens African food supply
252 How a single word could hold up global talks to save the planet
253 Companies made big climate pledges. Now they are balking on delivering.
254 A super solar storm rocked Earth in 1872. They’re more common than you think.
255 A blind but elusive critter that was presumed extinct is rediscovered
256 COP28 live updates: Oil companies make surprise pledge to curb world’s most pressing climate threat
258 Where malaria is spreading
259 The inequality of heat
260 How soon do you have to buy heat pumps and EVs to avoid climate catastrophe?
261 An invisible killer
262 Why you should think twice about cranking up your thermostat as it gets cold
263 The government is giving out money for energy upgrades. What to buy now.
264 A plane fueled by fat and sugar has crossed the Atlantic Ocean
265 An architect has found a way to build flood-proof homes
266 The easy

In [2]:
#pip install nltk

In [8]:
#pip install googletrans==4.0.0-rc1

In [9]:
#pip install easygui

In [3]:
#pip install langid

In [12]:
#pip install pymysql

In [62]:
import requests
from bs4 import BeautifulSoup

# Other pages tried during exploration (currently disabled):
#   https://www.nytimes.com/international/
#   https://www.thedailystar.net/sports
#   https://www.washingtonpost.com/
response = requests.get('https://www.washingtonpost.com/climate-environment/')
soup = BeautifulSoup(response.content, 'html.parser')

# Print the text of every <h2> element on the page, one per line.
headlines = soup.find_all('h2')
for headline in headlines:
    print(headline.text)

Two people were killed by sharks in encounters that are very rare 
The surprisingly simple way to convince people to go green 
Oil, gas and coal interests swarm global climate summit in Dubai
Renewables and EVs are soaring. It’s still not enough.
Farmers race to innovate as climate change threatens African food supply
How a single word could hold up global talks to save the planet
There’s a crisis in the Yukon River
Companies made big climate pledges. Now they are balking on delivering.
A super solar storm rocked Earth in 1872. They’re more common than you think.
A blind but elusive critter that was presumed extinct is rediscovered
COP28 live updates: Oil companies make surprise pledge to curb world’s most pressing climate threat
Where malaria is spreading
The inequality of heat
How soon do you have to buy heat pumps and EVs to avoid climate catastrophe? 
An invisible killer
Farmers race to innovate as climate change threatens African food supply
Why you should think twice about cranki

In [50]:
import requests
from bs4 import BeautifulSoup
import json


def print_headlines(response_text):
    """Print the text of every element whose ``itemprop`` attribute is ``headline``.

    ``response_text`` is parsed with BeautifulSoup using the ``lxml`` parser.
    """
    parsed_page = BeautifulSoup(response_text, 'lxml')
    for node in parsed_page.find_all(attrs={"itemprop": "headline"}):
        print(node.text)


# Download the page at `url` and print the headlines that print_headlines() finds in it.
url = 'https://inshorts.com/en/read'
response = requests.get(url)
print_headlines(response.text)

Gangster Rohit Godara takes responsibility for Karni Sena chief Sukhdev's murder 
Which Indian universities feature in global sustainability rankings?
Video of Karni Sena chief Sukhdev Singh being shot dead at his Jaipur house surfaces
Aamir Khan stuck in Chennai floods, pics of him being rescued on boat surface
Attackers had tea with Karni Sena chief Sukhdev Singh at his house before killing him
