## Data collection

### Scrape headlines for each news category

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys 
import time

In [36]:
# Scrape headlines from the Google News home page, one topic tab at a time.
#
# Produces:
#   data    -- list of {'label': <tab text>, 'content': <headline text>} dicts
#   heading -- text of the last topic tab that was visited
NEWS_HOME_URL = "https://news.google.com/home?hl=en-IN&gl=IN&ceid=IN:en"

# Only tabs whose "data-n-ini" attribute lies in this inclusive range are
# visited. In the recorded run these were Business, Technology,
# Entertainment, Sports, Science and Health.
FIRST_TOPIC_INDEX = 8
LAST_TOPIC_INDEX = 13

# Fixed pause after clicking a tab, giving the page time to render.
PAGE_LOAD_WAIT_SECONDS = 5

heading = ""
data = []

driver = webdriver.Chrome()
try:
    driver.get(NEWS_HOME_URL)
    buttons = driver.find_elements(By.CLASS_NAME, "EctEBd")

    for button in buttons:
        data_n_ini = button.get_attribute("data-n-ini")

        # get_attribute() returns None when the attribute is missing; skip
        # such elements instead of crashing on None.isdigit().
        if data_n_ini is None or not data_n_ini.isdigit():
            continue

        # str.isdigit() accepts some characters int() rejects (e.g. "²"),
        # so the conversion is still guarded.
        try:
            topic_index = int(data_n_ini)
        except ValueError:
            continue

        if not FIRST_TOPIC_INDEX <= topic_index <= LAST_TOPIC_INDEX:
            continue

        a = button.find_element(By.TAG_NAME, 'a')
        a.click()
        heading = a.text  # read once; each .text access is a WebDriver call
        print("\n", heading)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)

        labels = driver.find_elements(By.CLASS_NAME, "gPFEn")

        for lab in labels:
            content = lab.text
            if content != "":
                data.append({'label': heading, 'content': content})
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()


 Business 


 Technology 


 Entertainment 


 Sports 


 Science 


 Health 



In [38]:
# Peek at the first scraped record to confirm the {'label', 'content'} layout.
data[0]

{'label': 'Business',
 'content': 'Union Budget 2025: Electric vehicles to become affordable'}

### Write the data to a CSV file

In [39]:
import csv

In [40]:
# Persist the scraped records to disk as a two-column CSV with a header row.
csv_file = "newsdataset.csv"
columns = ['label', 'content']

# newline="" lets the csv module control line endings itself.
with open(csv_file, "w", newline="", encoding="utf-8") as handle:
    dict_writer = csv.DictWriter(handle, fieldnames=columns)
    dict_writer.writeheader()
    for record in data:
        dict_writer.writerow(record)

print("done")

done


## Training

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Load the scraped headlines, shuffle the rows reproducibly, and add an
# integer-encoded copy of the category label.
label_encoder = LabelEncoder()

df = (
    pd.read_csv('newsdataset.csv')
    .sample(frac=1, random_state=42)   # frac=1 -> full-length shuffle
    .reset_index(drop=True)            # discard the pre-shuffle index
)
df['label_encoded'] = label_encoder.fit_transform(df['label'])

df.head(5)

Unnamed: 0,label,content,label_encoded
0,Business,Maruti Suzuki's vehicle dispatches from factor...,0
1,Business,Non-tax revenue from telecom pegged 33 pc lowe...,0
2,Health,AI model predicts dengue outbreaks two months ...,2
3,Health,Handful of California Almonds a Day: Natural A...,2
4,Entertainment,Sankranthiki Vasthunnam team plans a grand fin...,1


In [4]:
# 80/20 split of the whole DataFrame with a fixed seed.
# NOTE(review): train_data / test_data are not referenced by any later cell
# visible in this notebook; the model cells use the X_/y_ split instead.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Keras tokenizer configured with num_words=100 and "<OOV>" as the
# out-of-vocabulary token.
# NOTE(review): it is never fitted or used by the TF-IDF / Naive Bayes cells
# visible below -- confirm whether it is still needed.
tokenizer = Tokenizer(num_words = 100,oov_token="<OOV>")

In [6]:
# Split headline text and encoded labels into train/test sets
# (80/20, fixed seed), then report how many samples landed in each.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
)
print(
    f"Training set size: {X_train.shape}",
    f"Test set size: {X_test.shape}",
    sep="\n",
)

Training set size: (895,)
Test set size: (224,)


In [7]:
# TF-IDF features over the headline text, dropping English stop words and
# capping the vocabulary at 5000 terms (the recorded run produced 3469
# features, so the cap was not reached).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [8]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text, so the test set does not influence
# the learned features.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [9]:
# Report the (n_samples, n_features) shape of both TF-IDF matrices.
print(
    f"Training data shape: {X_train_tfidf.shape}",
    f"Test data shape: {X_test_tfidf.shape}",
    sep="\n",
)

Training data shape: (895, 3469)
Test data shape: (224, 3469)


In [10]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features and
# evaluate it on the held-out test set.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

y_pred = nb_classifier.predict(X_test_tfidf)

# Compute the metrics first, then print them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,  # show category names, not integer codes
)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8883928571428571
Classification Report:
                precision    recall  f1-score   support

     Business       0.85      0.95      0.90        63
Entertainment       0.93      0.93      0.93        40
       Health       0.88      0.93      0.90        30
      Science       1.00      0.32      0.48        19
       Sports       0.90      0.97      0.94        38
   Technology       0.91      0.91      0.91        34

     accuracy                           0.89       224
    macro avg       0.91      0.84      0.84       224
 weighted avg       0.90      0.89      0.88       224



#### Predict a sample headline

In [14]:
# Classify one ad-hoc headline with the fitted pipeline:
# text -> TF-IDF vector -> predicted integer code -> category name.
new_headline = " AI product"
# transform() expects an iterable of documents, hence the one-element list.
new_headline_tfidf = tfidf_vectorizer.transform([new_headline])
predicted_label_encoded = nb_classifier.predict(new_headline_tfidf)
# Map the integer code back to the original category string.
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)
print(f"The predicted category for the headline '{new_headline}' is: {predicted_label[0]}")

The predicted category for the headline ' AI product' is: Technology
