In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the five per-category CSV exports; each file lives inside a folder
# that carries the same name as the file.
csv_paths = [
    'entertainment_data.csv/entertainment_data.csv',
    'business_data.csv/business_data.csv',
    'education_data.csv/education_data.csv',
    'sports_data.csv/sports_data.csv',
    'technology_data.csv/technology_data.csv',
]
df1, df2, df3, df4, df5 = (pd.read_csv(path) for path in csv_paths)

In [3]:
# Stack the five per-category frames row-wise into a single DataFrame.
# ignore_index is not passed, so every row keeps the label it had in its
# source frame (labels therefore repeat across categories).
dfs=[df1,df2,df3,df4,df5]
df = pd.concat(dfs)

In [4]:
# Shuffle the rows (not the columns) so the categories are interleaved.
# A fixed random_state keeps the row order identical on every
# "Restart Kernel -> Run All"; 42 matches the seed used for the train/test split.
from sklearn.utils import shuffle
df = shuffle(df, random_state=42)


In [5]:
# Preview the first rows of the shuffled frame.
df.head()

Unnamed: 0,headlines,description,content,url,category
364,I’ve found happiness in even being able to bru...,He talked about how after the life threatening...,India player Rishabh Pant who is recovering fr...,https://indianexpress.com/article/sports/crick...,sports
35,HBSE 2024 Exams: Practical exams for Class 10t...,HBSE 2024 Exams: The BSEH Class 10 and Class 1...,The Haryana School Education Board (BSEH) will...,https://indianexpress.com/article/education/hb...,education
1189,‘I want him to crack JEE but don’t want to los...,"Packed schedule, cut-throat competition, const...",From mothers on sabbatical to grandparents and...,https://indianexpress.com/article/education/i-...,education
478,‘As rare as it gets’: Five wickets fall in las...,Tasmania won the summit clash by one run under...,Five wickets fell in a final over for the ages...,https://indianexpress.com/article/sports/crick...,sports
1392,‘Just because MS Dhoni came and won a World Cu...,"Speaking on his YouTube channel, Ashwin also u...",India all-rounder Ravichandran Ashwin has shar...,https://indianexpress.com/article/sports/crick...,sports


In [6]:
# Give the shuffled rows a fresh 0..n-1 index. The previous row labels are
# not discarded here: reset_index() moves them into a new "index" column.
df = df.reset_index()

In [7]:
# Preview again: the old row labels now appear as an "index" column.
df.head()

Unnamed: 0,index,headlines,description,content,url,category
0,364,I’ve found happiness in even being able to bru...,He talked about how after the life threatening...,India player Rishabh Pant who is recovering fr...,https://indianexpress.com/article/sports/crick...,sports
1,35,HBSE 2024 Exams: Practical exams for Class 10t...,HBSE 2024 Exams: The BSEH Class 10 and Class 1...,The Haryana School Education Board (BSEH) will...,https://indianexpress.com/article/education/hb...,education
2,1189,‘I want him to crack JEE but don’t want to los...,"Packed schedule, cut-throat competition, const...",From mothers on sabbatical to grandparents and...,https://indianexpress.com/article/education/i-...,education
3,478,‘As rare as it gets’: Five wickets fall in las...,Tasmania won the summit clash by one run under...,Five wickets fell in a final over for the ages...,https://indianexpress.com/article/sports/crick...,sports
4,1392,‘Just because MS Dhoni came and won a World Cu...,"Speaking on his YouTube channel, Ashwin also u...",India all-rounder Ravichandran Ashwin has shar...,https://indianexpress.com/article/sports/crick...,sports


In [8]:
# (rows, columns) of the combined frame.
df.shape

(10000, 6)

In [9]:
# Discard the columns that are not used as model input: the old row labels
# ("index") and the article URL.
df = df.drop(columns=["index", "url"])

In [10]:
# Number of rows per category label (class balance check).
df['category'].value_counts()

category
sports           2000
education        2000
entertainment    2000
business         2000
technology       2000
Name: count, dtype: int64

In [11]:
# mapping
def map_category(category):
    category_map = {
        'sports': 0,
        'business': 1,
        'entertainment': 2,
        'education': 3,
        'technology': 4
    }
    return category_map.get(category, -1)
# Encode every row's text label as its integer class id (new "Category" column).
df['Category'] = df['category'].map(map_category)

In [12]:
# The text label has been encoded into "Category", so drop the original column.
df = df.drop(columns=['category'])

In [13]:
# Preview the frame with the numeric "Category" column in place.
df.head()

Unnamed: 0,headlines,description,content,Category
0,I’ve found happiness in even being able to bru...,He talked about how after the life threatening...,India player Rishabh Pant who is recovering fr...,0
1,HBSE 2024 Exams: Practical exams for Class 10t...,HBSE 2024 Exams: The BSEH Class 10 and Class 1...,The Haryana School Education Board (BSEH) will...,3
2,‘I want him to crack JEE but don’t want to los...,"Packed schedule, cut-throat competition, const...",From mothers on sabbatical to grandparents and...,3
3,‘As rare as it gets’: Five wickets fall in las...,Tasmania won the summit clash by one run under...,Five wickets fell in a final over for the ages...,0
4,‘Just because MS Dhoni came and won a World Cu...,"Speaking on his YouTube channel, Ashwin also u...",India all-rounder Ravichandran Ashwin has shar...,0


In [14]:
# Merge the three text fields into one "Content" column, then drop the originals.
# Missing values are replaced with '' first: str + NaN evaluates to NaN, so a
# single missing field would otherwise turn the whole row's Content into a float
# and break the later .split() / re.sub() calls.
text_columns = ['headlines', 'description', 'content']
df['Content'] = (
    df['headlines'].fillna('') + ' '
    + df['description'].fillna('') + ' '
    + df['content'].fillna('')
)

df.drop(text_columns, axis=1, inplace=True)

In [15]:
# Preview: only "Category" and the merged "Content" column remain.
df.head()

Unnamed: 0,Category,Content
0,0,I’ve found happiness in even being able to bru...
1,3,HBSE 2024 Exams: Practical exams for Class 10t...
2,3,‘I want him to crack JEE but don’t want to los...
3,0,‘As rare as it gets’: Five wickets fall in las...
4,0,‘Just because MS Dhoni came and won a World Cu...


In [16]:
# Count whitespace-separated tokens over all articles before any preprocessing.
total_word_count = sum(len(article.split()) for article in df['Content'])
print(total_word_count)

2646524


In [17]:
# Single shared Porter stemmer instance, used by stemming() below.
ps = PorterStemmer()

In [18]:
# Build the English stop-word lookup once. The original called
# stopwords.words('english') inside the per-token filter, which rebuilt the word
# list and scanned it linearly for every single token; a frozenset built up
# front gives the same membership answers with O(1) lookups.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise raw article text into a space-separated string of word stems.

    Processing steps, in order:
      1. every character that is not an ASCII letter becomes a space
         (digits, punctuation and non-ASCII characters are removed);
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped;
      4. each remaining token is reduced with the Porter stemmer ``ps``.

    Args:
        content: The text to clean (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
    return " ".join(stemmed_tokens)

In [19]:
# Clean and stem every article in place of the raw text.
df['Content'] = df['Content'].map(stemming)

In [20]:
# Preview the stemmed, stop-word-free text.
df.head()


Unnamed: 0,Category,Content
0,0,found happi even abl brush teeth everi day ris...
1,3,hbse exam practic exam class th th start feb h...
2,3,want crack jee want lose process worri parent ...
3,0,rare get five wicket fall last australian wome...
4,0,ms dhoni came world cup soon took charg mean h...


In [21]:
# Token count after preprocessing: roughly 1M tokens (stop words, numbers,
# etc.) were removed compared with the raw count.
total_word_count = sum(len(article.split()) for article in df['Content'])
print(total_word_count)

1580692


In [22]:
# Features are the cleaned article text; targets are the integer class ids.
X, Y = df['Content'], df['Category']

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for evaluation. The split is stratified on Y and
# uses a fixed seed so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [25]:
# Vectorization: convert the cleaned text into TF-IDF features.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
# NOTE(review): X_train / X_test are rebound from text to the transformed
# output, so re-running this cell without first re-running the split cell
# would feed already-vectorized data back into the vectorizer.
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train)
X_test = vc.transform(X_test)

In [26]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.svm import SVC

In [27]:
# Train a support-vector classifier (all SVC settings left at their defaults
# except verbose, which enables the LibSVM training log) on the TF-IDF features.
model = SVC(verbose=2)
model.fit(X_train, Y_train)

# Predict the held-out split.
Y_pred_cls = model.predict(X_test)

# Overall accuracy, support-weighted F1 and the per-class breakdown.
accuracy_cls = accuracy_score(Y_test, Y_pred_cls)
f1_cls = f1_score(Y_test, Y_pred_cls, average='weighted')
report_cls = classification_report(Y_test, Y_pred_cls)

# Print the classification summary, one entry per line.
print(
    "Model: SVM",
    f"Accuracy: {accuracy_cls:.4f}",
    f"F1-Score: {f1_cls:.4f}",
    f"Classification Report:\n{report_cls}",
    "=" * 50,
    sep="\n",
)

[LibSVM]Model: SVM
Accuracy: 0.9845
F1-Score: 0.9846
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       400
           1       0.99      0.97      0.98       400
           2       1.00      1.00      1.00       400
           3       1.00      0.98      0.99       400
           4       0.95      0.98      0.97       400

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000



In [28]:
def val_to_category(val):
    """Translate an integer class id back into its category name.

    Args:
        val: Class id; 0-4 correspond to 'sports', 'business',
            'entertainment', 'education' and 'technology' respectively.

    Returns:
        The category name for a known id, or the integer -1 for any other value.
    """
    id_to_label = dict(enumerate(
        ('sports', 'business', 'entertainment', 'education', 'technology')
    ))
    if val in id_to_label:
        return id_to_label[val]
    return -1
    

In [29]:
def make_predictions(headlines, description, content):
    """Classify one news item and print its predicted category.

    The three text fields are joined with single spaces, cleaned with
    ``stemming``, vectorized with the fitted vectorizer ``vc`` and passed to
    the trained ``model``; the predicted class id is converted to a name with
    ``val_to_category``.

    Args:
        headlines: Headline text (str).
        description: Short description text (str); may be empty.
        content: Article body text (str); may be empty.

    Returns:
        None. The result is printed, not returned.
    """
    combined_text = " ".join((headlines, description, content))
    features = vc.transform([stemming(combined_text)])
    predicted_id = int(model.predict(features)[0])
    print("News category is : ", val_to_category(predicted_id))

In [30]:
# Demo: hand-written headline, description and body about a batting century.
make_predictions("kohli got his 50th century","kholi has scored his 50th century today at stadium","the run machine kholi hit another milestone my scoring his 50th odi centry toda at stadium")


News category is :  sports


In [31]:
# Demo: hand-written headline, description and body about a company chairman's earnings.
make_predictions("ambani earns 10M in a hour" ," Mukesh Ambani earnd 10M ruppes in a single hour","Mukesh Ambani the chairman of Reliance Industries Limited has earns around 10M ruppes every single hour this shows the that how much potential in there in indian market")

News category is :  business


In [32]:
# Demo: headline only; description and content are passed as empty strings.
make_predictions("Change is NCERT syllabus","","") # the model can produce a prediction from the headline alone

News category is :  education


In [33]:
# Probe: a very short, nonsensical headline with no description or body.
make_predictions("tennis is a dish","","")

News category is :  sports


In [68]:
# Probe: the same sentence template as the previous probe with a different first word.
make_predictions("marvel is a dish","","")

News category is :  technology


In [87]:
# Probe: two-word headline and two-word description, empty body.
make_predictions("man explode","many die","")

News category is :  technology


In [97]:
# Probe: three-word headline only.
make_predictions("book to read","","")

News category is :  education


In [119]:
# Probe: three-word headline only.
# NOTE(review): in the saved run this prints "entertainment"; predictions from
# inputs this short should be treated with caution.
make_predictions("share went up","","")

News category is :  entertainment
