In [None]:

import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:

pd.set_option('display.max_rows', 700)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
data=pd.read_csv('Articles.csv',encoding='ISO-8859-1')


In [None]:
print('Number of rows: ',data.shape[0])
print('Number of columns: ',data.shape[1])

Number of rows:  2692
Number of columns:  4


In [None]:
# Printing columns in the data sets
print(data.columns)

Index(['Article', 'Date', 'Heading', 'NewsType'], dtype='object')


In [None]:
print(data['NewsType'].value_counts())

NewsType
sports      1408
business    1284
Name: count, dtype: int64


In [None]:

# Printing some info related to the dataset
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2692 entries, 0 to 2691
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Article   2692 non-null   object
 1   Date      2692 non-null   object
 2   Heading   2692 non-null   object
 3   NewsType  2692 non-null   object
dtypes: object(4)
memory usage: 84.3+ KB
None


In [None]:

# removing the date columns since it is not a useful feature to classify articles
df=data.drop(columns=['Date'])

In [None]:
df.head()

Unnamed: 0,Article,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,us oil prices slip below 50 a barr,business


In [None]:

'''This cell collects words in a row of two columns, heading and article, creates a word list
and stores it in 'words' list variable'''
words=list()
arr=[]
for i,row in df.iterrows():
    temp=row['Heading']+' '+row['Article']
    words.append(temp.split(' '))

In [None]:

print('\'words\' variable contains for example: ')
print(words[0])

'words' variable contains for example: 
['sindh', 'govt', 'decides', 'to', 'cut', 'public', 'transport', 'fares', 'by', '7pc', 'kti', 'rej', 'KARACHI:', 'The', 'Sindh', 'government', 'has', 'decided', 'to', 'bring', 'down', 'public', 'transport', 'fares', 'by', '7', 'per', 'cent', 'due', 'to', 'massive', 'reduction', 'in', 'petroleum', 'product', 'prices', 'by', 'the', 'federal', 'government,', 'Geo', 'News', 'reported.Sources', 'said', 'reduction', 'in', 'fares', 'will', 'be', 'applicable', 'on', 'public', 'transport,', 'rickshaw,', 'taxi', 'and', 'other', 'means', 'of', 'traveling.Meanwhile,', 'Karachi', 'Transport', 'Ittehad', '(KTI)', 'has', 'refused', 'to', 'abide', 'by', 'the', 'government', 'decision.KTI', 'President', 'Irshad', 'Bukhari', 'said', 'the', 'commuters', 'are', 'charged', 'the', 'lowest', 'fares', 'in', 'Karachi', 'as', 'compare', 'to', 'other', 'parts', 'of', 'the', 'country,', 'adding', 'that', '80pc', 'vehicles', 'run', 'on', 'Compressed', 'Natural', 'Gas', '(CNG

In [None]:
# doing some necessary cleaning in the 'words' list
for i in range(len(words)):
    for j in range(len(words[i])):
        words[i][j]=words[i][j].replace(':','')
        if not words[i][j].isalpha():
            words[i][j]=''

In [None]:

# counting words and storing it in a dictionary format: 'word':'occurence number'
words_dict=Counter()
for i in range(len(words)):
    words_dict+=Counter(words[i])

In [None]:

type(Counter(words[0]))

In [None]:

del words_dict['']
#deleting dictionary key where key is ''

In [None]:

len(words_dict)
# words_dict contains 25494 key-value pairs

25494

In [None]:

# Taking out most common 3500 words out of 25494
# we will use these 3500 words to train our model
words_dict=words_dict.most_common(3500)

In [None]:
# feature engineering
features=[]
for i in range(len(words)):
    t=words[i]
    data=[]
    for i in words_dict:
        data.append(t.count(i[0]))
    features.append(data)

In [None]:
# Dependent variable 'x'
x=np.array(features)

In [None]:

x[0]

array([5, 5, 4, ..., 0, 0, 0])

In [None]:
x.shape

(2692, 3500)

In [None]:

# Since we need to predict the class of the article, NewsType will be our target variable
df['NewsType']=df['NewsType'].replace({'sports':0,'business':1})

target=df['NewsType'].iloc[:].values

  df['NewsType']=df['NewsType'].replace({'sports':0,'business':1})


In [None]:

target.shape

(2692,)

In [None]:

# Target Variable
y=np.array(target)

In [None]:
classifier=MultinomialNB()

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=7)



In [None]:

classifier.fit(x_train,y_train)


In [None]:

y_pred=classifier.predict(x_test)

In [None]:

accuracy_score(y_test,y_pred)*100

99.25788497217069

In [None]:

def checker(heading,body):
    """Pass a heading, and body of the article to use this function.
        Returns whether article is sports or business."""
    temp=heading+' '+body
    t=temp.split(' ')
    data=[]
    for i in words_dict:
        data.append(t.count(i[0]))
    ans=classifier.predict(np.array(data).reshape(1,3500))[0]
    if ans==0:
        return 'sports'
    if ans==1:
        return 'business'

In [46]:
heading1="""Lionel Messi Wins 7th Ballon d'Or Award"""
body1="""Lionel Messi has won his 7th Ballon d'Or award, surpassing the record previously held by Cristiano Ronaldo. This achievement solidifies Messi's position as one of the greatest soccer players of all time. The award ceremony took place in Paris on December 12, 2021."""

In [47]:
heading2="""Tiger Woods Returns to Competitive Golf"""
body2="""Tiger Woods is set to return to competitive golf after a lengthy injury layoff. The 15-time major champion has been recovering from a car accident and subsequent surgeries. """

In [48]:
ans=checker(heading2,body2)

In [49]:
ans

'sports'