In [1]:
import pandas as pd
import numpy as np 
import spacy 


In [38]:
# Load the raw news dataset; the file is decoded as latin-1.
df = pd.read_csv(
    filepath_or_buffer="News Categoires.csv",
    encoding="latin-1",
)

In [39]:
# Class distribution of the raw data (number of articles per category).
df.Category.value_counts()

Category
Entertainment               39
Food                        36
Economy                     35
Sports                      32
International relations     32
Health                      27
Artificial Intelligence     27
Politics                    24
Name: count, dtype: int64

In [21]:
# Count missing values per column before any sampling or text processing.
df.isnull().sum()

News        0
Category    0
dtype: int64

In [44]:
# Downsample each retained category to `min_sample` rows so the classes are balanced.
# 24 matches the smallest class count shown above (Politics).
min_sample = 24


def _sample_category(category_name):
    """Return `min_sample` rows of `df` for one category, drawn with a fixed seed."""
    in_category = df.Category == category_name
    return df[in_category].sample(min_sample, random_state=42)


df_enter = _sample_category('Entertainment')
df_food = _sample_category('Food')
df_eco = _sample_category('Economy')
df_sports = _sample_category('Sports')
df_international_r = _sample_category('International relations')
df_health = _sample_category('Health')
df_politics = _sample_category('Politics')
# NOTE(review): 'Artificial Intelligence' (27 rows in the counts above) is never sampled,
# so that class is absent from every later step -- confirm the omission is intentional.

In [28]:
# Confirm the Entertainment sample has the requested number of rows.
print(df_enter.shape[0])

24


In [45]:
# Stack the per-category samples row-wise into one balanced frame;
# the original row labels from `df` are preserved.
category_samples = [
    df_enter,
    df_food,
    df_eco,
    df_sports,
    df_international_r,
    df_health,
    df_politics,
]
balanced_df = pd.concat(category_samples, axis=0)

In [46]:
# Inspect the combined frame (the display is truncated to the first and last rows).
balanced_df

Unnamed: 0,News,Category
159,"Travis Kelce, star NFL player, sparks online b...",Entertainment
162,Teddi Mellencamp undergoes surgery to remove c...,Entertainment
50,Actress Yami Gautam Dhar speaks out about the ...,Entertainment
59,Kannada cinema sees a surge in excitement as D...,Entertainment
156,Aamir Khan takes an active role in preparation...,Entertainment
...,...,...
72,"On January 3, 2023, the U.S. House of Represen...",Politics
75,"On October 30, 2023, Jair Bolsonaro lost his r...",Politics
79,Iran and world powers reached a new agreement ...,Politics
247,The war in Ukraine continues with no end in si...,Politics


In [47]:
# Every retained category should now have the same number of rows.
balanced_df["Category"].value_counts()

Category
Entertainment              24
Food                       24
Economy                    24
Sports                     24
International relations    24
Health                     24
Politics                   24
Name: count, dtype: int64

In [54]:
# Encode each category name as an integer label for the classifier.
category_to_num = {
    'Entertainment': 0,
    'Food': 1,
    'Economy': 2,
    'Sports': 3,
    'International relations': 4,
    'Health': 5,
    'Politics': 6,
}
balanced_df['Category_num'] = balanced_df['Category'].map(category_to_num)

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw article text for evaluation. Stratifying on the
# numeric label keeps the per-class proportions the same in both splits, and
# the fixed seed makes the split reproducible.
category_labels = balanced_df["Category_num"]
x_train, x_test, y_train, y_test = train_test_split(
    balanced_df["News"],
    category_labels,
    test_size=0.2,
    random_state=42,
    stratify=category_labels,
)

In [60]:
# Number of training documents produced by the split.
print(x_train.shape)

(134,)


In [61]:
# Peek at the first few training documents (index values are the original row labels).
x_train.head()

11     After a turbulent year, the cryptocurrency mar...
181    Negotiations and discussions regarding Iran's ...
162    Teddi Mellencamp undergoes surgery to remove c...
215    Novak Djokovic claimed his seventh Wimbledon t...
159    Travis Kelce, star NFL player, sparks online b...
Name: News, dtype: object

In [62]:
# Per-class counts in the test split, to check the stratified split kept classes even.
y_test.value_counts()

Category_num
6    5
2    5
4    5
5    5
1    5
3    5
0    4
Name: count, dtype: int64

In [75]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


In [92]:
# Bag-of-words baseline: unigram + bigram counts fed into multinomial Naive Bayes.
count_step = ('count', CountVectorizer(ngram_range=(1, 2)))
model_step = ('model', MultinomialNB())
clf = Pipeline(steps=[count_step, model_step])

In [93]:
# Fit the whole pipeline on the training split; the fitted pipeline is displayed as the cell output.
clf.fit(x_train,y_train)

0,1,2
,steps,"[('count', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [94]:
# Predict integer category labels for the held-out documents.
y_pred=clf.predict(x_test)


In [95]:
# Per-class precision/recall/F1, followed by overall accuracy on the held-out split.
report_text = classification_report(y_test, y_pred)
overall_accuracy = accuracy_score(y_test, y_pred)
print(report_text)
print(overall_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.75      0.60      0.67         5
           2       1.00      0.40      0.57         5
           3       0.75      0.60      0.67         5
           4       0.57      0.80      0.67         5
           5       0.71      1.00      0.83         5
           6       0.50      0.80      0.62         5

    accuracy                           0.68        34
   macro avg       0.76      0.67      0.67        34
weighted avg       0.75      0.68      0.67        34

0.6764705882352942


In [96]:
# First five held-out documents, for comparison with their labels and predictions.
x_test[:5]

249    The 28th Conference of the Parties to the Unit...
17     The gap between the rich and the poor is widen...
88     Talks between Iran and the United States on re...
23     Artificial Intelligence Is Helping to Diagnose...
98     A new study published in the journal JAMA Inte...
Name: News, dtype: object

In [97]:
# True labels of the first five held-out documents.
y_test[:5]

249    6
17     2
88     4
23     5
98     1
Name: Category_num, dtype: int64

In [98]:
# First five predicted labels, in the same order as the rows of x_test / y_test.
y_pred[:5]

array([4, 4, 4, 5, 5], dtype=int64)

In [106]:
# Re-check class balance by category name.
balanced_df.Category.value_counts()

Category
Entertainment              24
Food                       24
Economy                    24
Sports                     24
International relations    24
Health                     24
Politics                   24
Name: count, dtype: int64

In [107]:
# Same balance check on the numeric labels; each code should have the same count.
balanced_df.Category_num.value_counts()


Category_num
0    24
1    24
2    24
3    24
4    24
5    24
6    24
Name: count, dtype: int64

In [110]:
# Load the 'en_core_web_sm' spaCy pipeline; `preprocessed` below relies on it for
# tokenization, stop-word/punctuation flags and lemmas.
import spacy
nlp=spacy.load('en_core_web_sm')

In [124]:
def preprocessed(text):
    """Normalize a document for bag-of-words features.

    Runs ``text`` through the module-level spaCy pipeline ``nlp`` and keeps the
    lemma of every token that is not a stop word, punctuation or whitespace.

    Args:
        text: Raw document string.

    Returns:
        The kept lemmas joined by single spaces,
        e.g. ``'thor ate pizza'`` -> ``'thor eat pizza'``.
    """
    # Whitespace tokens (spaCy emits them for newlines and runs of spaces) are
    # neither stop words nor punctuation, so they must be dropped explicitly or
    # they leak into the joined output as stray blanks/newlines.
    return " ".join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_space)
    )
        
        

In [125]:
# Sanity check: 'ate' should come back as its lemma 'eat'.
preprocessed('thor ate pizza')

'thor eat pizza'

In [126]:
# Run every article through `preprocessed` and store the result next to the raw text.
raw_news = balanced_df['News']
balanced_df['preprocessed_text'] = raw_news.apply(preprocessed)

In [127]:
# Compare the raw News column with the new preprocessed_text column.
balanced_df.head()

Unnamed: 0,News,Category,Category_num,preprocessed_text
159,"Travis Kelce, star NFL player, sparks online b...",Entertainment,0,Travis Kelce star NFL player spark online ...
162,Teddi Mellencamp undergoes surgery to remove c...,Entertainment,0,Teddi Mellencamp undergo surgery remove cancer...
50,Actress Yami Gautam Dhar speaks out about the ...,Entertainment,0,Actress Yami Gautam Dhar speak importance supp...
59,Kannada cinema sees a surge in excitement as D...,Entertainment,0,Kannada cinema see surge excitement Darshan ac...
156,Aamir Khan takes an active role in preparation...,Entertainment,0,Aamir Khan take active role preparation daught...


In [128]:
from sklearn.model_selection import train_test_split

# Redo the split on the preprocessed text, using the same test_size,
# random_state and stratification as the split on raw News.
# NOTE: this rebinds x_train / x_test / y_train / y_test from the raw-text split.
category_labels = balanced_df["Category_num"]
x_train, x_test, y_train, y_test = train_test_split(
    balanced_df["preprocessed_text"],
    category_labels,
    test_size=0.2,
    random_state=42,
    stratify=category_labels,
)

In [141]:
# Fresh, unfitted copy of the same pipeline (unigram + bigram counts, then
# multinomial Naive Bayes) for the preprocessed-text experiment.
count_step = ('count', CountVectorizer(ngram_range=(1, 2)))
model_step = ('model', MultinomialNB())
clf = Pipeline(steps=[count_step, model_step])

In [142]:
# Fit the pipeline on the current training split (the preprocessed text, once the cell above has run).
clf.fit(x_train,y_train)

0,1,2
,steps,"[('count', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [143]:
# Predict labels for the current test split; this rebinds y_pred from any earlier run.
y_pred=clf.predict(x_test)


In [144]:
# Per-class metrics and overall accuracy for the current predictions.
report_text = classification_report(y_test, y_pred)
overall_accuracy = accuracy_score(y_test, y_pred)
print(report_text)
print(overall_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.75      0.60      0.67         5
           2       1.00      0.60      0.75         5
           3       1.00      0.80      0.89         5
           4       0.57      0.80      0.67         5
           5       0.62      1.00      0.77         5
           6       0.60      0.60      0.60         5

    accuracy                           0.74        34
   macro avg       0.79      0.74      0.74        34
weighted avg       0.79      0.74      0.74        34

0.7352941176470589
