In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
import pandas as pd
df = pd.read_json('news_dataset.json')

In [5]:
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [13]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

## Random Over Sampling , SMOTE

In [14]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens= []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return" ".join(filtered_tokens)
        

In [15]:
preprocess("Fake news refers to misinformation or disinformation in the country which is spread through word of mouth and more recently through digital communication such as What's app messages, social media posts, etc.")

'fake news refer misinformation disinformation country spread word mouth recently digital communication app message social medium post etc'

In [22]:
categories = df['category'].unique()
categories

array(['SCIENCE', 'BUSINESS', 'CRIME', 'SPORTS'], dtype=object)

In [24]:
min_samples = 1381
sampled_dfs=[]
for category in categories:
    sampled_dfs.append(df[df.category == category].sample(min_samples, random_state=2022))

# Concatenate all sampled DataFrames
df_balanced = pd.concat(sampled_dfs)

In [26]:
df_balanced.category.value_counts()

category
SCIENCE     1381
BUSINESS    1381
CRIME       1381
SPORTS      1381
Name: count, dtype: int64

In [54]:
df_balanced["text_new"] = df.text.apply(preprocess)

In [55]:
target = {'SCIENCE': 0, 'BUSINESS': 1, 'CRIME':2,'SPORTS':3}
df_balanced['category_num'] = df_balanced['category'].map(target)
df_balanced.head()

Unnamed: 0,text,category,category_num,text_new
7210,It's Time We Take A Look At How Female Astrono...,SCIENCE,0,Time look female astronomer treat Hidden figur...
12292,Scientists Reveal The Secret Key To Charisma I...,SCIENCE,0,scientist reveal secret Key Charisma think foo...
6249,Watch One Of The World's Largest Lakes Shrink ...,SCIENCE,0,watch World Largest Lakes Shrink eye go go nea...
379,'Falling Fruit' Map Helps Foragers Find Their ...,SCIENCE,0,fall fruit Map help forager find Free Meal say...
12505,When Science Fiction Gets Real Many science fi...,SCIENCE,0,Science Fiction get real science fiction write...


In [43]:
df_balanced.category_num.value_counts()

category_num
0    1381
1    1381
2    1381
3    1381
Name: count, dtype: int64

In [56]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(
    df_balanced.text_new,
    df_balanced.category_num,
    test_size=0.2, 
    random_state=2022,
    stratify=df_balanced.category_num   
)

In [58]:
print(X_train.shape)
X_train.head()

(4419,)


4769       Bully Poker Pro 27.8 million Chips Pot abusive
5724    Long National Deflategate Nightmare Vengeance ...
1923    man care Diversity Boardroom boy boy club ok s...
1733    Beloved Bipedal Bear name Pedals believe kill ...
2526      Elizabeth Smart give Birth Baby Girl Father say
Name: text_new, dtype: object

In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer


clf= Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),        #using the ngram_range parameter 
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       276
           1       0.84      0.86      0.85       277
           2       0.85      0.92      0.88       276
           3       0.92      0.88      0.90       276

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105



In [64]:
clf= Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range = (1, 2))),        #using the 2 gram_range parameter 
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86       276
           1       0.84      0.88      0.86       277
           2       0.86      0.92      0.89       276
           3       0.93      0.89      0.91       276

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105



In [65]:
clf= Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),        #using the 3 gram_range parameter 
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.82      0.86       276
           1       0.84      0.89      0.86       277
           2       0.86      0.92      0.89       276
           3       0.93      0.89      0.91       276

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105



## Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range =(1,3))),
    ('Random_Forest',RandomForestClassifier())
])

In [80]:
clf_rf.fit(X_train,y_train)
y_pred = clf_rf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.79      0.78       276
           1       0.69      0.83      0.76       277
           2       0.83      0.86      0.84       276
           3       0.93      0.68      0.79       276

    accuracy                           0.79      1105
   macro avg       0.80      0.79      0.79      1105
weighted avg       0.80      0.79      0.79      1105



## over sampling

In [93]:

business_count = df[df['category'] == 'BUSINESS'].shape[0]

print(science_count)

0


## GitHub , MlFlow

In [92]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [98]:
# Get the count of the BUSINESS category
business_count = df[df['category'] == 'BUSINESS'].shape[0]

# Oversample the SCIENCE category to match the BUSINESS category count
df_science_over_sm = df[df['category'] == 'SCIENCE'].sample(business_count, replace=True)

# Print the resulting oversampled dataframe
print(df_science_over_sm.shape)


(4254, 3)


In [None]:
df_science_over_sm