### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Reading Data

In [2]:
folder_path = 'bbcsport'

# Every sub-folder of `folder_path` is one sport; the folder name becomes the
# label of each article stored inside it.
data = []
labels = []
# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting
# keeps the row order -- and therefore the seeded train/test split further
# down -- identical across machines and runs.
for sport_folder in sorted(os.listdir(folder_path)):
    sport_folder_path = os.path.join(folder_path, sport_folder)
    if not os.path.isdir(sport_folder_path):
        continue  # plain files sitting next to the class folders are not articles
    for txt_file in sorted(os.listdir(sport_folder_path)):
        txt_file_path = os.path.join(sport_folder_path, txt_file)
        # Only regular .txt files are articles; this skips nested directories
        # and stray files (e.g. .DS_Store) that would fail to decode or
        # pollute the dataset.
        if not (os.path.isfile(txt_file_path) and txt_file.lower().endswith('.txt')):
            continue
        with open(txt_file_path, 'r', encoding='utf-8') as file:
            data.append(file.read())
        labels.append(sport_folder)

df = pd.DataFrame({'text': data, 'label': labels})

### Data Exploration

In [3]:
# Preview five randomly selected rows; no seed is passed, so the rows differ on every run.
df.sample(5)

Unnamed: 0,text,label
167,England require extra 'intensity'\n\nBatsman G...,cricket
571,Williams stays on despite dispute\n\nMatt Will...,rugby
44,Trial date is set for Balco case\n\nA US judge...,athletics
608,Scrum-half Williams rejoins Bath\n\nBath have ...,rugby
233,Duff ruled out of Barcelona clash\n\nChelsea's...,football


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(737, 2)

In [5]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737 entries, 0 to 736
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    737 non-null    object
 1   label   737 non-null    object
dtypes: object(2)
memory usage: 11.6+ KB


In [6]:
# Number of missing values per column.
df.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
# Count rows that exactly repeat an earlier row (same text and same label).
df.duplicated().sum()

10

In [8]:
# Keep the first occurrence of each duplicated row and drop the repeats.
# The index is not reset, so the surviving rows keep their original labels.
df = df.drop_duplicates(keep='first')

In [9]:
# Sanity check: no duplicated rows should remain after the drop.
df.duplicated().sum()

0

### Data Preprocessing

In [10]:
# Class balance: number of articles per sport label.
df['label'].value_counts()

label
football     262
rugby        146
cricket      121
athletics     99
tennis        99
Name: count, dtype: int64

In [11]:
label_map = {'football': 0, 'rugby': 1, 'cricket': 2, 'athletics': 3, 'tennis': 4}

# Series.map() turns every value that is not a key of `label_map` into NaN.
# That would happen silently for an unexpected class folder, and for the whole
# column if this cell were run a second time (the labels are integers by then).
# Skip the mapping when the column is already encoded and fail loudly on
# labels the map does not know.
if not df['label'].isin(list(label_map.values())).all():
    encoded_labels = df['label'].map(label_map)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Labels missing from label_map: {unknown}")
    df['label'] = encoded_labels

In [12]:
# Spot-check five random rows after the labels were replaced by integer codes.
df.sample(5)

Unnamed: 0,text,label
108,Bracewell worried by Lee\n\nNew Zealand coach ...,2
9,Collins to compete in Birmingham\n\nWorld and ...,3
473,Charlton 1-2 Liverpool\n\nFernando Morientes g...,0
202,ICC relaxes bowling regulations\n\nThe Interna...,2
701,Koubek suspended after drugs test\n\nStefan Ko...,4


In [13]:
import re

In [14]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [15]:
# The stop-word list is a separate NLTK data package. When it is not installed
# the lookup raises LookupError, so fetch it once and retry instead of failing
# on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk  # already a dependency of this notebook (nltk.corpus / nltk.stem)
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# A set gives constant-time membership tests in the per-token filter.
stemmer = SnowballStemmer('english')

In [16]:
def text_preprocessing(col):
    """Normalise a Series of raw texts.

    Each document goes through these steps, in this order:

    1. lower-casing;
    2. dropping whitespace-separated tokens that are stop words or consist
       of a single character;
    3. deleting every digit character;
    4. collapsing runs of whitespace and trimming both ends;
    5. stemming every remaining token.

    Parameters
    ----------
    col : pandas.Series
        One document per element. Missing values are treated as empty
        documents and non-string values are converted with ``str``.

    Returns
    -------
    pandas.Series
        The processed documents, aligned with the index of ``col``.

    Notes
    -----
    Uses the module-level ``stop_words`` and ``stemmer``. Tokens are split on
    whitespace only, so punctuation stays attached to words, and digits are
    removed after the length filter, so a token such as "1st" survives as
    "st".
    """

    def _clean(document):
        # Missing values have no string methods; treat them as empty text.
        document = '' if pd.isna(document) else str(document)

        # Lower case
        tokens = document.lower().split()

        # Removing stopwords and words with length=1
        text = ' '.join(tok for tok in tokens if tok not in stop_words and len(tok) > 1)

        # Removing numbers
        text = re.sub(r'[0-9]', '', text)

        # Removing extra spaces
        text = re.sub(r'\s+', ' ', text).strip()

        # Stemming
        return ' '.join(stemmer.stem(tok) for tok in text.split())

    # One pass per document instead of one pass over the Series per step.
    return col.apply(_clean)

In [17]:
# Replace the raw articles with their processed form. The column is
# overwritten in place, so running this cell again would process the
# already-processed text a second time.
df['text'] = text_preprocessing(df['text'])

In [18]:
# Spot-check five random rows of the processed text.
df.sample(5)

Unnamed: 0,text,label
307,thompson say gerrard stay liverpool legend phi...,0
296,prutton pois lengthi fa ban southampton david ...,0
61,brizzel run aaa sheffield ballymena sprinter p...,3
537,o'driscoll/gregan lead aid star ireland brian ...,1
104,pakistan reveng mission pakistan cricket depar...,2


### Data Splitting and Vectorization

In [19]:
# Features are the processed article texts, the target is the encoded sport label.
X, y = df['text'], df['label']

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the articles for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=31,
)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training texts only and reuse the fitted
# object for the test texts, so nothing from the test split influences the
# features. max_features=6000 caps the size of the vocabulary.
# NOTE: X_train / X_test are rebound from raw text to TF-IDF matrices here, so
# re-running this cell alone would feed the matrices back into the
# vectorizer; re-run the split cell first.
vectorizer = TfidfVectorizer(max_features=6000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [22]:
# (documents, features) of the train and test matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(581, 6000)
(146, 6000)


### Data Modeling

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
# Multinomial Naive Bayes with smoothing parameter alpha=0.1. fit() returns
# the estimator itself, so construction and training are chained.
mnb = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# Predictions for the held-out test matrix, scored in a later cell.
y_pred = mnb.predict(X_test)

In [25]:
# Score the model on the data it was fitted on, as a reference point for the
# held-out test metrics. The predictions are computed once and shared by all
# three metrics instead of re-running predict() for each of them.
y_train_pred = mnb.predict(X_train)
print('Train Accuracy', accuracy_score(y_train, y_train_pred))
print('Classification Report:\n', classification_report(y_train, y_train_pred))
print('Confusion Matrix:\n', confusion_matrix(y_train, y_train_pred))

Train Accuracy 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       209
           1       1.00      1.00      1.00       117
           2       1.00      1.00      1.00        97
           3       1.00      1.00      1.00        79
           4       1.00      1.00      1.00        79

    accuracy                           1.00       581
   macro avg       1.00      1.00      1.00       581
weighted avg       1.00      1.00      1.00       581

Confusion Matrix:
 [[209   0   0   0   0]
 [  0 117   0   0   0]
 [  0   0  97   0   0]
 [  0   0   0  79   0]
 [  0   0   0   0  79]]


In [26]:
# Score the held-out predictions: compute the three metrics first, then report them.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print('Test Accuracy', test_accuracy)
print('Classification Report:\n', test_report)
print('Confusion Matrix:\n', test_confusion)

Test Accuracy 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       1.00      1.00      1.00        29
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        20

    accuracy                           1.00       146
   macro avg       1.00      1.00      1.00       146
weighted avg       1.00      1.00      1.00       146

Confusion Matrix:
 [[53  0  0  0  0]
 [ 0 29  0  0  0]
 [ 0  0 24  0  0]
 [ 0  0  0 20  0]
 [ 0  0  0  0 20]]


### Saving Models

In [27]:
import joblib

# Persist both fitted objects: the vectorizer is needed at prediction time to
# turn new text into the same features the classifier was fitted on.
joblib.dump(vectorizer, 'sport_vectorizer.pkl')
joblib.dump(mnb, 'sport_model.pkl')

['sport_model.pkl']

### Prediction Function

In [28]:
def predict_sport_news(text, model=None, vectorizer=None):
    """Predict the sport category of a single news text.

    Parameters
    ----------
    text : str
        Raw news text; it is passed through ``text_preprocessing`` before
        being vectorised.
    model : object, optional
        Fitted classifier exposing ``predict``. When omitted it is loaded
        from 'sport_model.pkl'.
    vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted it is loaded
        from 'sport_vectorizer.pkl'.

    Returns
    -------
    str
        The key of the module-level ``label_map`` whose code equals the
        predicted class.

    Raises
    ------
    KeyError
        If the predicted class is not one of the codes in ``label_map``.

    Notes
    -----
    Passing ``model`` and ``vectorizer`` avoids re-reading both files from
    disk on every call, which matters when predicting many texts.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load artifacts that were written by a trusted source.
    if model is None:
        model = joblib.load('sport_model.pkl')
    if vectorizer is None:
        vectorizer = joblib.load('sport_vectorizer.pkl')

    # Preprocess the text
    text_series = text_preprocessing(pd.Series([text]))
    text_vectorized = vectorizer.transform(text_series)

    # Predict the class
    prediction = model.predict(text_vectorized)[0]

    # Decode the class
    label_map_reverse = {code: name for name, code in label_map.items()}
    return label_map_reverse[prediction]


In [29]:
# Quick end-to-end check of the prediction helper on a hand-written headline.
news = "Australia secures a dramatic win over England in the Ashes series opener."
predicted_sport = predict_sport_news(news)
print(f"The predicted sport for the given news is: {predicted_sport}")

The predicted sport for the given news is: cricket
