In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.pipeline import make_pipeline 
from sklearn.ensemble import RandomForestClassifier
from string import punctuation
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))
# punctuation = punctuation + '\n' + '—' + '“' + ',' + '”' + '‘' + '-' + '’'
import re
import joblib


In [11]:
df = pd.read_csv('./bbc-news-data.csv',sep='\t')
df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [12]:
X,y = df.loc[:,'content'], df.loc[:,'category']
X.shape == y.shape

True

In [14]:
stop_words = set(
    [
        "i",
        "me",
        "my",
        "myself",
        "we",
        "our",
        "ours",
        "ourselves",
        "you",
        "your",
        "yours",
        "yourself",
        "yourselves",
        "he",
        "him",
        "his",
        "himself",
        "she",
        "her",
        "hers",
        "herself",
        "it",
        "its",
        "itself",
        "they",
        "them",
        "their",
        "theirs",
        "themselves",
        "what",
        "which",
        "who",
        "whom",
        "this",
        "that",
        "these",
        "those",
        "am",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "having",
        "do",
        "does",
        "did",
        "doing",
        "a",
        "an",
        "the",
        "and",
        "but",
        "if",
        "or",
        "because",
        "as",
        "until",
        "while",
        "of",
        "at",
        "by",
        "for",
        "with",
        "about",
        "against",
        "between",
        "into",
        "through",
        "during",
        "before",
        "after",
        "above",
        "below",
        "to",
        "from",
        "up",
        "down",
        "in",
        "out",
        "on",
        "off",
        "over",
        "under",
        "again",
        "further",
        "then",
        "once",
        "here",
        "there",
        "when",
        "where",
        "why",
        "how",
        "all",
        "any",
        "both",
        "each",
        "few",
        "more",
        "most",
        "other",
        "some",
        "such",
        "no",
        "nor",
        "not",
        "only",
        "own",
        "same",
        "so",
        "than",
        "too",
        "very",
        "s",
        "t",
        "can",
        "will",
        "just",
        "don",
        "should",
        "now",
    ]
)


In [15]:
def preprocessing(text):
    # removing non alphanumeric characters
    text = re.sub(r"[^A-Z a-z 0-9]", "", text)

    # removing punctuations
    text = "".join(char for char in text if char not in punctuation)

    # removing trailing whitespaces again
    text = re.sub(r" +", " ", text)

    # removing the stopword
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text.lower()

In [16]:
X = X.apply(lambda x:preprocessing(x))

In [17]:
X.shape

(2225,)

In [18]:
pipe = make_pipeline(TfidfVectorizer(),LogisticRegression())
pipe

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])

In [19]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(y)
X_train , X_val, y_train, y_val = train_test_split(X,labels,test_size=0.2,stratify=labels,random_state=42)

In [20]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])

In [21]:
y_pred = pipe.predict(X_val)
f1_score(y_val,y_pred,average='weighted')

0.9887325858141044

# Retrain the whole model and save it in a file

In [23]:
pipe.fit(X,y)
filename = './category_predictor.sav'
joblib.dump(pipe, filename)

['./category_predictor.sav']