In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from flask import Flask, request, jsonify
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
data_file = pd.read_csv('BBC News Train.csv')
data_file

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [72]:
x = data_file['Text']
y = data_file['Category']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [73]:
clf = MultinomialNB()
clf.fit(x_train_vec, y_train)

In [74]:
y_pred = clf.predict(x_test_vec)
print(len(y_pred))

298


In [75]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.959731543624161


In [76]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

     business       0.95      0.97      0.96        75
entertainment       1.00      0.96      0.98        46
     politics       0.91      0.95      0.93        56
        sport       0.97      1.00      0.98        63
         tech       0.98      0.91      0.95        58

     accuracy                           0.96       298
    macro avg       0.96      0.96      0.96       298
 weighted avg       0.96      0.96      0.96       298



In [77]:
test_bcc = pd.read_csv("BBC News Test.csv")
test_x = test_bcc['Text']
print(type(test_x))
test_x_vec = vectorizer.transform(test_x)
test_y_pred = clf.predict(test_x_vec)
print(test_y_pred)

<class 'pandas.core.series.Series'>


['sport' 'tech' 'sport' 'business' 'sport' 'sport' 'politics' 'politics'
 'entertainment' 'business' 'business' 'tech' 'politics' 'tech'
 'entertainment' 'sport' 'politics' 'tech' 'entertainment' 'entertainment'
 'business' 'politics' 'sport' 'business' 'politics' 'sport' 'business'
 'sport' 'sport' 'business' 'politics' 'tech' 'business' 'business'
 'sport' 'sport' 'sport' 'business' 'entertainment' 'entertainment' 'tech'
 'politics' 'entertainment' 'tech' 'sport' 'tech' 'entertainment'
 'business' 'politics' 'business' 'politics' 'business' 'business'
 'business' 'tech' 'politics' 'tech' 'entertainment' 'sport' 'tech'
 'sport' 'entertainment' 'tech' 'politics' 'business' 'entertainment'
 'sport' 'tech' 'sport' 'sport' 'tech' 'sport' 'business' 'politics'
 'tech' 'sport' 'tech' 'tech' 'tech' 'entertainment' 'politics' 'sport'
 'entertainment' 'entertainment' 'business' 'entertainment' 'business'
 'entertainment' 'business' 'tech' 'business' 'politics' 'sport' 'tech'
 'sport' 'sport' '

In [78]:
joblib.dump(clf, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [79]:
app = Flask(__name__)

clfModel = joblib.load('model.pkl')
vector= joblib.load('vectorizer.pkl')
@app.route("/")
def predictCat():
    data = request.get_json()
    text=data['text']
    print(len(text))
    vec = vector.transform(text)
    pred = clfModel.predict(vec)
    res=list(pred)
    return jsonify(res)

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [19/May/2024 15:45:00] "GET / HTTP/1.1" 200 -


1
