In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from xgboost import XGBRFClassifier
from xgboost import XGBClassifier
from joblib import load, dump

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the BBC news training set from a CSV in the working directory and
# preview the first rows.
dataset = pd.read_csv("BBC News Train.csv")
dataset.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(1490, 3)

In [4]:
# Column dtypes, non-null counts and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [5]:
# Number of articles per category (class balance check).
dataset['Category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [6]:
# Collect the distinct category names (in order of first appearance) and print them.
target_category = dataset['Category'].unique()
print(target_category)

['business' 'tech' 'politics' 'sport' 'entertainment']


In [7]:
# Associate each Category name with an integer code and store it in the new
# column CategoryId; factorize() returns (codes, uniques), so [0] keeps the codes.
dataset['CategoryId'] = dataset['Category'].factorize()[0]
dataset.head()

Unnamed: 0,ArticleId,Text,Category,CategoryId
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0


In [8]:
# Lookup table "category": one row per distinct (Category, CategoryId) pair,
# ordered by CategoryId.
category = (
    dataset[['Category', 'CategoryId']]
    .drop_duplicates()
    .sort_values('CategoryId')
)
category

Unnamed: 0,Category,CategoryId
0,business,0
3,tech,1
5,politics,2
6,sport,3
7,entertainment,4


In [9]:
def remove_tags(text):
  """Strip HTML/XML-style tags (``<...>``) from ``text``.

  The previous pattern was the empty regex, which matched nothing useful and
  left the text untouched; this version actually removes tag markup.

  Args:
    text: Input string, possibly containing markup.

  Returns:
    str: ``text`` with every ``<...>`` span removed. Text without tags is
    returned unchanged.
  """
  # [^<>]+ keeps each match inside a single pair of angle brackets.
  return re.sub(r'<[^<>]+>', '', text)
# Run remove_tags over every value in the Text column, overwriting the column.
dataset['Text'] = dataset['Text'].apply(remove_tags)

In [10]:
def special_char(text):
  """Replace every non-alphanumeric character in ``text`` with a single space.

  Alphanumeric characters (per ``str.isalnum``) are kept as they are; the
  length of the string is preserved.

  Args:
    text: Input string.

  Returns:
    str: The cleaned string.
  """
  return ''.join(ch if ch.isalnum() else ' ' for ch in text)
# Run special_char over every value in the Text column, overwriting the column.
dataset['Text'] = dataset['Text'].apply(special_char)

In [11]:
def convert_lower(text):
   """Return ``text`` converted to lowercase via its ``lower()`` method."""
   lowered = text.lower()
   return lowered
# Run convert_lower over every value in the Text column, overwriting the column.
dataset['Text'] = dataset['Text'].apply(convert_lower)

In [12]:
def remove_stopwords(text):
  """Tokenize ``text`` and drop English stop words.

  Args:
    text: String to split into tokens with NLTK's ``word_tokenize``.

  Returns:
    list of str: The tokens, in their original order, that are not members
    of NLTK's English stop-word list (comparison is case-sensitive).
  """
  # The stop-word list is the same for every call, so build the set once and
  # cache it on the function instead of re-reading the corpus for every row.
  stop_words = getattr(remove_stopwords, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    remove_stopwords._stop_words = stop_words
  return [token for token in word_tokenize(text) if token not in stop_words]
# Run remove_stopwords over the Text column; each value becomes a list of tokens.
dataset['Text'] = dataset['Text'].apply(remove_stopwords)

In [13]:
def lemmatize_word(text):
  """Lemmatize every token in ``text`` and join the results with spaces.

  Args:
    text: Iterable of word tokens (despite the name, not a single string).

  Returns:
    str: The lemmatized tokens joined by single spaces.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = (lemmatizer.lemmatize(token) for token in text)
  return " ".join(lemmas)
# Run lemmatize_word over the Text column; each value becomes a space-joined string.
dataset['Text'] = dataset['Text'].apply(lemmatize_word)

In [14]:
# Display the frame as it stands after the preceding Text transformations.
dataset

Unnamed: 0,ArticleId,Text,Category,CategoryId
0,1833,worldcom ex bos launch defence lawyer defendin...,business,0
1,154,german business confidence slide german busine...,business,0
2,1101,bbc poll indicates economic gloom citizen majo...,business,0
3,1976,lifestyle governs mobile choice faster better ...,tech,1
4,917,enron boss 168m payout eighteen former enron d...,business,0
...,...,...,...,...
1485,857,double eviction big brother model caprice holb...,entertainment,4
1486,325,dj double act revamp chart show dj duo jk joel...,entertainment,4
1487,1590,weak dollar hit reuters revenue medium group r...,business,0
1488,1587,apple ipod family expands market apple expande...,tech,1


In [50]:
# Target vector: the integer CategoryId of every article.
y = np.array(dataset.CategoryId.values)

# Bag-of-words features: CountVectorizer limited to max_features=5000 terms,
# converted to a dense array.
# (The former `x = np.array(dataset.iloc[:,0].values)` was dead code: x was
# overwritten by the vectorizer output before being used, so it was removed.)
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so the vocabulary is also derived from rows that later end
# up in the test set — consider fitting on the training rows only.
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(dataset.Text).toarray()
print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

X.shape =  (1490, 5000)
y.shape =  (1490,)


In [51]:
# Shuffle and split features/labels, holding out 30% for testing; the fixed
# random_state keeps the split repeatable. Then print the size of each part.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, shuffle=True
)
print(len(x_train), len(x_test), sep="\n")

1043
447


In [17]:
# Logistic regression wrapped in one-vs-rest: fit on the training split and
# print test accuracy as a percentage rounded to two decimals.
lr = OneVsRestClassifier(LogisticRegression())
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

97.09


In [18]:
# Random forest (100 entropy-criterion trees, fixed seed) wrapped in
# one-vs-rest: fit on the training split and print test accuracy in percent.
rfc = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

97.99


In [19]:
# Multinomial naive Bayes (alpha=1.0, priors fitted from data) wrapped in
# one-vs-rest: fit on the training split and print test accuracy in percent.
mnb = OneVsRestClassifier(MultinomialNB(alpha=1.0, fit_prior=True))
mnb.fit(x_train, y_train)
y_pred = mnb.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

97.09


In [20]:
# Support vector classifier with default settings, wrapped in one-vs-rest:
# fit on the training split and print test accuracy in percent.
svc = OneVsRestClassifier(SVC())
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

96.64


In [21]:
# Decision tree wrapped in one-vs-rest: fit on the training split and print
# test accuracy in percent.
# random_state is pinned (as for the random forest in this notebook) so the
# reported accuracy is reproducible; without it the score can vary from run
# to run.
dtc = DecisionTreeClassifier(random_state=0)
dtc = OneVsRestClassifier(dtc)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(accuracy)

82.1


In [22]:
# k-nearest neighbours (k=10, Minkowski distance with p=4) wrapped in
# one-vs-rest: fit on the training split and print test accuracy in percent.
knn = OneVsRestClassifier(
    KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4)
)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

73.6


In [23]:
# Gaussian naive Bayes wrapped in one-vs-rest: fit on the training split and
# print test accuracy in percent.
gnb = OneVsRestClassifier(GaussianNB())
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

76.06


In [24]:
# XGBoost classifier with default settings, wrapped in one-vs-rest: fit on
# the training split and print test accuracy in percent.
xgbc = OneVsRestClassifier(XGBClassifier())
xgbc.fit(x_train, y_train)
y_pred = xgbc.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)



96.64


In [25]:
# XGBoost random-forest classifier with default settings, wrapped in
# one-vs-rest: fit on the training split and print test accuracy in percent.
xgbrfc = OneVsRestClassifier(XGBRFClassifier())
xgbrfc.fit(x_train, y_train)
y_pred = xgbrfc.predict(x_test)
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)



94.63


In [64]:
# Model kept for export: a one-vs-rest random forest with the same settings
# as `rfc`, fitted on the training split. The fit call stays the last
# expression so the fitted estimator is shown as the cell output.
classifier = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
)
classifier.fit(x_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(criterion='entropy',
                                                     random_state=0))

In [65]:
# Bundle the fitted classifier with the fitted CountVectorizer so inference
# can build features with the same vocabulary the classifier was trained on.
model = {
    'title':'News Category',
    'classifier':classifier,
    'vectorizer':cv
}
# NOTE(review): joblib's dump writes a pickle-based file; the '.h5' extension
# does not make this an HDF5 file and may mislead readers.
dump(model,'news.h5')

['news.h5']

In [66]:
def load_model():
    """Read ``news.h5`` with joblib's ``load`` and return the stored object.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    return load('news.h5')

In [67]:
def pred(inp):
    """Classify a piece of news text with the saved model.

    The text goes through the same cleaning steps that were applied to the
    training data (tag removal, special-character stripping, lowercasing,
    stop-word removal, lemmatization) before it is vectorized, so that the
    features match what the classifier was trained on.

    Args:
        inp: Raw news text as a string.

    Returns:
        str or None: "Business News", "Tech News", "Politics News",
        "Sports News" or "Entertainment News" for labels 0-4; None for any
        other label (same as before, where no branch matched).
    """
    # Load the bundle once; previously it was read from disk twice per call.
    model = load_model()

    cleaned = lemmatize_word(
        remove_stopwords(convert_lower(special_char(remove_tags(inp))))
    )
    features = model['vectorizer'].transform([cleaned])
    p = model['classifier'].predict(features)
    print(p)

    # CategoryId -> display name, matching the factorize() order in this notebook.
    labels = {
        0: "Business News",
        1: "Tech News",
        2: "Politics News",
        3: "Sports News",
        4: "Entertainment News",
    }
    # predict() returns an array with one label for the single input row.
    return labels.get(int(p[0]))

In [68]:
# Spot-check the saved model on a quote about a cricketer's career.
pred('Hour ago, I contemplated retirement for a lot of reasons. I felt like people were not sensitive enough to my injuries. I felt like a lot of people were backed, why not me? I have done no less. I have won a lot of games for the team, and I am not feeling backed, said Ashwin')

[3]


'Sports News'

In [69]:
# Spot-check on the opening words of one of the business training articles.
pred('german business confidence slides german business')

[0]


'Business News'