In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the BBC News training set and preview the first rows.
# The path is a raw string so the Windows backslashes stay literal: "\I", "\D"
# and "\B" are invalid escape sequences in a normal string literal and trigger
# a warning on recent Python versions (and are slated to become errors).
df=pd.read_csv(r"E:\I Neuron Project\Dataset\BBC News Train.csv")
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [3]:
# Dimensions of the loaded training frame as (rows, columns).
df.shape

(1490, 3)

In [4]:
# Distinct ArticleId values present in the training frame.
df["ArticleId"].unique()

array([1833,  154, 1101, ..., 1590, 1587,  538], dtype=int64)

In [5]:
# Remove ArticleId; the cells below only use the Text and Category columns of df.
# Rebinding df (instead of inplace=True) together with errors="ignore" makes
# this cell safe to re-run: the in-place version raised KeyError the second
# time because the column was already gone.
df=df.drop(columns="ArticleId",errors="ignore")

In [6]:
# Preview the frame after ArticleId has been dropped.
df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


In [7]:
# Column dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      1490 non-null   object
 1   Category  1490 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB


# Text Cleaning

In [8]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# NOTE(review): stopwords.words('english') is called in the cleaning loops and
# presumably needs the NLTK "stopwords" corpus to be downloaded beforehand — confirm.
# A single stemmer instance is reused by both text-cleaning loops (train and test data).
ps=PorterStemmer()

In [9]:
# Build the cleaned training corpus, one string per article:
#   1. replace every non-letter character with a space,
#   2. lowercase and split on whitespace,
#   3. drop English stop words and stem the remaining tokens,
#   4. re-join the stems with single spaces.
# The stop-word set is built once up front; the original rebuilt
# set(stopwords.words('english')) for every single word, which dominated the
# runtime of this cell without changing the result.
stop_words=set(stopwords.words('english'))
corpus=[]
for text in df['Text']:
    rp=re.sub('[^a-zA-Z]'," ",text)
    rp=rp.lower()
    rp=rp.split()
    rp=[ps.stem(word) for word in rp if word not in stop_words]
    rp=" ".join(rp)
    corpus.append(rp)

# Vectorization

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the cleaned corpus and
# convert the resulting sparse count matrix to a dense array.
cv=CountVectorizer()
bow_counts=cv.fit_transform(corpus)
X=bow_counts.toarray()

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features for the same corpus, converted to a dense array and displayed.
tf=TfidfVectorizer()
tfidf_weights=tf.fit_transform(corpus)
X1=tfidf_weights.toarray()
X1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Distinct category labels in the training data.
df['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [12]:
# Encode each category name as an integer class label (0-4, alphabetical order).
category_codes={
    'business':0,
    'entertainment':1,
    'politics':2,
    'sport':3,
    'tech':4,
}
y=df['Category'].map(category_codes)
y

0       0
1       0
2       0
3       4
4       0
       ..
1485    1
1486    1
1487    0
1488    4
1489    4
Name: Category, Length: 1490, dtype: int64

In [13]:
# Display the dense bag-of-words count matrix.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Train Test Split

In [16]:
from sklearn.model_selection import train_test_split
# 70/30 hold-out split of the bag-of-words features with a fixed seed.
bow_split=train_test_split(X,y,test_size=0.3,random_state=1000)
X_train,X_test,y_train,y_test=bow_split

In [17]:
# Same split settings applied to the TF-IDF features.
# y_train / y_test are rebound here using the same y, test_size and random_state
# as the bag-of-words split.
X1_train,X1_test,y_train,y_test=train_test_split(
    X1,
    y,
    test_size=0.3,
    random_state=1000,
)

# Modeling

### Naive Bayes Classifier with default parameters

In [18]:
from sklearn.naive_bayes import MultinomialNB
# Baseline: multinomial Naive Bayes with default settings on the bag-of-words counts.
# fit() returns the estimator itself, so the trailing expression shows the same repr.
model=MultinomialNB().fit(X_train,y_train)
model

In [20]:
# Same default Naive Bayes model, trained on the TF-IDF features.
model_tf=MultinomialNB().fit(X1_train,y_train)
model_tf

In [None]:
from sklearn.model_selection import GridSearchCV

## Hyperparameter tuning: 5-fold cross-validated grid search over alpha
par={'alpha':[0.1,0.3,0.5,0.7,0.9,1]}
est=MultinomialNB()

esc=GridSearchCV(estimator=est,param_grid=par,cv=5)
esc.fit(X_train,y_train)

# Best alpha found by the search.
esc.best_params_

In [21]:
# Naive Bayes on bag-of-words with alpha=0.3.
# NOTE(review): presumably the value picked by the grid search; that cell has
# no recorded output here, so confirm against esc.best_params_.
model1=MultinomialNB(alpha=0.3).fit(X_train,y_train)
model1

In [22]:
# Predictions of the alpha=0.3 Naive Bayes model on the train and test splits
ypred_train=model1.predict(X_train)
ypred_test=model1.predict(X_test)

In [23]:
# Predictions of the TF-IDF Naive Bayes model on the train and test splits
ypred_train_tf=model_tf.predict(X1_train)
ypred_test_tf=model_tf.predict(X1_test)

In [24]:
# Predict over the full feature matrix X, which includes the rows model1 was trained on.
ypred=model1.predict(X)

In [31]:
# k-nearest-neighbours classifier (default settings) on the bag-of-words features.
# accuracy_score is imported here because this is the first cell, in notebook
# order, that uses it; previously the only import lived in the later
# "Evaluation" section, so a fresh Restart & Run All failed with NameError here.
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
model2=KNeighborsClassifier()
model2.fit(X_train,y_train)
train_pred=model2.predict(X_train) #train prediction
test_pred=model2.predict(X_test) #test prediction
print(accuracy_score(y_train,train_pred))
print(accuracy_score(y_test,test_pred))

0.7823585810162992
0.6219239373601789


In [32]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (default settings) on the bag-of-words features.
model3=LogisticRegression()
model3.fit(X_train,y_train)
test_pred1=model3.predict(X_test)
train_pred1=model3.predict(X_train) #train prediction
# Train accuracy first, then test accuracy.
for y_true,y_hat in ((y_train,train_pred1),(y_test,test_pred1)):
    print(accuracy_score(y_true,y_hat))

1.0
0.970917225950783


In [33]:
#Decision tree Classifier

from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree (and the accuracies printed below)
# are reproducible across runs, consistent with the seeded random-forest and
# gradient-boosting cells; the original left it unset.
model4=DecisionTreeClassifier(random_state=10)

model4.fit(X_train,y_train)

train_pred4=model4.predict(X_train)
test_pred4=model4.predict(X_test)

# Train accuracy, then test accuracy.
print(accuracy_score(y_train,train_pred4))
print(accuracy_score(y_test,test_pred4))


1.0
0.8076062639821029


In [35]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Seeded forest fitted on the bag-of-words training split.
model5=RandomForestClassifier(random_state=10)
model5.fit(X_train,y_train)

test_pred5=model5.predict(X_test)
train_pred5=model5.predict(X_train)

# Accuracy via the estimator's own score(); kept alongside accuracy_score below.
train_score5=model5.score(X_train,y_train)
test_score5=model5.score(X_test,y_test)

# Train accuracy, then test accuracy.
for y_true,y_hat in ((y_train,train_pred5),(y_test,test_pred5)):
    print(accuracy_score(y_true,y_hat))


1.0
0.9530201342281879


In [36]:
# Adaboost

from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed so repeated runs give the same ensemble and scores,
# consistent with the seeded random-forest and gradient-boosting cells;
# the original left it unset.
model6=AdaBoostClassifier(random_state=10)

model6.fit(X_train,y_train)

train_pred6=model6.predict(X_train)
test_pred6=model6.predict(X_test)

# Accuracy via the estimator's own score(); kept alongside accuracy_score below.
train_score6=model6.score(X_train,y_train)
test_score6=model6.score(X_test,y_test)

# Train accuracy, then test accuracy.
print(accuracy_score(y_train,train_pred6))
print(accuracy_score(y_test,test_pred6))

0.8542665388302972
0.785234899328859


In [37]:
#Gradient Boost
from sklearn.ensemble import GradientBoostingClassifier

# Seeded gradient-boosting classifier fitted on the bag-of-words training split.
model7=GradientBoostingClassifier(random_state=10)
model7.fit(X_train,y_train)

test_pred7=model7.predict(X_test)
train_pred7=model7.predict(X_train)

# Train accuracy, then test accuracy.
for y_true,y_hat in ((y_train,train_pred7),(y_test,test_pred7)):
    print(accuracy_score(y_true,y_hat))

1.0
0.9507829977628636


In [39]:
# XGBoost

from xgboost import XGBClassifier

# XGBoost classifier with default settings on the bag-of-words training split.
model8=XGBClassifier()
model8.fit(X_train,y_train)

test_pred8=model8.predict(X_test)
train_pred8=model8.predict(X_train)

# Train accuracy, then test accuracy.
for y_true,y_hat in ((y_train,train_pred8),(y_test,test_pred8)):
    print(accuracy_score(y_true,y_hat))

1.0
0.9507829977628636


# Evaluation

In [25]:
#Naive Bayes with BOW

from sklearn.metrics import accuracy_score
# Accuracy of the alpha=0.3 Naive Bayes model on each split.
train_accuracy=accuracy_score(y_train,ypred_train)
test_accuracy=accuracy_score(y_test,ypred_test)
for caption,score in (("Train accuracy : ",train_accuracy),("Test accuracy : ",test_accuracy)):
    print(caption,score)

Train accuracy :  0.9961649089165868
Test accuracy :  0.9664429530201343


In [26]:
#Naive Bayes with Tf-idf

# Accuracy of the TF-IDF Naive Bayes model on each split.
train_accuracy1=accuracy_score(y_train,ypred_train_tf)
test_accuracy1=accuracy_score(y_test,ypred_test_tf)
for caption,score in (("Train accuracy : ",train_accuracy1),("Test accuracy : ",test_accuracy1)):
    print(caption,score)

Train accuracy :  0.9904122722914669
Test accuracy :  0.9686800894854586


In [41]:
# Logistic-regression predictions over the full feature matrix X (train and test rows together).
ypred_log=model3.predict(X)

In [43]:
# Accuracy over all rows of X. NOTE: X contains the rows model3 was fitted on
# (X_train is a split of X), so this is not a held-out estimate.
accuracy1=accuracy_score(y,ypred_log)
accuracy1

0.991275167785235

In [27]:
# Predicted class codes from model1 for the test split.
ypred_test

array([3, 3, 1, 2, 4, 4, 1, 2, 3, 2, 2, 2, 0, 0, 4, 2, 0, 1, 3, 4, 1, 2,
       4, 1, 2, 2, 1, 1, 2, 1, 0, 4, 0, 1, 0, 3, 1, 1, 0, 0, 2, 3, 1, 4,
       0, 0, 3, 3, 0, 0, 0, 0, 4, 3, 4, 2, 0, 1, 4, 0, 0, 3, 1, 3, 0, 1,
       4, 4, 1, 4, 4, 1, 2, 0, 1, 3, 0, 2, 1, 4, 4, 2, 1, 1, 2, 4, 2, 3,
       1, 3, 2, 1, 4, 4, 0, 4, 3, 4, 2, 2, 3, 4, 2, 2, 3, 1, 0, 2, 4, 0,
       3, 1, 1, 0, 1, 2, 0, 2, 4, 1, 3, 1, 4, 0, 0, 2, 4, 0, 0, 1, 1, 2,
       1, 2, 0, 0, 4, 1, 4, 4, 1, 4, 0, 2, 4, 1, 4, 4, 3, 4, 0, 3, 3, 0,
       3, 1, 4, 4, 0, 2, 3, 4, 3, 3, 2, 0, 2, 3, 3, 1, 3, 4, 1, 4, 0, 0,
       3, 4, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 3, 3, 2, 0, 2, 1,
       2, 3, 1, 1, 1, 2, 2, 4, 0, 3, 4, 1, 4, 3, 1, 3, 1, 3, 0, 2, 2, 4,
       4, 2, 3, 3, 4, 0, 2, 4, 3, 3, 3, 3, 0, 4, 1, 2, 0, 4, 3, 3, 3, 2,
       4, 2, 1, 0, 2, 3, 4, 2, 2, 4, 1, 2, 0, 2, 1, 4, 4, 2, 0, 1, 3, 3,
       1, 2, 0, 2, 0, 4, 1, 2, 1, 0, 0, 0, 3, 4, 3, 1, 4, 2, 3, 2, 3, 0,
       3, 2, 3, 2, 1, 4, 0, 3, 4, 3, 3, 0, 1, 0, 1,

In [28]:
# Predicted class codes from model1 for every row of X.
ypred

array([0, 0, 0, ..., 0, 4, 4], dtype=int64)

In [29]:
# Accuracy of model1 over all rows of X. NOTE: X contains the rows model1 was
# fitted on (X_train is a split of X), so this is not a held-out estimate.
accuracy=accuracy_score(y,ypred)
accuracy

0.987248322147651

### Best Accuracy is with Logistic Regression

In [44]:
import pickle

In [45]:
# Persist the fitted logistic-regression model to disk.
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes; the original passed a bare open() and never closed the handle.
# NOTE(review): only the classifier is saved. New text must still be transformed
# with the fitted CountVectorizer `cv` before calling predict, and `cv` is not
# persisted by this cell.
with open("NewsArtical.pkl","wb") as model_file:
    pickle.dump(model3,model_file)

In [46]:
# Load the BBC News test set (no Category column) and preview the first rows.
# The path is a raw string so the Windows backslashes stay literal: "\I", "\D"
# and "\B" are invalid escape sequences in a normal string literal and trigger
# a warning on recent Python versions (and are slated to become errors).
df1=pd.read_csv(r"E:\I Neuron Project\Dataset\BBC News Test.csv")
df1.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [47]:
# Dimensions of the test frame as (rows, columns).
df1.shape

(735, 2)

In [48]:
# Clean the test articles with the same steps used for the training corpus:
# keep letters only, lowercase, split, drop English stop words, stem, re-join.
# The stop-word set is built once up front; the original rebuilt
# set(stopwords.words('english')) for every single word, which dominated the
# runtime of this cell without changing the result.
stop_words=set(stopwords.words('english'))
corpus1=[]
for text in df1['Text']:
    rp=re.sub('[^a-zA-Z]'," ",text)
    rp=rp.lower()
    rp=rp.split()
    rp=[ps.stem(word) for word in rp if word not in stop_words]
    rp=" ".join(rp)
    corpus1.append(rp)

In [49]:
# Encode the test corpus with the vocabulary already learned by cv
# (transform only, no refit), then convert to a dense array.
test_counts=cv.transform(corpus1)
X2=test_counts.toarray()

In [50]:
# Reload the saved model. The context manager closes the file handle once the
# object has been read; the original left it open.
# NOTE: pickle.load executes arbitrary code from the file, so only load pickle
# files that were produced by a trusted source (here, the cell that wrote it).
with open("NewsArtical.pkl","rb") as model_file:
    model_test=pickle.load(model_file)

In [51]:
# Predict integer class codes for the test articles with the reloaded model.
predict=model_test.predict(X2)
predict

array([3, 4, 3, 0, 3, 3, 2, 2, 1, 0, 0, 4, 2, 4, 1, 3, 2, 4, 1, 1, 0, 2,
       3, 0, 2, 3, 0, 3, 3, 0, 2, 4, 0, 0, 3, 3, 3, 0, 1, 1, 4, 2, 1, 4,
       3, 4, 1, 0, 2, 0, 2, 0, 0, 0, 4, 2, 4, 1, 3, 4, 3, 1, 4, 2, 1, 1,
       3, 4, 3, 3, 4, 3, 0, 2, 4, 3, 4, 4, 4, 1, 2, 3, 1, 1, 0, 1, 0, 1,
       0, 4, 0, 2, 3, 4, 3, 3, 3, 3, 3, 3, 2, 3, 2, 1, 0, 3, 2, 3, 2, 1,
       3, 0, 1, 3, 2, 3, 2, 3, 2, 0, 1, 0, 1, 1, 4, 3, 0, 1, 0, 1, 0, 2,
       2, 4, 0, 0, 2, 4, 1, 3, 0, 4, 3, 1, 2, 3, 3, 1, 1, 4, 0, 4, 2, 4,
       3, 3, 3, 3, 1, 4, 0, 4, 0, 4, 0, 4, 1, 4, 4, 2, 0, 2, 0, 0, 1, 2,
       4, 0, 0, 4, 3, 2, 3, 0, 4, 4, 2, 0, 2, 1, 2, 0, 1, 3, 4, 4, 0, 4,
       2, 0, 3, 2, 0, 1, 0, 0, 3, 4, 0, 3, 1, 1, 3, 1, 3, 4, 2, 1, 3, 1,
       3, 1, 2, 0, 4, 1, 0, 2, 0, 4, 0, 3, 2, 2, 0, 2, 3, 0, 1, 2, 3, 2,
       0, 3, 4, 0, 2, 0, 2, 0, 0, 3, 4, 2, 1, 4, 1, 4, 3, 3, 4, 3, 3, 3,
       1, 3, 2, 4, 0, 3, 0, 3, 0, 3, 1, 0, 0, 1, 2, 0, 3, 3, 4, 3, 3, 1,
       0, 3, 4, 2, 1, 0, 0, 2, 3, 1, 2, 0, 3, 3, 4,

In [52]:
# Shape of the prediction array.
predict.shape

(735,)

In [53]:
# Assemble the result table: article id, original text and predicted class code.
submission_columns={
    "ArticleId":df1["ArticleId"],
    "text":df1["Text"],
    "Final_prediction":predict,
}
sub=pd.DataFrame(submission_columns)
sub

Unnamed: 0,ArticleId,text,Final_prediction
0,1018,qpr keeper day heads for preston queens park r...,3
1,1319,software watching while you work software that...,4
2,1138,d arcy injury adds to ireland woe gordon d arc...,3
3,459,india s reliance family feud heats up the ongo...,0
4,1020,boro suffer morrison injury blow middlesbrough...,3
...,...,...,...
730,1923,eu to probe alitalia state aid the european ...,0
731,373,u2 to play at grammy awards show irish rock ba...,1
732,1704,sport betting rules in spotlight a group of mp...,0
733,206,alfa romeos to get gm engines fiat is to sto...,0


In [None]:
# Replace the integer class codes with their category names (the inverse of the
# encoding used for y).
# The names are derived from the raw `predict` array rather than from the
# column itself, so re-running this cell gives the same result; the original
# mapped the column onto itself, turning every value into NaN on a second run
# because the already-decoded strings are not keys of the mapping.
label_names={0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'}
sub['Final_prediction']=pd.Series(predict,index=sub.index).map(label_names)
sub