In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
train = pd.read_csv("BBC News Train.csv")
test = pd.read_csv("BBC News Test.csv")

In [3]:
train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [5]:
train.isnull().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

In [6]:
train["Category"].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [7]:
train["Text"]

0       worldcom ex-boss launches defence lawyers defe...
1       german business confidence slides german busin...
2       bbc poll indicates economic gloom citizens in ...
3       lifestyle  governs mobile choice  faster  bett...
4       enron bosses in $168m payout eighteen former e...
                              ...                        
1485    double eviction from big brother model caprice...
1486    dj double act revamp chart show dj duo jk and ...
1487    weak dollar hits reuters revenues at media gro...
1488    apple ipod family expands market apple has exp...
1489    santy worm makes unwelcome visit thousands of ...
Name: Text, Length: 1490, dtype: object

In [8]:
from wordcloud import WordCloud, STOPWORDS

In [9]:
# plt.subplots(figsize=(10, 10))
# stopwords = set(STOPWORDS)
# text = " ".join(review for review in train["Text"])

# wc = WordCloud(stopwords=stopwords, max_font_size=70,colormap="Set2",
#               random_state=42, background_color='#151515', height=800, width=1000)
# wc.generate(text)
# plt.imshow(wc, interpolation="bilinear")
# plt.axis('off')
# plt.show()

In [10]:
train["Text"]

0       worldcom ex-boss launches defence lawyers defe...
1       german business confidence slides german busin...
2       bbc poll indicates economic gloom citizens in ...
3       lifestyle  governs mobile choice  faster  bett...
4       enron bosses in $168m payout eighteen former e...
                              ...                        
1485    double eviction from big brother model caprice...
1486    dj double act revamp chart show dj duo jk and ...
1487    weak dollar hits reuters revenues at media gro...
1488    apple ipod family expands market apple has exp...
1489    santy worm makes unwelcome visit thousands of ...
Name: Text, Length: 1490, dtype: object

In [11]:
test

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...
...,...,...
730,1923,eu to probe alitalia state aid the european ...
731,373,u2 to play at grammy awards show irish rock ba...
732,1704,sport betting rules in spotlight a group of mp...
733,206,alfa romeos to get gm engines fiat is to sto...


In [12]:
from nltk.corpus import stopwords

In [13]:
stop_words = set(stopwords.words("english"))

clean_train = train["Text"].apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

clean_test = test["Text"].apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

In [14]:
clean_test

0      qpr keeper day heads preston queens park range...
1      software watching work software monitor every ...
2      arcy injury adds ireland woe gordon arcy ruled...
3      india reliance family feud heats ongoing publi...
4      boro suffer morrison injury blow middlesbrough...
                             ...                        
730    eu probe alitalia state aid european commissio...
731    u2 play grammy awards show irish rock band u2 ...
732    sport betting rules spotlight group mps peers ...
733    alfa romeos get gm engines fiat stop making si...
734    citizenship event 18s touted citizenship cerem...
Name: Text, Length: 735, dtype: object

In [15]:
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

clean_train = clean_train.apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

clean_test = clean_test.apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

In [16]:
from sklearn.preprocessing import LabelEncoder
label_binarizer = LabelEncoder()
train["Category"] = label_binarizer.fit_transform(train["Category"])

In [17]:
train

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,0
1,154,german business confidence slides german busin...,0
2,1101,bbc poll indicates economic gloom citizens in ...,0
3,1976,lifestyle governs mobile choice faster bett...,4
4,917,enron bosses in $168m payout eighteen former e...,0
...,...,...,...
1485,857,double eviction from big brother model caprice...,1
1486,325,dj double act revamp chart show dj duo jk and ...,1
1487,1590,weak dollar hits reuters revenues at media gro...,0
1488,1587,apple ipod family expands market apple has exp...,4


In [18]:
train.drop("ArticleId", axis=1, inplace=True)
test.drop("ArticleId", axis=1, inplace=True)

In [19]:
train["Text"] = clean_train
test["Text"] = clean_test

In [20]:
from sklearn.model_selection import train_test_split

X = train.Text
y = train.Category

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
len(X_train)

1043

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [24]:
def acc_summary(pipeline, X_train, X_test, y_train, y_test):
    news_cat = pipeline.fit(X_train, y_train)
    y_pred = news_cat.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print("-"*30)
    
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    
    print("-"*30)
    
    return accuracy

In [25]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "Bernouli", "PassiveAggressiveClassifier",
     "Naive Bayes", "SVC"]

classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(),
    BernoulliNB(),
    PassiveAggressiveClassifier(max_iter=50),
    MultinomialNB(),
    SVC(kernel="rbf")
]
    
zipped_clf = zip(names, classifiers)
tvec = TfidfVectorizer()
    
def compare_clf(classifier=zipped_clf, vectorizer=tvec, n_features=10000, ngram_range=(1, 1)):
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n, c in classifier:
        checker_pipeline = Pipeline([
            ("vectorizer", vectorizer),
            ("classifier", c)
        ])
        clf_acc = acc_summary(checker_pipeline, X_train, X_test, y_train, y_test)
        print("Model result for {}".format(n))
        print(c)
        result.append((n, clf_acc))
    return result

In [26]:
trigram_result = compare_clf()

------------------------------
accuracy score: 94.85%
------------------------------
Model result for K Nearest Neighbors
KNeighborsClassifier()
------------------------------
accuracy score: 79.87%
------------------------------
Model result for Decision Tree
DecisionTreeClassifier(random_state=42)
------------------------------
accuracy score: 95.97%
------------------------------
Model result for Random Forest
RandomForestClassifier()
------------------------------
accuracy score: 96.64%
------------------------------
Model result for Logistic Regression
LogisticRegression()
------------------------------
accuracy score: 97.54%
------------------------------
Model result for Bernouli
BernoulliNB()
------------------------------
accuracy score: 97.76%
------------------------------
Model result for PassiveAggressiveClassifier
PassiveAggressiveClassifier(max_iter=50)
------------------------------
accuracy score: 97.09%
------------------------------
Model result for Naive Bayes
Multi

In [27]:
trigram_result

[('K Nearest Neighbors', 0.9485458612975392),
 ('Decision Tree', 0.7986577181208053),
 ('Random Forest', 0.959731543624161),
 ('Logistic Regression', 0.9664429530201343),
 ('Bernouli', 0.9753914988814317),
 ('PassiveAggressiveClassifier', 0.9776286353467561),
 ('Naive Bayes', 0.970917225950783),
 ('SVC', 0.9686800894854586)]

In [28]:
def prediction(pipeline, testtext):
    news_cat = pipeline.fit(X_train, y_train)
    y_pred = news_cat.predict(testtext)
    
    return y_pred

In [29]:
vectorizer = TfidfVectorizer()
checker_pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", PassiveAggressiveClassifier())
])
vectorizer.set_params(stop_words=None, max_features=1000, ngram_range=(1,4))
prediction = prediction(checker_pipeline, test["Text"])

In [30]:
prediction

array([3, 4, 3, 0, 3, 3, 2, 2, 1, 0, 0, 4, 2, 4, 1, 3, 2, 4, 1, 1, 0, 2,
       3, 0, 2, 3, 0, 3, 3, 0, 2, 4, 0, 0, 3, 3, 3, 0, 1, 0, 4, 2, 1, 4,
       3, 4, 1, 0, 2, 0, 2, 0, 0, 0, 4, 2, 4, 1, 3, 4, 3, 1, 4, 2, 0, 1,
       3, 4, 3, 3, 0, 3, 0, 2, 4, 3, 4, 4, 4, 1, 2, 3, 1, 1, 0, 1, 0, 1,
       0, 4, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 2, 3, 2, 1, 0, 3, 2, 3, 2, 1,
       3, 0, 1, 3, 2, 3, 2, 3, 2, 0, 1, 0, 1, 1, 4, 3, 0, 1, 0, 1, 0, 2,
       2, 4, 0, 0, 2, 4, 1, 3, 0, 4, 3, 1, 2, 3, 3, 1, 1, 4, 0, 4, 2, 4,
       3, 3, 3, 3, 1, 4, 0, 4, 0, 4, 0, 4, 1, 4, 4, 2, 0, 2, 0, 0, 1, 2,
       4, 0, 0, 4, 3, 2, 3, 2, 4, 1, 2, 0, 2, 1, 2, 0, 1, 3, 4, 4, 0, 4,
       2, 0, 3, 2, 0, 1, 0, 0, 3, 4, 2, 3, 1, 1, 3, 1, 3, 4, 2, 1, 3, 1,
       3, 1, 2, 0, 4, 1, 0, 2, 0, 4, 0, 3, 2, 2, 2, 2, 3, 0, 1, 2, 3, 2,
       0, 3, 4, 0, 2, 0, 2, 0, 0, 3, 4, 2, 1, 4, 1, 4, 3, 3, 4, 3, 3, 3,
       1, 3, 2, 4, 0, 3, 0, 3, 0, 3, 1, 0, 0, 1, 2, 0, 3, 3, 4, 3, 3, 1,
       0, 3, 4, 2, 1, 0, 0, 2, 3, 1, 2, 0, 3, 3, 4,

In [32]:
predictions = label_binarizer.inverse_transform(prediction)
predictions

array(['sport', 'tech', 'sport', 'business', 'sport', 'sport', 'politics',
       'politics', 'entertainment', 'business', 'business', 'tech',
       'politics', 'tech', 'entertainment', 'sport', 'politics', 'tech',
       'entertainment', 'entertainment', 'business', 'politics', 'sport',
       'business', 'politics', 'sport', 'business', 'sport', 'sport',
       'business', 'politics', 'tech', 'business', 'business', 'sport',
       'sport', 'sport', 'business', 'entertainment', 'business', 'tech',
       'politics', 'entertainment', 'tech', 'sport', 'tech',
       'entertainment', 'business', 'politics', 'business', 'politics',
       'business', 'business', 'business', 'tech', 'politics', 'tech',
       'entertainment', 'sport', 'tech', 'sport', 'entertainment', 'tech',
       'politics', 'business', 'entertainment', 'sport', 'tech', 'sport',
       'sport', 'business', 'sport', 'business', 'politics', 'tech',
       'sport', 'tech', 'tech', 'tech', 'entertainment', 'politics',
   

In [33]:
def prediction(pipeline, testtext):
    news_cat = pipeline.fit(X_train, y_train)
    y_pred = news_cat.predict(testtext)
    
    return y_pred

In [34]:
vectorizer = TfidfVectorizer()
checker_pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", PassiveAggressiveClassifier())
])
vectorizer.set_params(stop_words=None, max_features=1000, ngram_range=(1,4))
data = ["woolf murder sentence rethink plans to give"]
prediction = prediction(checker_pipeline, data)

In [35]:
prediction

array([2])

In [36]:
pr = label_binarizer.inverse_transform(prediction)
pr

array(['politics'], dtype=object)

In [43]:
data = "wooLf murder sentence rethink plans to give"
[data.lower()]

['woolf murder sentence rethink plans to give']

In [47]:
import joblib
from joblib import dump, load

In [48]:
joblib.dump(checker_pipeline, "model.joblib")

['model.joblib']

In [69]:
model = load("model.joblib")

In [72]:
a = model.predict(["As a teenager, Crawley spent time in India to learn how to play spin. At 18, he went to play for Wembley Districts in Western Australia, notching six half-centuries in 15 innings"])
label_binarizer.inverse_transform(a)[0]

'tech'

In [75]:
label_binarizer.inverse_transform(train["Category"])

array(['business', 'business', 'business', ..., 'business', 'tech',
       'tech'], dtype=object)

In [77]:
train["Category"].unique()

array([0, 4, 2, 3, 1])

In [78]:
label_binarizer.inverse_transform(train["Category"].unique())

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)