# BBC News article classification:

In [10]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

In [11]:
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
# Short aliases for the Keras sub-modules used when building the model below.
layers = keras.layers
models = keras.models

# Collect the data

In [12]:
# Path to the BBC text CSV (later cells read its 'category' and 'text' columns).
# Set the BBC_TEXT_CSV environment variable to point at your own copy; the
# original author's location is kept as the fallback so existing setups still work.
DATA_PATH = os.environ.get("BBC_TEXT_CSV", "C:/Users/ANIL DANU/Documents/bbc-text.csv")
data = pd.read_csv(DATA_PATH)

In [24]:
# Preview the first five rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [25]:
# Count the articles per category to see how balanced the classes are.
data['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

# Preprocessing of data:

In [26]:
# 80/20 split: train_size is the number of rows used for training,
# the remaining rows are held out for testing.
n_rows = len(data)
train_size = int(n_rows * .8)
print('Size of the training data :', train_size)
print('size of the test data :', n_rows - train_size)

Size of the training data : 1780
size of the test data : 445


In [16]:
def train_test_split(data, train_size):
    """Split ``data`` by position into a leading train part and a trailing test part.

    The input is sliced as-is; nothing is shuffled or copied explicitly.

    Args:
        data: Any object that supports slicing (list, str, pandas Series, ...).
        train_size: Number of leading items that go into the training part.

    Returns:
        A ``(train, test)`` tuple equal to ``(data[:train_size], data[train_size:])``.
    """
    return data[:train_size], data[train_size:]

In [27]:
# Apply the same positional split to the labels and to the article text so
# that row i of the text still lines up with row i of the categories.
(train_cat, test_cat), (train_text, test_text) = (
    train_test_split(data[column], train_size)
    for column in ('category', 'text')
)

In [29]:
# Vocabulary cap. With num_words=max_words the tokenizer keeps only the
# most frequent (max_words - 1) words when vectorizing text.
max_words = 1000

# Word-level tokenizer (char_level=False; True would treat every character as a token).
tokenize = keras.preprocessing.text.Tokenizer(
    num_words=max_words,
    char_level=False,
)

# Build the vocabulary from the training text only, so the test set stays unseen.
tokenize.fit_on_texts(train_text)
word_index = tokenize.word_index  # word -> integer index mapping learned above

In [31]:
# Vectorize each article into one row of per-word counts.
# Other supported modes are 'binary', 'freq' and 'tfidf'.
x_train, x_test = (
    tokenize.texts_to_matrix(texts, mode='count')
    for texts in (train_text, test_text)
)
print(x_train.shape)

(1780, 1000)


In [33]:
# Map each category name to an integer id in [0, n_classes - 1].
# The encoder is fitted on the training labels only.
encoder = LabelEncoder().fit(train_cat)
print(list(encoder.classes_))

# Integer-encode both label sets with the fitted encoder.
y_train = encoder.transform(train_cat)
y_test = encoder.transform(test_cat)
print(y_train)

['business', 'entertainment', 'politics', 'sport', 'tech']
[4 0 3 ... 3 0 1]


In [34]:
# One output class per category known to the fitted encoder (a plain Python int).
num_classes = len(encoder.classes_)

# Build the one-hot targets from the string labels, not from the current
# y_train / y_test. That keeps this cell idempotent: deriving them from y_train
# itself would, on a second run, re-encode data that is already one-hot and
# silently produce wrong targets and a wrong num_classes.
y_train = keras.utils.to_categorical(encoder.transform(train_cat), num_classes)
y_test = keras.utils.to_categorical(encoder.transform(test_cat), num_classes)
print(y_train)

[[0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]]


In [35]:
# Sanity check: features and targets must agree on the number of rows.
for label, array in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(label, array.shape)

x_train shape: (1780, 1000)
x_test shape: (445, 1000)
y_train shape: (1780, 5)
y_test shape: (445, 5)


# Build and Train the model:

In [37]:
# Training hyperparameters.

# Rate passed to the Dropout layer; dropout is used to reduce overfitting.
drop_ratio = 0.5

# Number of full passes over the training set. In one epoch every training
# sample gets one opportunity to update the model's parameters.
epochs = 2

# Number of training examples processed per iteration.
batch_size = 16

In [38]:
# Feed-forward classifier built as a plain stack of layers:
# word-count vector -> 512-unit hidden layer (ReLU) -> dropout -> softmax over the categories.
model = models.Sequential([
    layers.Dense(512, input_shape=(max_words,)),  # hidden layer, 512 units
    layers.Activation('relu'),
    layers.Dropout(drop_ratio),
    layers.Dense(num_classes),                    # output layer, one unit per category
    layers.Activation('softmax'),
])

# Alternatives one could try here:
#   loss:      'mse', 'binary_crossentropy'
#   optimizer: 'adam', 'SGD', 'RMSprop'
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [39]:
# Train the model. validation_split=0.1 sets aside 10% of the training data
# for validation (per the Keras docs, taken from the end of the arrays before
# shuffling); those samples are not used for weight updates.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

Epoch 1/2
Epoch 2/2


In [40]:
# Evaluate on the held-out test set. score[0] is the loss and score[1] is the
# accuracy metric requested in model.compile.
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(label, value)

Test loss: 0.15797629952430725
Test accuracy: 0.9370786547660828


# Make some predictions:

In [183]:
# Show the predicted category next to the true one for the first few test articles.
text_labels = encoder.classes_

# Never index past the end of a small test set.
num_examples = min(10, len(x_test))

# One batched predict call for all shown articles instead of one call per
# article: model.predict is designed for batches, and calling it repeatedly in
# a loop on single samples is slow and noisy.
predictions = model.predict(x_test[:num_examples]) if num_examples else []

for i in range(num_examples):
    prediction = predictions[i:i + 1]  # shape (1, num_classes), as a single-sample predict would return
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_text.iloc[i][:50], "...")
    print('Actual label:' + test_cat.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

hobbit picture  four years away  lord of the rings ...
Actual label:entertainment
Predicted label: entertainment

game firm holds  cast  auditions video game firm b ...
Actual label:tech
Predicted label: tech

clarke plans migrant point scheme anyone planning  ...
Actual label:politics
Predicted label: politics

radcliffe will compete in london paula radcliffe w ...
Actual label:sport
Predicted label: sport

serena becomes world number two serena williams ha ...
Actual label:sport
Predicted label: sport

ultimate game  award for doom 3 sci-fi shooter doo ...
Actual label:tech
Predicted label: tech

algeria hit by further gas riots algeria suffered  ...
Actual label:business
Predicted label: business

fast lifts rise into record books two high-speed l ...
Actual label:tech
Predicted label: entertainment

muslim group attacks tv drama 24 a british muslim  ...
Actual label:entertainment
Predicted label: entertainment

us tv special for tsunami relief a us television n ...
Actual label:ent