In [1]:
import pandas as pd
import numpy as np

In [2]:
bbc_mixed=pd.read_csv('bbc_news_mixed.csv')
bbc_mixed.head()

Unnamed: 0,text,label
0,Cairn shares slump on oil setback\n\nShares in...,business
1,Egypt to sell off state-owned bank\n\nThe Egyp...,business
2,Cairn shares up on new oil find\n\nShares in C...,business
3,Low-cost airlines hit Eurotunnel\n\nChannel Tu...,business
4,"Parmalat to return to stockmarket\n\nParmalat,...",business


## Performing a text classification on the given dataset, using simple neural networks
- Before performing text classification, we need to preprocess the text data and then pass it to our neural network
- first we need to label encode our target column 'Label' 
- we need to convert the text column to sequence of tokens 
- padding the sequences to make uniform length

In [3]:
#Label encoding 
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

labelcode=LabelEncoder()

bbc_mixed.label=labelcode.fit_transform(bbc_mixed.label)

y=to_categorical(bbc_mixed.label)

Using TensorFlow backend.


In [4]:
y

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [5]:
#converting the text into sequence of tokens

from sklearn.model_selection  import train_test_split

from keras.preprocessing.text import Tokenizer


x_train,x_test,y_train,y_test=train_test_split(bbc_mixed['text'],y,test_size=0.2,random_state=42)
total_x=x_train.append(x_test)


tokenizer=Tokenizer()
tokenizer.fit_on_texts(total_x)

#converting text to sequence of tokens
x_train_tokens=tokenizer.texts_to_sequences(x_train)
x_test_tokens=tokenizer.texts_to_sequences(x_test)

# calculate maximum length of sequence and vocab size
max_len=total_x.str.split().apply(lambda x: len(x)).max()
vocab_size = len(tokenizer.word_index)+1

In [6]:
#padding sequences to uniform length
from keras.preprocessing.sequence import pad_sequences

x_train_pad=pad_sequences(x_train_tokens,maxlen=max_len,padding='post')
x_test_pad=pad_sequences(x_test_tokens,maxlen=max_len,padding='post')
print(x_train_pad.shape)
print(x_test_pad.shape)

(1780, 4432)
(445, 4432)


In [7]:
vocab_size 

32360

In [8]:
from keras.models import Sequential
from keras.layers import Dense,Flatten,Embedding

#embedding size
embedding_size=100
vocab_100=int(vocab_size/100)

#initializing the model
model=Sequential()
model.add(Embedding(vocab_size,embedding_size,input_length=max_len))
model.add(Dense(500,activation='relu'))
model.add(Dense(vocab_100,activation='relu'))
model.add(Flatten())

#add final layer with 5 outputs
model.add(Dense(5,activation='softmax'))

#compiling the model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [9]:
#checking the models summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4432, 100)         3236000   
_________________________________________________________________
dense_1 (Dense)              (None, 4432, 500)         50500     
_________________________________________________________________
dense_2 (Dense)              (None, 4432, 323)         161823    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1431536)           0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 7157685   
Total params: 10,606,008
Trainable params: 10,606,008
Non-trainable params: 0
_________________________________________________________________


In [10]:
#now we can evaluate the model, as the model is trained
model.fit(x_train_pad,y_train,epochs=3,validation_data=[x_test_pad,y_test])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1780 samples, validate on 445 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1c41e970b08>