In [1]:
import glob
import os
import re
import pandas as pd

In [2]:
document=[]
for doc in glob.glob('C:/Users/sirola/Desktop/Insofe/CUTe4/Data/*'):
    with(open(doc, 'r')) as f:
        text = f.readlines()
        text = ''.join(text)
        text=re.sub('<.*>','',text)
        text=re.sub('\'','',text)
        text=re.sub('\n|\t',' ',text)
        text=re.sub('[\d]','',text)
        text=re.sub('[$]','',text) 
        text=re.sub('[^(a-z|A-Z|\s)]','',text)        
    document.append(text)

In [3]:
label=[]
for labels in glob.glob('C:/Users/sirola/Desktop/Insofe/CUTe4/Data/*'):
    tclass=re.split('_',labels)[1]
    label.append(tclass)  

In [4]:
data=pd.DataFrame([document,label]).T
data.columns=['Document','Target']
data.head(5)

Unnamed: 0,Document,Target
0,In article freddshuksan (Fred Dickey) write...,2
1,In article gwmsplsplloralcom (Gary W Mahan)...,2
2,THANKS TO ALL OF YOU WHO RESPONDED TO MY POS...,2
3,The subject says it all My Chev S Pickups l...,2
4,NNTPPostingHost blackercaltechedu wolfsonre...,2


In [5]:
import numpy as np  # Numpy library for creating and modifying arrays.


# Print the unique classes and their counts/frequencies
classes = np.unique(data['Target'], return_counts=True) # np.unique returns a tuple with class names and counts
print(classes[0]) #Print the list of unique classes
print(classes[1]) #Print the list of frequencies of the above classes


['1' '2' '3' '4' '5' '6']
[1000 1006  999  989  997  993]


In [6]:
train=data.sample(frac=0.7)
test=data.drop(train.index)

train.shape
train.head()

Unnamed: 0,Document,Target
1947,frankDSuucp (Frank ODwyer) writes In artic...,1
2644,MessageID References NNTPPostingHost dolph...,3
330,Distribution world MessageID References R...,2
3584,In article rindenterprisebihharvardedu (Davi...,4
2530,I just bought a little gizmo that is suppo...,3


In [7]:
max_num_words = 10000
seq_len = 50
embedding_size = 100

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(train.Document) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(train.Document) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=50) #pad_sequences makes every sequence a fixed size list by padding with 0s 

x_test = tokenizer.texts_to_sequences(test.Document) 
x_test = pad_sequences(x_test, maxlen=50)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test .shape, x_test.shape # Check the dimensions of x_train and x_test  


((4189, 50), (1795, 50))

In [10]:
unique_labels = list(train.Target.unique())
print(unique_labels)

['1', '3', '2', '4', '5', '6']


In [11]:
from keras.utils import to_categorical # This convers the labels to one-hot vectors(Dummies)

y_train = np.array([unique_labels.index(i) for i in train.Target]) # Convert the word labels to indeces
y_train = to_categorical(y_train) # Dummify the labels
y_test = np.array([unique_labels.index(i) for i in test.Target])
y_test = to_categorical(y_test)

In [12]:
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding,Dropout # Import layers from Keras
from keras.models import Sequential
# Building an LSTM model
model = Sequential() # Call Sequential to initialize a network
model.add(Embedding(input_dim = max_num_words, 
                    input_length = seq_len, 
                    output_dim = embedding_size)) # Add an embedding layer which represents each unique token as a vector
model.add(LSTM(10, return_sequences=True))# Add an LSTM layer
model.add(Dropout(0.5))
model.add(LSTM(10, return_sequences=False))
model.add(Dropout(0.5))

model.add(Dense(6, activation='softmax')) # Add an ouput layer. Since classification, 3 nodes for 3 classes.

In [13]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 10)            4440      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 10)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                840       
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 66        
Total params: 1,005,346
Trainable params: 1,005,346
Non-trainable params: 0
_________________________________________________________________


In [14]:
from keras.optimizers import Adam
adam = Adam(lr=0.001)

In [15]:
# Mention the optimizer, Loss function and metrics to be computed
model.compile(optimizer=adam,                  # 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            # These metrics are computed for evaluating and stored in history

model.fit(x_train, y_train, epochs=5, validation_split=0.25)

Train on 3141 samples, validate on 1048 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26f1c075f60>

In [16]:
test_prob = model.predict(x_test)
test_prob.shape

(1795, 6)

In [17]:
test_prob[:6]

array([[0.00446592, 0.07487506, 0.52626747, 0.0189277 , 0.07338848,
        0.30207533],
       [0.00167764, 0.04531724, 0.47906646, 0.00192823, 0.0145673 ,
        0.45744315],
       [0.00267257, 0.05991123, 0.49982387, 0.00435177, 0.0245114 ,
        0.40872914],
       [0.07071646, 0.22765464, 0.22574715, 0.11254576, 0.16830523,
        0.1950307 ],
       [0.0016872 , 0.03129255, 0.55569595, 0.00316668, 0.02708302,
        0.3810746 ],
       [0.00943799, 0.03809113, 0.46868163, 0.01825872, 0.14203596,
        0.32349455]], dtype=float32)

In [18]:
test_classes = model.predict_classes(x_test)
test_classes.shape

(1795,)

In [19]:
test_classes = np.argmax(test_prob, axis=1)
test_classes.shape

(1795,)