In [8]:
# import libraries
from python_functions import *
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical
import gensim

In [3]:
def load_data(data_filepath):
    
    """
        This function takes in the database path and it reads the data
        
        
        return: The messages which are the predators, the response i.e the labels as a dataframe 
            and an array of all the labels
    """
    df=pd.read_csv(data_filepath, sep=",", header=0)
    new_df=df[df['sent_length']>20].dropna(subset=['clean_headline']).reset_index(drop=True)
    
    X_data_set=new_df['clean_headline']#.tolist()
    Y_data_set = new_df.drop(['clean_headline','sent_length'], axis=1)
    label_name=Y_data_set.columns
    return X_data_set,Y_data_set,label_name

In [4]:
X, Y, category_names=load_data('../model_df.csv')

### Python functions for NLP pipeline

In [5]:
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""
    
    def __init__(self,df):
        self.df=df
    
    def __iter__(self):
        
        for line in self.df:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

In [6]:
sentences = MyCorpus(X)
sentences

<__main__.MyCorpus at 0x12918a717c0>

In [9]:
# Training word2vec model
model = gensim.models.Word2Vec(sentences=sentences,min_count=2,vector_size=200)

In [10]:
# Store just the words + their trained embeddings.
word_vectors = model.wv

word_vectors.save_word2vec_format('test_w2v.txt', binary=False)

In [11]:
# Load a word2vec model stored in the C *text* format.

wv_from_text = KeyedVectors.load_word2vec_format("test_w2v.txt", binary=False)

In [12]:
with open('test_w2v.txt') as f:
    data1 =f.readlines()[1:]

## Training Machine learning models

In [13]:
embedding_dim = 200                                          
max_words = 8000

# apply the vectors provided by Word2Vec to create a word embedding matrix
print("Applying Word2vec vectors...")


embeddings_index = {}

for line in data1:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
print('Found %s word vectors.' % len(embeddings_index))

Applying Word2vec vectors...
Found 10816 word vectors.


In [14]:
## Creating dictionary of the label and category number

news_labels=Y['category'].unique()
news_labels_dict={}
for index in range(len(news_labels)):
    news_labels_dict[news_labels[index]]=index    

print("Loading Narative components data...")

headline =X

## Changing label to categorical values
labels = Y['category'].apply(lambda x: news_labels_dict[x])
print("Loading Narative components data completed.")

# Split data into training and test sets
X_train, X_test, Y_train, y_test = train_test_split(headline,labels,test_size=0.3,random_state=42)

## Calculation of the class weight
weights= compute_class_weight(
           'balanced',
            np.unique(Y_train), 
            Y_train)

weights_dict = dict(zip( np.unique(Y_train),weights))

y_train_data = to_categorical(Y_train)
y_test_dat = to_categorical(y_test)



Loading Narative components data...
Loading Narative components data completed.


1332     2
9129     1
5149     0
7293     1
        ..
11964    3
21575    1
5390     0
860      3
15795    3
Name: category, Length: 15380, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [15]:
# use the Tokenizer from Keras to "learn" a vocabulary from the entire car components text
print("Tokenizing data...")    

tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Padding the dataset to have same dimention of the embedding matrix
x_train_seq = pad_sequences(train_sequences, maxlen=embedding_dim)
x_test_seq = pad_sequences(test_sequences, maxlen=embedding_dim)

labels = np.asarray(labels, dtype='float32')
print('Shape of data tensor:', x_train_seq.shape)
print('Shape of label tensor:', labels.shape)


print("Tokenizing data complete.")

## Applying word embedding 
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector    
print("Applying word2vec vectors completed.")

Tokenizing data...
Found 14825 unique tokens.
Shape of data tensor: (15380, 200)
Shape of label tensor: (21972,)
Tokenizing data complete.
Applying word2vec vectors completed.


## Deep Neural Network

In [16]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
# use Keras to define the structure of the deep neural network   
early_stop=EarlyStopping(monitor='acc',mode='max',verbose=1,patience=5,min_delta=0.01)
print("Creating model structure...")

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=embedding_dim))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(60, activation='relu'))
model.add(Dense(4, activation='sigmoid'))
model.summary()

# fix the weights for the first layer to those provided by the embedding matrix
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
print("Creating model structure completed.")

opt = optimizers.RMSprop(lr=0.001)

print("Training model...")

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

history = model.fit(x_train_seq, y_train_data, epochs=10,
                    batch_size=100, validation_split=0.3,
                    callbacks=[early_stop],class_weight=weights_dict)
print(history)
print("Training model completed.")
print('Model evaluation will print the following metrics: ', model.metrics_names)
evaluation_metrics = model.evaluate(x_test_seq, y_test_dat)
print(evaluation_metrics)

Creating model structure...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          1600000   
_________________________________________________________________
flatten (Flatten)            (None, 40000)             0         
_________________________________________________________________
dense (Dense)                (None, 100)               4000100   
_________________________________________________________________
dense_1 (Dense)              (None, 60)                6060      
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 244       
Total params: 5,606,404
Trainable params: 5,606,404
Non-trainable params: 0
_________________________________________________________________
Creating model structure completed.
Training model...
Epoch 1/10
Epoch 2/10
Epoch 3/

## LSTM Neural Network

In [17]:
print("Creating model structure...")

early_stop=EarlyStopping(monitor='val_acc',mode='max',verbose=1,patience=5,min_delta=0.01)
deep_inputs = Input(shape=(embedding_dim,))
embedding_layer = Embedding(max_words, embedding_dim, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(85,return_sequences=True)(embedding_layer)
LSTM_Layer_2 = LSTM(50)(LSTM_Layer_1)
dense_layer_1 = Dense(4, activation='softmax')(LSTM_Layer_2)
model1 = Model(inputs=deep_inputs, outputs=dense_layer_1)

model1.summary()

print("Creating model structure completed.")

print("Training model...")
model1.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])



Creating model structure...
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 200)          1600000   
_________________________________________________________________
lstm (LSTM)                  (None, 200, 85)           97240     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                27200     
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 204       
Total params: 1,724,644
Trainable params: 124,644
Non-trainable params: 1,600,000
_________________________________________________________________
Creating model structure completed.
Training model...


In [18]:
history2 = model1.fit(x_train_seq, y_train_data, epochs=10,
                    batch_size=100, validation_split=0.3,
                    callbacks=[early_stop],class_weight=weights_dict)

print(history2)
print("Training model completed.")
print('Model evaluation will print the following metrics: ', model1.metrics_names)
evaluation_metrics2 = model1.evaluate(x_test_seq, y_test_dat)
print(evaluation_metrics2)

print("Testing model skill.....")
predictions = model1.predict(x_test_seq).argmax(axis=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<tensorflow.python.keras.callbacks.History object at 0x0000012921547A00>
Training model completed.
Model evaluation will print the following metrics:  ['loss', 'acc']
[1.3892267942428589, 0.17066140472888947]
Testing model skill.....
