In [38]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,SimpleRNN,Bidirectional,Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("ArticlesMarch2018.csv")
df.head(2)

Unnamed: 0,articleID,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL,articleWordCount
0,5a974697410cf7000162e8a4,By BINYAMIN APPELBAUM,article,"Virtual Coins, Real Resources","['Bitcoin (Currency)', 'Electric Light and Pow...",1,Business,1,2018-03-01 00:17:22,Economy,America has a productivity problem. One explan...,The New York Times,News,https://www.nytimes.com/2018/02/28/business/ec...,1207
1,5a974be7410cf7000162e8af,By HELENE COOPER and ERIC SCHMITT,article,U.S. Advances Military Plans for North Korea,"['United States Defense and Military Forces', ...",1,Washington,11,2018-03-01 00:40:01,Asia Pacific,The American military is looking at everything...,The New York Times,News,https://www.nytimes.com/2018/02/28/world/asia/...,1215


In [4]:
df.shape

(1385, 15)

In [6]:
snippet = "\n".join(df['snippet'])
print(snippet)

America has a productivity problem. One explanation may be the growing use of real resources to make virtual products.
The American military is looking at everything from troop rotations to surveillance to casualty evacuations should it be ordered to take action against North Korea.
Can you guess which man is the model public servant?
Censors swung into action after Mr. Xi’s bid to become leader for life resurrected memories of Mao’s personality cult and the feverish emotions that it created.
Apollo, the private equity firm, and Citigroup made large loans last year to the family real estate business of Jared Kushner, President Trump’s senior adviser.
China has sent a top economic adviser to the United States to restore dialogue and quash a trade war. He faces long odds.
The president mixed facts and falsehoods while discussing gun policy and potential solutions with legislators.
Timothy Polin gives us choices.
Susan Wu, a Silicon Valley entrepreneur, has opened a school in Australia.
R

In [13]:
corpus = snippet.lower().split("\n")
corpus[0:3]

['america has a productivity problem. one explanation may be the growing use of real resources to make virtual products.',
 'the american military is looking at everything from troop rotations to surveillance to casualty evacuations should it be ordered to take action against north korea.',
 'can you guess which man is the model public servant?']

In [12]:
len(corpus)

1385

In [14]:
# Initialize the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Get the word index dictionary
word_index = tokenizer.word_index
total_unique_words = len(word_index) + 1  # +1 for padding

print("Vocabulary size:", total_unique_words)
print("Word index:", word_index)

Vocabulary size: 6863


In [18]:
# Generate input sequences
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_seq = token_list[:i + 1]
        input_sequences.append(n_gram_seq)

# Pad sequences to make them of the same length
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
print(len(input_sequences))
print(max_sequence_len)

26937
41


In [20]:
# Prepare the input and output for the model
x_values, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the output labels
y_values = tf.keras.utils.to_categorical(labels, num_classes=total_unique_words)

print("X values (first 3):", x_values[:3])
print("Y values (first 3):", y_values[:3])

X values (first 3): [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0 193]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0 193  14]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0 193  14   2]]
Y values (first 3): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
path = 'glove.txt'
embeddingsmatrix = {}
with open(path,encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.array(values[1:],dtype = 'float32')
        embeddingsmatrix[word] = coeffs

In [24]:
from numpy import dot
from numpy.linalg import norm

In [26]:
glovedict = dict(list(embeddingsmatrix.items()))

In [29]:
a = glovedict['school']
b = glovedict['college']
cosinesimilarity = dot(a,b)/(norm(a)*norm(b))
print(cosinesimilarity)

0.93449956


In [35]:
embeddingsmatrix1 = np.zeros((total_unique_words,50))
for word,i in word_index.items():
    embeddingvector = embeddingsmatrix.get(word)
    if embeddingvector is not None:
        embeddingsmatrix1[i] = embeddingvector
        
    

In [36]:
embeddings_matrix = pd.DataFrame(embeddingsmatrix1,index = ['']+list(word_index.keys()))

In [37]:
embeddings_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
a,0.21705,0.46515,-0.46757,0.10082,1.0135,0.74845,-0.53104,-0.26256,0.16812,0.13182,...,0.13813,0.36973,-0.64289,0.024142,-0.039315,-0.26037,0.12017,-0.043782,0.41013,0.1796
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,...,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
to,0.68047,-0.039263,0.30186,-0.17792,0.42962,0.032246,-0.41376,0.13228,-0.29847,-0.085253,...,-0.094375,0.018324,0.21048,-0.03088,-0.19722,0.082279,-0.09434,-0.073297,-0.064699,-0.26044


In [53]:
model = Sequential([
    Embedding(input_dim=total_unique_words,output_dim=50, weights=[embeddingsmatrix1],input_length=max_sequence_len - 1,trainable=False),Bidirectional(LSTM(256,return_sequences=True)),Dropout(0.2),Bidirectional(LSTM(256)),Dropout(0.2),Dense(128,activation='relu'),Dense(total_unique_words,activation='softmax')])



In [54]:
model.compile(optimizer=Adam(learning_rate=0.001),loss='categorical_crossentropy',metrics=['accuracy'])

In [66]:
# Train the model
history = model.fit(x_values, y_values, epochs=100,validation_split=0.2,verbose=1,batch_size=256)

Epoch 1/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 1s/step - accuracy: 0.0874 - loss: 5.7217 - val_accuracy: 0.0876 - val_loss: 8.8831
Epoch 2/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 1s/step - accuracy: 0.0915 - loss: 5.6192 - val_accuracy: 0.0857 - val_loss: 9.0989
Epoch 3/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 1s/step - accuracy: 0.0937 - loss: 5.4923 - val_accuracy: 0.0876 - val_loss: 9.3346
Epoch 4/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 1s/step - accuracy: 0.0965 - loss: 5.3796 - val_accuracy: 0.0895 - val_loss: 9.3901
Epoch 5/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 1s/step - accuracy: 0.0972 - loss: 5.2794 - val_accuracy: 0.0885 - val_loss: 9.5585
Epoch 6/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 2s/step - accuracy: 0.0995 - loss: 5.2057 - val_accuracy: 0.0885 - val_loss: 9.8885
Epoch 7/100
[1m85/85[0m [32m

[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 1s/step - accuracy: 0.5157 - loss: 1.9003 - val_accuracy: 0.0492 - val_loss: 19.0954
Epoch 52/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 1s/step - accuracy: 0.5281 - loss: 1.8539 - val_accuracy: 0.0479 - val_loss: 19.3253
Epoch 53/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 1s/step - accuracy: 0.5377 - loss: 1.8035 - val_accuracy: 0.0471 - val_loss: 19.5588
Epoch 54/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 1s/step - accuracy: 0.5368 - loss: 1.7835 - val_accuracy: 0.0501 - val_loss: 19.7916
Epoch 55/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 1s/step - accuracy: 0.5442 - loss: 1.7506 - val_accuracy: 0.0477 - val_loss: 19.8393
Epoch 56/100
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 1s/step - accuracy: 0.5578 - loss: 1.7148 - val_accuracy: 0.0471 - val_loss: 19.9079
Epoch 57/100
[1m85/85[0m [

In [67]:
model.summary()

In [68]:
def predict(text, nextwords):
    for i in range(nextwords):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        output_word = tokenizer.sequences_to_texts([[predicted[0]]])
        text += ' '+output_word[0]
    print(text)

In [69]:
predict("productivity problem", 12)

productivity problem to use oodles and understand this what can fix it can it
