In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Newsdataset.csv')
df.head()

Unnamed: 0,Sr. No,Newspaper Name,Published Date,URL,Headline,Content,Human Summary,Category
0,1,The Hindu,2023-12-01,https://www.thehindu.com/news/national/sample-...,"""India Launches Chandrayaan-4 Successfully""","India successfully launched Chandrayaan-4, aim...",India launched Chandrayaan-4 to study the moon...,Science and Technology
1,2,Hindustan Times,2022-08-15,https://www.hindustantimes.com/india/sample-ne...,"""PM Announces Digital India 2.0 on Independenc...",The Prime Minister unveiled the Digital India ...,"PM launched Digital India 2.0, focusing on tec...",National News
2,3,Indian Express,2021-04-10,https://www.indianexpress.com/news/sample-news-3,"""Economic Growth Rebounds in Q1 2021""",India’s GDP showed a rebound in the first quar...,"India's Q1 2021 GDP rebounded, indicating a re...",Business and Finance
3,4,The Telegraph,2023-05-18,https://www.telegraphindia.com/nation/sample-n...,"""Cyclone Yaas Causes Widespread Damage in East...",Cyclone Yaas wreaked havoc in Odisha and West ...,Cyclone Yaas caused severe damage in Eastern I...,Environment
4,5,Deccan Chronicle,2020-10-05,https://www.deccanchronicle.com/nation/sample-...,"""Hyderabad Emerges as India’s Vaccine Hub""",Hyderabad became a central hub for COVID-19 va...,Hyderabad gained recognition as the COVID-19 v...,Health and Wellness


In [6]:
df1 = df[['Content','Human Summary']]
df1.head()

Unnamed: 0,Content,Human Summary
0,"India successfully launched Chandrayaan-4, aim...",India launched Chandrayaan-4 to study the moon...
1,The Prime Minister unveiled the Digital India ...,"PM launched Digital India 2.0, focusing on tec..."
2,India’s GDP showed a rebound in the first quar...,"India's Q1 2021 GDP rebounded, indicating a re..."
3,Cyclone Yaas wreaked havoc in Odisha and West ...,Cyclone Yaas caused severe damage in Eastern I...
4,Hyderabad became a central hub for COVID-19 va...,Hyderabad gained recognition as the COVID-19 v...


In [13]:
df1['Human Summary'] = df1['Human Summary'].apply(lambda x: '<start> ' + x + ' <end>')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Human Summary'] = df1['Human Summary'].apply(lambda x: '<start> ' + x + ' <end>')


In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
import numpy as np

def prepare_data(texts, summaries, vocab_size=10000, max_text_len=100, max_summary_len=20):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts + summaries)
    
    text_sequences = tokenizer.texts_to_sequences(texts)
    summary_sequences = tokenizer.texts_to_sequences(summaries)
    
    text_padded = tf.keras.preprocessing.sequence.pad_sequences(
        text_sequences, maxlen=max_text_len, padding='post')
    summary_padded = tf.keras.preprocessing.sequence.pad_sequences(
        summary_sequences, maxlen=max_summary_len, padding='post')
    
    return text_padded, summary_padded, tokenizer

In [4]:
# Build Seq2Seq model
def build_seq2seq_model(vocab_size, embedding_dim=100, lstm_units=128, max_text_len=100, max_summary_len=20):
    # Encoder
    encoder_inputs = Input(shape=(max_text_len,))
    encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]
    
    # Decoder
    decoder_inputs = Input(shape=(max_summary_len,))
    decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    
    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Inference models
    encoder_model = Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = Input(shape=(lstm_units,))
    decoder_state_input_c = Input(shape=(lstm_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return model, encoder_model, decoder_model


In [5]:
# Inference function
def decode_sequence(input_seq, encoder_model, decoder_model, tokenizer, max_summary_len=20):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']  # Assume <start> token exists
    
    stop_condition = False
    decoded_sentence = []
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = None
        for word, index in tokenizer.word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break
        
        if sampled_word is None or sampled_word == '<end>' or len(decoded_sentence) > max_summary_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return ' '.join(decoded_sentence)

In [8]:
text_padded, summary_padded, tokenizer = prepare_data(df1['Content'], df1['Human Summary'])

In [12]:
# Build and train model
model, encoder_model, decoder_model = build_seq2seq_model(len(tokenizer.word_index) + 1)
model.fit([text_padded, summary_padded[:, :-1]], summary_padded[:, 1:],
              batch_size=64, epochs=10, validation_split=0.2)



Epoch 1/10


ValueError: in user code:

    File "C:\Users\cappr\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\cappr\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\cappr\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\cappr\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\cappr\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\cappr\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 1 of layer "model" is incompatible with the layer: expected shape=(None, 20), found shape=(None, 19)
