<a href="https://colab.research.google.com/github/Akashpandey237/Akashpandey237-Data-Augmentation-in-Deep-Learning/blob/main/seq2seqmodeltextsumm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Set up Kaggle API credentials (replace with your own)
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
# Download dataset from Kaggle
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail
!unzip newspaper-text-summarization-cnn-dailymail.zip

# Load data into DataFrames
import pandas as pd

train_df = pd.read_csv('/content/cnn_dailymail/train.csv')
test_df = pd.read_csv('/content/cnn_dailymail/test.csv')

# Preprocess text data (tokenization, padding, etc.)
# ... (Fill in with your specific preprocessing steps)

Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 98% 495M/503M [00:04<00:00, 127MB/s]
100% 503M/503M [00:04<00:00, 119MB/s]
Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail/test.csv  
  inflating: cnn_dailymail/train.csv  
  inflating: cnn_dailymail/validation.csv  


In [9]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding, Attention
# ... (Add any other required layers)

# Reset Keras' global state BEFORE any layer is created, so re-running this
# cell rebuilds the model from a clean slate (layer names restart at
# input_1 / embedding / ...). Clearing it after the layers already exist
# defeats the purpose.
tf.keras.backend.clear_session()

# Hyper-parameters shared by the encoder and the decoder.
vocab_size = 10000
embedding_dim = 128
hidden_units = 128

# Define encoder-decoder architecture with attention
# Example using LSTM layers (adapt for GRU if preferred)
max_input_length = 1000  # Replace with the actual maximum length

# Encoder
encoder_inputs = Input(shape=(max_input_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
# return_sequences=True is required for attention: the Attention layer needs
# one encoder vector per input timestep, i.e. a
# (batch, max_input_length, hidden_units) tensor. Without it the LSTM only
# returns its last output, (batch, hidden_units), and there is no time axis
# to attend over.
encoder_lstm = LSTM(units=hidden_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# Final hidden/cell state, used to initialise the decoder.
encoder_states = [state_h, state_c]

# Decoder
max_output_length = 50  # Replace with the actual maximum length
decoder_inputs = Input(shape=(max_output_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units=hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention layer
# query = decoder states, value (and key) = per-timestep encoder states.
attention = Attention()([decoder_outputs, encoder_outputs])

# Concatenate attention output and decoder output
decoder_concat_output = Concatenate(axis=-1)([decoder_outputs, attention])

# Dense layer for prediction
# Softmax over the vocabulary at every decoder timestep.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_output)

# Compile the model
# categorical_crossentropy expects one-hot targets of shape
# (batch, max_output_length, vocab_size).
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
# Print the layer-by-layer table (output shapes, parameter counts, connections)
# of the model built in the previous cell.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1000)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1000, 128)            1280000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 50, 128)              1280000   ['input_2[0][0]']             
                                                                                              

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Replace these with your actual data
# NOTE(review): for teacher forcing the decoder input is normally the target
# shifted right by one step (with start/end tokens) - confirm when wiring in
# the real article/summary texts.
encoder_input_texts = ["your", "encoder", "input", "texts"]
decoder_input_texts = ["your", "decoder", "input", "texts"]
decoder_target_texts = ["your", "decoder", "target", "texts"]

# Set your maximum sequence lengths
# Reuse the lengths the model's Input layers were built with instead of
# re-typing the literals: the padded arrays must match those shapes exactly,
# otherwise model.fit fails with a shape mismatch.
max_encoder_seq_length = max_input_length
max_decoder_seq_length = max_output_length

# Tokenize and pad sequences for encoder input
# padding and truncating are both 'post' so that over-long sequences keep
# their beginning; pad_sequences truncates from the front by default.
encoder_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
encoder_tokenizer.fit_on_texts(encoder_input_texts)
encoder_input_data = pad_sequences(
    encoder_tokenizer.texts_to_sequences(encoder_input_texts),
    maxlen=max_encoder_seq_length,
    padding='post',
    truncating='post',
)

# Tokenize and pad sequences for decoder input and target
# One tokenizer is fitted on both lists so input and target share a vocabulary.
decoder_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
decoder_tokenizer.fit_on_texts(decoder_input_texts + decoder_target_texts)
decoder_input_data = pad_sequences(
    decoder_tokenizer.texts_to_sequences(decoder_input_texts),
    maxlen=max_decoder_seq_length,
    padding='post',
    truncating='post',
)
decoder_target_data = pad_sequences(
    decoder_tokenizer.texts_to_sequences(decoder_target_texts),
    maxlen=max_decoder_seq_length,
    padding='post',
    truncating='post',
)

# Convert decoder target data to one-hot encoding
# Result has shape (num_samples, max_decoder_seq_length, vocab_size), which is
# what the model's categorical_crossentropy loss expects. This grows quickly
# with the number of samples.
decoder_target_data = to_categorical(decoder_target_data, num_classes=vocab_size)

# Train-test split
# The three arrays are split together so rows stay aligned; 20% is held out.
encoder_input_train, encoder_input_val, decoder_input_train, decoder_input_val, decoder_target_train, decoder_target_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.2, random_state=42
)


In [14]:
# Sanity check: show the shape of each training array, one per line, in the
# order encoder input, decoder input, decoder target.
for split_array in (encoder_input_train, decoder_input_train, decoder_target_train):
    print(split_array.shape)


(3, 1000)
(3, 50)
(3, 50, 10000)


In [15]:
# Train the model
num_epochs = 10  # Replace with the desired number of epochs
batch_size = 32  # Replace with your preferred batch size

# Validate on the hold-out arrays produced by train_test_split rather than
# carving a second validation slice out of the training arrays with
# validation_split: that would shrink the training set again and leave the
# *_val arrays unused during fitting.
# The History object is kept so per-epoch metrics can be inspected later.
history = model.fit(
    [encoder_input_train, decoder_input_train],
    decoder_target_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fbd7110e740>

In [16]:
# Evaluate on the hold-out split. The cell output is the value returned by
# evaluate; the model was compiled with one loss and metrics=['accuracy'],
# so this is [loss, accuracy].
model.evaluate([encoder_input_val, decoder_input_val], decoder_target_val)




[7.343179702758789, 0.9800000190734863]

In [17]:
# Run the model on the hold-out inputs. The decoder input is fed the same way
# as during training; the result comes from the model's final softmax layer.
predictions = model.predict([encoder_input_val, decoder_input_val])




In [18]:
# Save the model to an .h5 file in the working directory.
# NOTE(review): the recorded output shows a warning raised from
# saving_api.save_model for this call - presumably the legacy-HDF5 notice;
# confirm whether this file is still needed alongside the .keras save.
model.save("your_model_name.h5")


  saving_api.save_model(


In [19]:
# Save the model again, this time to a .keras file in the working directory.
model.save('your_model_name.keras')


In [20]:
from tensorflow.keras.models import load_model

# Read the .keras file written by the previous cell back into a new model object.
loaded_model = load_model('your_model_name.keras')


In [21]:
# Evaluate the model that was just reloaded from disk (not the in-memory
# `model`, which an earlier cell already evaluated) so that this cell checks
# the save/load round trip. The metrics should match the earlier evaluation.
evaluation_metrics = loaded_model.evaluate([encoder_input_val, decoder_input_val], decoder_target_val)
print("Evaluation Metrics:", evaluation_metrics)


Evaluation Metrics: [7.343179702758789, 0.9800000190734863]


In [22]:
# List the working directory to confirm the dataset and saved model files exist.
!ls

cnn_dailymail  newspaper-text-summarization-cnn-dailymail.zip  your_model_name.h5
kaggle.json    sample_data				       your_model_name.keras
