<a href="https://colab.research.google.com/github/2303A51376/Natural-Language-Processing/blob/main/NLP_PROJECT_1376.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the Shakespeare lines CSV from the Colab session storage.
DATASET_CSV = '/content/Shakespeare_data.csv'
data = pd.read_csv(DATASET_CSV)

# Plain-text preview of the first rows of the frame.
preview_rows = data.head()
print(preview_rows)

   Dataline      Play  PlayerLinenumber ActSceneLine         Player  \
0         1  Henry IV               NaN          NaN            NaN   
1         2  Henry IV               NaN          NaN            NaN   
2         3  Henry IV               NaN          NaN            NaN   
3         4  Henry IV               1.0        1.1.1  KING HENRY IV   
4         5  Henry IV               1.0        1.1.2  KING HENRY IV   

                                          PlayerLine  
0                                              ACT I  
1                       SCENE I. London. The palace.  
2  Enter KING HENRY, LORD JOHN OF LANCASTER, the ...  
3             So shaken as we are, so wan with care,  
4         Find we a time for frighted peace to pant,  


In [None]:

# Collect every entry of the 'PlayerLine' column, in order, into a plain list.
text = list(data['PlayerLine'])

# Peek at the first five raw lines.
text[:5]

['ACT I',
 'SCENE I. London. The palace.',
 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others',
 'So shaken as we are, so wan with care,',
 'Find we a time for frighted peace to pant,']

In [None]:

# Matches every character that is neither an ASCII letter nor whitespace.
# The range is written A-Z on purpose: the broader A-z range would also span
# the symbols [ \ ] ^ _ ` and let them slip through the cleaning step.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Reduce a line of text to lowercase ASCII letters and whitespace.

    Punctuation, digits and every other non-letter symbol are removed;
    whitespace is kept exactly as it is (it is not collapsed), so removing a
    token such as a number can leave two adjacent spaces.

    Parameters
    ----------
    text : str
        Raw line. Any non-string value (for example the float NaN pandas uses
        for a missing CSV cell) is treated as an empty line.

    Returns
    -------
    str
        The cleaned, lower-cased line.
    """
    if not isinstance(text, str):
        return ''
    return _NON_LETTER_PATTERN.sub('', text).lower()

# Run every raw line through clean_text, keeping the original order.
texts = [clean_text(raw_line) for raw_line in text]

# Peek at the first five cleaned lines.
texts[:5]

['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant']

In [None]:
# Keep only the first 10000 cleaned lines for model training.
texts = texts[:10000]

# Build the word -> integer vocabulary with the Keras Tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode each line as a list of word indices (one list per line).
encoded_lines = tokenizer.texts_to_sequences(texts)
print('Text -->>', texts[0])
print('Embedding -->>', encoded_lines[0])

# Pad at the front ('pre') so every row has the length of the longest line.
Max_Sequence_Len = max(len(encoded) for encoded in encoded_lines)
text_sequences = pad_sequences(
    encoded_lines,
    maxlen=Max_Sequence_Len,
    padding='pre',
)

print('Maximum Sequence Length -->>', Max_Sequence_Len)
print('Text Sequence -->>\n', text_sequences[0])
print('Text Sequence Shape -->>', text_sequences.shape)

# This code is modified by Susobhan Akhuli

Text -->> act i
Embedding -->> [455, 4]
Maximum Sequence Length -->> 54
Text Sequence -->>
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 455   4]
Text Sequence Shape -->> (10000, 54)


In [None]:
# Inputs are every column except the last; the target is the last column,
# i.e. the index of the final word of each padded line.
X = text_sequences[:, :-1]
last_word_ids = text_sequences[:, -1]

# Vocabulary size: one slot per known word plus one extra for index 0.
word_index = tokenizer.word_index
total_words = len(word_index) + 1

# One-hot encode the integer targets over the whole vocabulary.
y = to_categorical(last_word_ids, num_classes=total_words)

print('First Input :', X[0])
print('First Target :', last_word_ids[0])
print('Total Number of Words:', total_words)
print('Input Shape :', X.shape)
print('Target Shape :', y.shape)

First Input : [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 455]
First Target : 4
Total Number of Words: 7865
Input Shape : (10000, 53)
Target Shape : (10000, 7865)


In [None]:
# Next-word model: 100-d embedding -> LSTM(150) -> softmax over the vocabulary.
# NOTE(review): no compile()/fit() call is visible in this notebook before the
# model is used for prediction further down - confirm a training cell was not
# dropped when the notebook was saved.
model = Sequential([
    Embedding(total_words, 100, input_length=Max_Sequence_Len - 1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])

model.summary()

In [None]:
def autoCompletations(text, model):
    """Append the model's predicted next word, followed by '.', to ``text``.

    The seed text is encoded with the notebook-level ``tokenizer`` and
    front-padded to ``Max_Sequence_Len - 1`` entries before being passed to
    ``model.predict``.

    Parameters
    ----------
    text : str
        Seed text to extend.
    model
        Object whose ``predict(batch, verbose=0)`` returns scores over word
        indices; the index with the highest score is taken as the prediction.

    Returns
    -------
    str
        ``text + ' ' + predicted_word + '.'``. ``predicted_word`` is the empty
        string when the predicted index has no vocabulary entry (for example
        index 0).
    """
    # pad_sequences accepts the nested list directly; no np.array wrap needed.
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=Max_Sequence_Len - 1, padding='pre')

    predicted_index = int(np.argmax(model.predict(padded, verbose=0)))

    # index_word is the tokenizer's reverse (index -> word) mapping, so a
    # single dictionary lookup replaces a scan over the whole vocabulary.
    predicted_word = tokenizer.index_word.get(predicted_index, '')
    return text + " " + predicted_word + '.'


complete_sentence = autoCompletations('I have seen this', model)
complete_sentence

'I have seen this render.'

In [None]:
def generate_text(text, new_words):
    """Extend ``text`` by ``new_words`` predicted words, one word per step.

    Each step calls ``autoCompletations`` with the notebook-level ``model``
    and drops the final character of its result (the '.' it appends) before
    feeding the text back in.
    """
    extended = text
    for _step in range(new_words):
        with_period = autoCompletations(extended, model)
        extended = with_period[:-1]
    return extended


generated_text = generate_text('I have seen', 5)
generated_text

'I have seen render sceptres sceptres neglected wicked'

In [1]:
!pip install transformers torch
from transformers import pipeline




In [2]:
# Masked-word prediction pipeline backed by the bert-base-uncased checkpoint.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# The blank to fill is marked with the [MASK] token.
sentence = "Artificial intelligence will [MASK] the world."

# Ask the pipeline for candidate fillers of the masked position.
results = fill_mask(sentence)

# Print each completed sentence with its score rounded to four decimals.
for candidate in results:
    filled, score = candidate['sequence'], candidate['score']
    print(f"{filled}  -->  Score: {score:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


artificial intelligence will change the world.  -->  Score: 0.2798
artificial intelligence will rule the world.  -->  Score: 0.1348
artificial intelligence will control the world.  -->  Score: 0.1072
artificial intelligence will dominate the world.  -->  Score: 0.0642
artificial intelligence will transform the world.  -->  Score: 0.0348


In [4]:
# Same fill-mask task, this time with the roberta-base checkpoint.
fill_mask_roberta = pipeline("fill-mask", model="roberta-base")

# Here the blank is written as <mask> rather than [MASK].
sentence = "The future of technology is <mask>."
results = fill_mask_roberta(sentence)

# Print each completed sentence with its score rounded to four decimals.
for candidate in results:
    filled, score = candidate['sequence'], candidate['score']
    print(f"{filled}  -->  Score: {score:.4f}")

Device set to use cpu


The future of technology is uncertain.  -->  Score: 0.1020
The future of technology is here.  -->  Score: 0.0658
The future of technology is now.  -->  Score: 0.0563
The future of technology is clear.  -->  Score: 0.0384
The future of technology is bright.  -->  Score: 0.0255


In [55]:
from bertopic import BERTopic

# NOTE(review): the `!pip install bertopic` cell appears below this cell and
# carries a lower execution count; on a fresh kernel this import runs first -
# confirm the intended cell order.
# NOTE(review): the recorded topics (e.g. "boulter", "recurring role in 2003")
# do not look like Shakespeare vocabulary - `texts` may have held different
# data when this cell was last executed; verify.

# Fit a BERTopic model with default settings on the documents in `texts`.
topic_model = BERTopic()
fit_output = topic_model.fit_transform(texts)
topics, probs = fit_output

# Summary table of the discovered topics; show its first five rows.
topic_info = topic_model.get_topic_info()
topic_info.head(5)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,23,0____,"[, , , , , , , , , ]","[, , ]"
1,1,21,1_the_in_of_and,"[the, in, of, and, boulter, he, by, to, was, as]",[ He had a recurring role in 2003 on two episo...


In [53]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


In [56]:
# Print the keyword/score pairs for the first five rows of the topic table.
first_topic_ids = topic_info['Topic'].head(5)
for topic_id in first_topic_ids:
    keywords = topic_model.get_topic(topic_id)
    print(f"Topic {topic_id}: ", keywords)


Topic 0:  [('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]
Topic 1:  [('the', np.float64(0.15206393364658563)), ('in', np.float64(0.11688356524992599)), ('of', np.float64(0.08223296965886961)), ('and', np.float64(0.0722868272981208)), ('boulter', np.float64(0.06330967461803796)), ('he', np.float64(0.06330967461803796)), ('by', np.float64(0.05557628869186851)), ('to', np.float64(0.05355354815345628)), ('was', np.float64(0.05355354815345628)), ('as', np.float64(0.05355354815345628))]
