In [2]:
text = '''Here are 10 key points about machine learning (ML):
Definition: Machine learning is a subset of artificial intelligence (AI) that enables systems to learn from data, identify patterns, and make decisions with minimal human intervention.
Types of ML: The three main types of machine learning are:
Supervised learning: The model is trained on labeled data (input-output pairs).
Unsupervised learning: The model finds hidden patterns or intrinsic structures in input data that has no labels.
Reinforcement learning: The model learns by interacting with an environment and receiving feedback in the form of rewards or penalties.
Training and Testing: Machine learning models are trained on a portion of data (training set) and then evaluated on unseen data (test set) to measure their performance.
Feature Engineering: Selecting and transforming relevant data features that help improve the performance of ML models is crucial for accurate predictions.
Overfitting and Underfitting:
Overfitting: The model performs well on training data but poorly on unseen data because it has learned noise or irrelevant details.
Underfitting: The model is too simple and fails to capture the underlying trend in the data.
Algorithms: Common machine learning algorithms include decision trees, support vector machines (SVM), k-nearest neighbors (KNN), neural networks, and ensemble methods like random forests and gradient boosting.
Dimensionality Reduction: Techniques like Principal Component Analysis (PCA) or t-SNE are used to reduce the number of input features while preserving essential information, improving model performance and interpretation.
Evaluation Metrics: Various metrics like accuracy, precision, recall, F1-score, and ROC-AUC are used to evaluate how well a model performs, depending on the problem (classification, regression).
Bias-Variance Tradeoff: The goal is to find the right balance between bias (error due to oversimplification) and variance (error due to complexity) to avoid both overfitting and underfitting.
Real-World Applications: Machine learning is used in various industries for applications like recommendation systems, fraud detection, autonomous driving, medical diagnosis, and weather forecasting.'''

In [3]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
tkn = Tokenizer()

In [5]:
tkn.fit_on_texts([text])

In [6]:
tkn.word_index

{'and': 1,
 'the': 2,
 'learning': 3,
 'to': 4,
 'data': 5,
 'of': 6,
 'model': 7,
 'machine': 8,
 'is': 9,
 'on': 10,
 'are': 11,
 'or': 12,
 'in': 13,
 'like': 14,
 'ml': 15,
 'a': 16,
 'that': 17,
 'input': 18,
 'training': 19,
 'performance': 20,
 'overfitting': 21,
 'underfitting': 22,
 'used': 23,
 'systems': 24,
 'patterns': 25,
 'with': 26,
 'types': 27,
 'trained': 28,
 'has': 29,
 'models': 30,
 'set': 31,
 'unseen': 32,
 'features': 33,
 'for': 34,
 'performs': 35,
 'well': 36,
 'algorithms': 37,
 'metrics': 38,
 'various': 39,
 'bias': 40,
 'variance': 41,
 'error': 42,
 'due': 43,
 'applications': 44,
 'here': 45,
 '10': 46,
 'key': 47,
 'points': 48,
 'about': 49,
 'definition': 50,
 'subset': 51,
 'artificial': 52,
 'intelligence': 53,
 'ai': 54,
 'enables': 55,
 'learn': 56,
 'from': 57,
 'identify': 58,
 'make': 59,
 'decisions': 60,
 'minimal': 61,
 'human': 62,
 'intervention': 63,
 'three': 64,
 'main': 65,
 'supervised': 66,
 'labeled': 67,
 'output': 68,
 'pairs':

In [7]:
len(tkn.word_index)

191

In [8]:
input_sequences = []
for sentence in text.split('\n'):
    tokanized_sentences= tkn.texts_to_sequences([sentence])[0]  # Apply tokanizer on sentences
    
    for i in range(1, len(tokanized_sentences)):
        n_gram= tokanized_sentences[:i + 1]
        input_sequences.append(n_gram)


In [9]:
input_sequences

[[45, 11],
 [45, 11, 46],
 [45, 11, 46, 47],
 [45, 11, 46, 47, 48],
 [45, 11, 46, 47, 48, 49],
 [45, 11, 46, 47, 48, 49, 8],
 [45, 11, 46, 47, 48, 49, 8, 3],
 [45, 11, 46, 47, 48, 49, 8, 3, 15],
 [50, 8],
 [50, 8, 3],
 [50, 8, 3, 9],
 [50, 8, 3, 9, 16],
 [50, 8, 3, 9, 16, 51],
 [50, 8, 3, 9, 16, 51, 6],
 [50, 8, 3, 9, 16, 51, 6, 52],
 [50, 8, 3, 9, 16, 51, 6, 52, 53],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4, 56],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4, 56, 57],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4, 56, 57, 5],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4, 56, 57, 5, 58],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4, 56, 57, 5, 58, 25],
 [50, 8, 3, 9, 16, 51, 6, 52, 53, 54, 17, 55, 24, 4, 56, 57, 5, 58, 

In [10]:
from tensorflow.keras.utils import pad_sequences

In [11]:
max_len = max([len(i) for i in input_sequences])
max_len

29

In [12]:
padded_input_sequences= pad_sequences(input_sequences, maxlen= max_len, padding= 'pre')
padded_input_sequences

array([[  0,   0,   0, ...,   0,  45,  11],
       [  0,   0,   0, ...,  45,  11,  46],
       [  0,   0,   0, ...,  11,  46,  47],
       ...,
       [  0,   0,   0, ..., 188, 189,   1],
       [  0,   0,   0, ..., 189,   1, 190],
       [  0,   0,   0, ...,   1, 190, 191]])

In [13]:
X = padded_input_sequences[:, : -1] # Only exclude last value
y = padded_input_sequences[:, -1] # Only include last value

In [14]:
from tensorflow.keras.utils import to_categorical
y= to_categorical(y, num_classes= 192)

In [15]:
X.shape, y.shape

((304, 28), (304, 192))

In [16]:
import tensorflow
from tensorflow import keras
from keras import Sequential
from keras.layers import Input, Embedding, LSTM, Dense

In [17]:
model = Sequential([
    Input(shape= (28, )),
    Embedding(input_dim= 192, output_dim= 100),
    LSTM(units= 150),
    Dense(192, activation= 'softmax')
])

model.summary()

In [18]:
model.compile(optimizer= 'adam', loss= 'categorical_crossentropy', metrics= ['accuracy'])

In [19]:
model.fit(X, y, epochs= 100)

Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 49ms/step - accuracy: 0.0270 - loss: 5.2547
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.0613 - loss: 5.1543
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.0408 - loss: 5.0007
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.0422 - loss: 4.8226
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.0387 - loss: 4.8550
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.0604 - loss: 4.7904
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0714 - loss: 4.7368
Epoch 8/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.0971 - loss: 4.6286
Epoch 9/100
[1m10/10[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x1cb4dd579d0>

In [20]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assume `model` and `tkn` (the tokenizer) are already defined elsewhere in your code
word = 'patterns'
max_len = 28  # Ensure this matches the max length expected by your model

for i in range(10):
    # Tokenize the input text
    token_text = tkn.texts_to_sequences([word])
    
    # Pad the tokenized sequence to the required max_len
    padded_token_text = pad_sequences(token_text, maxlen=max_len, padding='pre')
    
    # Reshape the padded sequence to match the input shape expected by the model
    # Model expects input of shape (None, 28), so we ensure the input is (1, 28)
    padded_token_text = np.array(padded_token_text)  # Ensure it's a numpy array
    padded_token_text = padded_token_text.reshape(1, max_len)  # Reshape to (1, 28)

    # Predict the next word position
    pos = np.argmax(model.predict(padded_token_text), axis=-1)
    
    # Map the predicted position back to the word
    for words, index in tkn.word_index.items():
        if index == pos:
            word = word + ' ' + words
            print(word)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 719ms/step
patterns and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
patterns and testing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
patterns and testing machine
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
patterns and testing machine learning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
patterns and testing machine learning models
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
patterns and testing machine learning models are
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
patterns and testing machine learning models are trained
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
patterns and testing machine learning models are trained on
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
patterns and testing machine learning m