<a href="https://colab.research.google.com/github/Arpitamo/NLP-TOKENISATION/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data.py

def load_data():
    """Return the toy sentiment corpus as ``(sentences, labels)``.

    Both elements are freshly built lists of equal length; a label of
    1 marks a positive sentence and 0 a negative one.
    """
    # Each sentence sits next to its label so the two cannot drift apart.
    examples = (
        ("I love this movie", 1),
        ("I hate this movie", 0),
        ("This film was amazing", 1),
        ("Worst experience ever", 0),
        ("Absolutely fantastic!", 1),
        ("Not my cup of tea", 0),
    )
    sentences = [text for text, _ in examples]
    labels = [label for _, label in examples]
    return sentences, labels

In [5]:
# tokenizer.py

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_tokenizer(sentences, num_words=100):
    """Fit a Keras ``Tokenizer`` on *sentences* and encode them.

    Args:
        sentences: Texts used both to fit the vocabulary and to encode.
        num_words: Passed to ``Tokenizer`` as its ``num_words`` limit.

    Returns:
        A ``(tokenizer, padded)`` pair: the fitted tokenizer and the
        result of ``pad_sequences`` (``padding='post'``) applied to the
        encoded sentences.
    """
    # Words missing from the vocabulary are mapped to the "<OOV>" token.
    fitted = Tokenizer(oov_token="<OOV>", num_words=num_words)
    fitted.fit_on_texts(sentences)
    encoded = pad_sequences(
        fitted.texts_to_sequences(sentences),
        padding='post',
    )
    return fitted, encoded

In [3]:
# model.py

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

def build_model(vocab_size=100):
    """Create and compile the small binary sentiment classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.

    Returns:
        A compiled ``Sequential`` model: embedding -> global average
        pooling -> 16-unit ReLU dense layer -> single sigmoid output.
    """
    stack = [
        Embedding(input_dim=vocab_size, output_dim=16),
        GlobalAveragePooling1D(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(stack)
    # Single sigmoid output, so train with binary cross-entropy.
    classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [6]:
# data.py
def load_data():
    """Build the six-sentence demo dataset.

    Returns:
        A tuple ``(sentences, labels)`` of two new lists, where
        ``labels[i]`` is 1 if ``sentences[i]`` is positive and 0 if it
        is negative.
    """
    labelled = [
        ("I love this movie", 1),
        ("I hate this movie", 0),
        ("This film was amazing", 1),
        ("Worst experience ever", 0),
        ("Absolutely fantastic!", 1),
        ("Not my cup of tea", 0),
    ]
    # Transpose the (sentence, label) rows into two parallel lists.
    sentences, labels = (list(column) for column in zip(*labelled))
    return sentences, labels

# tokenizer.py
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_tokenizer(sentences, num_words=100):
    """Build a vocabulary from *sentences* and return them encoded.

    Args:
        sentences: Texts to fit the tokenizer on and then convert.
        num_words: Forwarded to ``Tokenizer(num_words=...)``.

    Returns:
        ``(tokenizer, padded)`` where ``tokenizer`` is the fitted Keras
        ``Tokenizer`` and ``padded`` is the output of ``pad_sequences``
        called with ``padding='post'``.
    """
    vocab = Tokenizer(num_words=num_words, oov_token="<OOV>")
    vocab.fit_on_texts(sentences)

    # Encode first, then pad, so every row has the same length.
    id_sequences = vocab.texts_to_sequences(sentences)
    padded_ids = pad_sequences(id_sequences, padding='post')

    return vocab, padded_ids

# model.py
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

def build_model(vocab_size=100):
    """Return a compiled Keras model for binary sentiment prediction.

    Args:
        vocab_size: Number of distinct token ids the embedding accepts
            (its ``input_dim``).

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    embedding = Embedding(input_dim=vocab_size, output_dim=16)
    pooling = GlobalAveragePooling1D()
    hidden = Dense(16, activation='relu')
    # One sigmoid unit: the output is read as the positive-class score.
    output = Dense(1, activation='sigmoid')

    net = Sequential([embedding, pooling, hidden, output])
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# train.py
import numpy as np
import json

# Training script: fit the classifier on the toy corpus, then write the
# model and the tokenizer vocabulary to the current working directory.

# Load the sentences and their 0/1 labels; the labels are converted to a
# NumPy array before being handed to model.fit below.
sentences, labels = load_data()
labels = np.array(labels)

# Fit the tokenizer on the corpus and get the post-padded id sequences
# (uses get_tokenizer's default num_words=100).
tokenizer, padded = get_tokenizer(sentences)

# Build the compiled model (uses build_model's default vocab_size=100,
# the same number as the tokenizer's num_words default).
model = build_model()

# Train on the whole corpus for 10 epochs; no validation data is passed.
model.fit(padded, labels, epochs=10)

# Save the trained model; predict.py loads this exact file name.
model.save("sentiment_model.h5")

# Save only the tokenizer's word -> index mapping as JSON; predict.py
# assigns it back onto a new Tokenizer instance.
with open("word_index.json", "w") as f:
    json.dump(tokenizer.word_index, f)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5000 - loss: 0.6946
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.5000 - loss: 0.6933
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5000 - loss: 0.6922
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.5000 - loss: 0.6912
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.8333 - loss: 0.6901
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.8333 - loss: 0.6890
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.6881
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 1.0000 - loss: 0.6872
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m



In [7]:
# predict.py

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import os

# Prediction script: load the artifacts written by the training step and
# print a sentiment label and score for a few new sentences.

# Fail fast when either training artifact is missing.
required_files = ("sentiment_model.h5", "word_index.json")
if not all(os.path.exists(path) for path in required_files):
    print("Required files not found. Please run train.py first.")
    # Raise SystemExit directly instead of calling exit(): exit() is a
    # helper injected by the `site` module for interactive sessions (and
    # is replaced by IPython in notebooks), and calling it without an
    # argument reports success. A non-zero status marks this as a failure.
    raise SystemExit(1)

# Load the trained model.
model = load_model("sentiment_model.h5")

# Load the saved word -> index mapping.
with open("word_index.json", "r") as f:
    word_index = json.load(f)

# Recreate the tokenizer with the same num_words / oov_token values that
# get_tokenizer uses by default, then restore the trained vocabulary.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.word_index = word_index

# New sentences to predict.
test_sentences = [
    "I really enjoyed the movie",
    "It was a waste of time",
    "Absolutely loved it",
    "Terrible plot and acting"
]

# Encode and post-pad the sentences, the same padding side as training.
# NOTE(review): model.input_shape[1] is assumed to be the training
# sequence length (or None, in which case pad_sequences pads to the
# longest test sentence) - confirm for the Keras version in use.
test_seq = tokenizer.texts_to_sequences(test_sentences)
test_pad = pad_sequences(test_seq, padding='post', maxlen=model.input_shape[1])

# Predict: each row of `predictions` holds the single sigmoid output.
predictions = model.predict(test_pad)
for sentence, score in zip(test_sentences, predictions):
    # Pull the scalar out once so the threshold test and the formatted
    # value use the same plain float rather than a one-element array.
    probability = float(score[0])
    sentiment = "Positive" if probability > 0.5 else "Negative"
    print(f"{sentence} → {sentiment} ({probability:.2f})")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
I really enjoyed the movie → Positive (0.50)
It was a waste of time → Positive (0.50)
Absolutely loved it → Positive (0.50)
Terrible plot and acting → Positive (0.50)
