In [None]:
import numpy as np
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow_datasets as tfds

# -------------------------------
# 1. Load IMDB dataset
# -------------------------------
def _texts_and_labels(dataset):
    """Materialize a supervised TFDS split as two parallel Python lists.

    Args:
        dataset: A split loaded with ``as_supervised=True``, i.e. one that
            yields ``(text, label)`` pairs where ``text`` is UTF-8 bytes.

    Returns:
        A ``(texts, labels)`` tuple: decoded ``str`` reviews and the labels
        exactly as yielded by ``tfds.as_numpy``.
    """
    texts = []
    labels = []
    for raw_text, label in tfds.as_numpy(dataset):
        texts.append(raw_text.decode('utf-8'))
        labels.append(label)
    return texts, labels


# NOTE(review): shuffle_files=True may change the order in which records are
# read between runs, so code that looks at "the first N" test samples is not
# guaranteed to see the same reviews every time -- confirm whether that matters.
(ds_train, ds_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

# Convert TFDS datasets to lists (same helper for both splits so they are
# guaranteed to be decoded identically).
train_texts, train_labels = _texts_and_labels(ds_train)
test_texts, test_labels = _texts_and_labels(ds_test)

# -------------------------------
# 2. Vectorize text (bag-of-words)
# -------------------------------
# Labels become NumPy arrays; this is independent of the feature extraction.
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Binary bag-of-words limited to 2000 vocabulary entries. The vocabulary is
# learned from the training reviews only and then reused for the test reviews.
vectorizer = CountVectorizer(max_features=2000, binary=True)
train_bow = vectorizer.fit_transform(train_texts)
test_bow = vectorizer.transform(test_texts)

# Ensure input is 0-1 for BernoulliRBM. The scaler needs dense input, and is
# fitted on the training matrix only so the test data does not leak into it.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_bow.toarray())
X_test = scaler.transform(test_bow.toarray())

# -------------------------------
# 3. Build DBN-like pipeline
# -------------------------------
# Both RBM layers share every hyperparameter except the number of hidden units.
rbm_settings = dict(learning_rate=0.01, n_iter=5, batch_size=10, random_state=42)

rbm1 = BernoulliRBM(n_components=512, **rbm_settings)  # first layer: 512 hidden units
rbm2 = BernoulliRBM(n_components=256, **rbm_settings)  # second layer: 256 hidden units
logistic = LogisticRegression(max_iter=500, solver='lbfgs')

# Stack the two RBMs as feature extractors in front of the classifier.
dbn = Pipeline(
    steps=[
        ('rbm1', rbm1),
        ('rbm2', rbm2),
        ('logistic', logistic),
    ]
)

# -------------------------------
# 4. Train DBN
# -------------------------------
print("Training DBN on IMDB (this may take a few minutes)...")
dbn.fit(X_train, y_train)

# -------------------------------
# 5. Predict a few samples
# -------------------------------
# Only a handful of test rows are scored; this is a smoke check, not an
# evaluation of the model.
n_preview = 5
preds = dbn.predict(X_test[:n_preview])
print(f"Predictions: {preds}")
print(f"True labels: {y_test[:n_preview]}")




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.203MC3_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.203MC3_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.203MC3_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
Training DBN on IMDB (this may take a few minutes)...
Predictions: [0 1 0 0 1]
True labels: [1 1 0 0 1]
