# Install necessary dependency

In [7]:
%pip install pandas numpy scikit-learn nltk datasets --quiet

# Data Preparation

In [11]:
from datasets import load_dataset

dataset = load_dataset("imdb")

df = dataset["train"].to_pandas()

df = df.rename(columns={"text": "review", "label": "sentiment"})

print(df.head())

                                              review  sentiment
0  I rented I AM CURIOUS-YELLOW from my video sto...          0
1  "I Am Curious: Yellow" is a risible and preten...          0
2  If only to avoid making this type of film in t...          0
3  This film was probably inspired by Godard's Ma...          0
4  Oh, brother...after hearing about this ridicul...          0


## Split data

In [16]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)

# Text Preprocessing

In [20]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

train_texts = train_texts.apply(preprocess_text)
test_texts = test_texts.apply(preprocess_text)

print(train_texts[:5])

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
23311    borrowed movie despite extremely low rating wa...
23623    unexpected accident killed inexperienced climb...
1020     summer blockbuster hit baseketball one movies ...
12645    scarcely imagine better movie thishey go chick...
1533     still famous decadent actor morgan freeman fil...
Name: review, dtype: object


# Convert Text to Vectors

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

print(X_train.shape)

(20000, 5000)


# Training Model

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, train_labels)

y_pred = model.predict(X_test)

accuracy = accuracy_score(test_labels, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.88


# Testing

In [26]:
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)[0]
    return "Positive" if prediction == 1 else "Negative"

In [25]:
import ipywidgets as widgets

output = widgets.Output()

text_input = widgets.Text(
    description='Input Text:', 
    placeholder='Type something here'
)

submit_btn = widgets.Button(
    description='Submit',
    button_style='info'
)

def process_input(_):
    text = text_input.value

    with output:
        output.clear_output()
        print(predict_sentiment(text))

submit_btn.on_click(process_input)

layout = widgets.VBox([
    text_input,
    submit_btn,
    output
])

layout

VBox(children=(Text(value='', description='Input Text:', placeholder='Type something here'), Button(button_sty…