In [42]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [43]:
df = pd.read_csv("IMDB Dataset.csv")

In [44]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [45]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [47]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [48]:
df['sentiment'].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [49]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [51]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(words)

In [52]:
df["cleaned_review"] = df["review"].apply(clean_text)


In [53]:
from sklearn.model_selection import train_test_split

X = df["cleaned_review"]
y = df["sentiment"]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))

In [56]:
# Fit transform on training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform test data
X_test_tfidf = tfidf.transform(X_test)

In [57]:

# Logistic Regression model
model = LogisticRegression(max_iter=300)
model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,300


In [58]:
# Predictions
y_pred = model.predict(X_test_tfidf)

In [59]:
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9022

Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.89      0.90      5000
    positive       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [60]:
from datasets import Dataset
import pandas as pd

In [61]:
# Make sure your dataframe has "review" and "sentiment"
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df  = pd.DataFrame({"text": X_test, "label": y_test})

In [62]:
# Convert labels to numeric (positive=1, negative=0)
train_df["label"] = train_df["label"].map({"positive": 1, "negative": 0})
test_df["label"]  = test_df["label"].map({"positive": 1, "negative": 0})

In [63]:
# HuggingFace Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset  = Dataset.from_pandas(test_df)

train_dataset, test_dataset

(Dataset({
     features: ['text', 'label', '__index_level_0__'],
     num_rows: 40000
 }),
 Dataset({
     features: ['text', 'label', '__index_level_0__'],
     num_rows: 10000
 }))

In [64]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)


In [65]:
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [66]:
train_dataset = train_dataset.remove_columns(["text"])
test_dataset  = test_dataset.remove_columns(["text"])

In [67]:
train_dataset.set_format("torch")
test_dataset.set_format("torch")

In [71]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="distilbert_sentiment",
    eval_strategy="epoch",        # NEW version naming
    save_strategy="epoch",
    logging_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=100,
    fp16 = True
)


In [73]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2787,0.253547
2,0.1848,0.266625


TrainOutput(global_step=5000, training_loss=0.2474961051940918, metrics={'train_runtime': 943.1934, 'train_samples_per_second': 84.818, 'train_steps_per_second': 5.301, 'total_flos': 5298695946240000.0, 'train_loss': 0.2474961051940918, 'epoch': 2.0})

In [74]:
results = trainer.evaluate()
results


{'eval_loss': 0.26662513613700867,
 'eval_runtime': 26.6274,
 'eval_samples_per_second': 375.553,
 'eval_steps_per_second': 23.472,
 'epoch': 2.0}

In [75]:
trainer.save_model("distilbert_imdb_model")
tokenizer.save_pretrained("distilbert_imdb_model")


('distilbert_imdb_model\\tokenizer_config.json',
 'distilbert_imdb_model\\special_tokens_map.json',
 'distilbert_imdb_model\\vocab.txt',
 'distilbert_imdb_model\\added_tokens.json',
 'distilbert_imdb_model\\tokenizer.json')

In [78]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch




In [79]:
# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert_imdb_model")
model = AutoModelForSequenceClassification.from_pretrained("distilbert_imdb_model")

# GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [80]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256).to(device)
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    label_map = {0: "Negative", 1: "Positive"}
    return label_map[predicted_class]

In [83]:
predict_sentiment("I really did hate this movie!")


'Negative'

In [84]:
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model
tokenizer = AutoTokenizer.from_pretrained("distilbert_imdb_model")
model = AutoModelForSequenceClassification.from_pretrained("distilbert_imdb_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256).to(device)
    outputs = model(**inputs)
    logits = outputs.logits
    return torch.argmax(logits, dim=1).item()

st.title("ðŸŽ¬ IMDb Sentiment Analysis with DistilBERT")

user_input = st.text_area("Enter a movie review:")

if st.button("Analyze"):
    pred = predict(user_input)
    sentiment = "Positive ðŸ˜Š" if pred == 1 else "Negative ðŸ˜ "
    st.subheader(f"Sentiment: {sentiment}")


2025-12-01 16:50:59.382 
  command:

    streamlit run C:\Users\rohit\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-12-01 16:50:59.386 Session state does not function when running a script without `streamlit run`
