In [1]:
# 1. Install necessary libraries
!pip install nltk transformers scikit-learn tensorflow pandas numpy

# 2. Download NLTK data (stopwords, wordnet)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

# 3. Create folders for organization
import os
os.makedirs("models", exist_ok=True)
os.makedirs("data", exist_ok=True)

print("Environment Setup Complete.")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Environment Setup Complete.


In [2]:
%%writefile requirements.txt
nltk
pandas
numpy
scikit-learn
tensorflow
transformers
# transformer_model.py imports torch and trains with transformers.Trainer
torch
accelerate
joblib

Writing requirements.txt


In [3]:
%%writefile text_preprocessing.py
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# Initialize the tools once at import time; clean_text reuses them on every call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Matches any single ASCII punctuation character.
# re.escape is required here: string.punctuation contains "\" directly before
# "]", so inside an unescaped [...] the backslash only escapes the bracket and
# a literal backslash in the input is never matched.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")

def clean_text(text):
    """
    Normalize a raw string into a space-separated string of cleaned tokens.

    Steps:
    1. Lowercase
    2. Replace every ASCII punctuation character with a space
    3. Split on whitespace
    4. Drop stopwords and tokens of 2 characters or fewer
    5. Lemmatize the remaining tokens (e.g. movies -> movie)

    Args:
        text: The raw input. Any value that is not a str yields "".

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if not isinstance(text, str):
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Remove punctuation (replace with space so "well-made" splits in two)
    text = _PUNCTUATION_RE.sub(" ", text)

    # 3. Tokenize (split on any whitespace)
    tokens = text.split()

    # 4. Remove stopwords / very short tokens, then lemmatize.
    # lemmatize() is called without a part-of-speech argument; in the sample
    # run below, verb forms such as "running" and "loved" come back unchanged.
    clean_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]

    return " ".join(clean_tokens)

if __name__ == "__main__":
    # Smoke test: show one sentence before and after cleaning.
    demo_sentence = "The movie was absolutely amazing! I loved the acting and the plot was running fast."
    demo_cleaned = clean_text(demo_sentence)
    print(
        f"Original: {demo_sentence}",
        f"Cleaned:  {demo_cleaned}",
        sep="\n",
    )

Writing text_preprocessing.py


In [4]:
!python text_preprocessing.py

Original: The movie was absolutely amazing! I loved the acting and the plot was running fast.
Cleaned:  movie absolutely amazing loved acting plot running fast


In [5]:
%%writefile sentiment_analyzer.py
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from text_preprocessing import clean_text
import tensorflow as tf
import os
from sklearn.datasets import load_files

def train_classic_sentiment():
    """
    Train and save a TF-IDF + Logistic Regression sentiment model on IMDB.

    Downloads and extracts the aclImdb archive through Keras, loads the
    'pos'/'neg' reviews from its train folder, cleans them with clean_text,
    holds out 20% of them for evaluation, fits a TF-IDF vectorizer (at most
    10,000 features) and a Logistic Regression classifier on the rest, prints
    the held-out accuracy, and writes both objects into models/.

    Returns:
        None. If the review folders cannot be found, an error is printed and
        the function returns early without training or saving anything.
    """
    print("--- 1. Loading FULL IMDB Dataset ---")

    # Step 1: Download and Extract (Keras handles this automatically)
    dataset_path = tf.keras.utils.get_file(
        fname="aclImdb_v1.tar.gz",
        origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True
    )

    # Step 2: Find the folder
    # Keras usually downloads to ~/.keras/datasets/
    # The extracted folder is named 'aclImdb'
    base_dir = os.path.dirname(dataset_path)
    data_dir = os.path.join(base_dir, 'aclImdb', 'train')

    # Check if the folder exists
    if not os.path.exists(data_dir):
        # Fallback: Sometimes Keras returns the directory path itself
        data_dir = os.path.join(dataset_path, 'aclImdb', 'train')

    print(f"Data directory located at: {data_dir}")

    # Step 3: Load Data
    print("Loading data files (This takes a moment)...")
    try:
        data = load_files(data_dir, categories=['pos', 'neg'], encoding='utf-8', decode_error='replace')
    except FileNotFoundError:
        print(f"ERROR: Could not find 'pos' and 'neg' folders inside {data_dir}")
        return

    df = pd.DataFrame({'text': data.data, 'label': data.target})
    print(f"Dataset Shape: {df.shape} (Full Training Data)")

    print("--- 2. Preprocessing (Cleaning 25,000 reviews...) ---")
    df['clean_text'] = df['text'].apply(clean_text)

    X = df['clean_text']
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("--- 3. Vectorization (TF-IDF) ---")
    # The vectorizer is fitted on the training split only, so the held-out
    # reviews do not influence the vocabulary or the IDF weights.
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    print("--- 4. Training Logistic Regression ---")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vec, y_train)

    # Evaluate
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    print(f"\nClassic Model Accuracy: {acc*100:.2f}%")

    # Make sure the output folder exists so the script also works when it is
    # run outside the notebook that created models/.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/classic_sentiment_model.pkl')
    joblib.dump(vectorizer, 'models/tfidf_vectorizer.pkl')
    print("Model saved to models/")

if __name__ == "__main__":
    train_classic_sentiment()

Writing sentiment_analyzer.py


In [6]:
!python sentiment_analyzer.py

2026-02-05 10:07:07.334361: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770286027.369109     905 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770286027.378967     905 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770286027.407180     905 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770286027.407222     905 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770286027.407231     905 computation_placer.cc:177] computation placer alr

In [7]:
# 1. Uninstall the conflicting libraries
!pip uninstall -y tensorflow transformers ml_dtypes jax jaxlib

# 2. Install PyTorch and Transformers (The stable combination)
!pip install torch transformers scikit-learn pandas accelerate

Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Found existing installation: transformers 5.0.0
Uninstalling transformers-5.0.0:
  Successfully uninstalled transformers-5.0.0
Found existing installation: ml_dtypes 0.5.4
Uninstalling ml_dtypes-0.5.4:
  Successfully uninstalled ml_dtypes-0.5.4
Found existing installation: jax 0.7.2
Uninstalling jax-0.7.2:
  Successfully uninstalled jax-0.7.2
Found existing installation: jaxlib 0.7.2
Uninstalling jaxlib-0.7.2:
  Successfully uninstalled jaxlib-0.7.2
Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Downloading transformers-5.0.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-5.0.0


In [8]:
%%writefile transformer_model.py
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_files
import os
import shutil
import numpy as np

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type != 'cuda':
    print("WARNING: You are training BERT on a CPU. This will be very slow (Hours).")

class IMDBDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-sample encodings with their labels as tensors."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> sequence indexable by sample position
        # labels: sequence indexable by sample position
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the sample field by field, converting each value to a tensor,
        # then attach the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def compute_metrics(eval_pred):
    """
    Compute accuracy from a (scores, labels) pair.

    The predicted class of each row is the index of its largest score along
    axis 1; accuracy is the fraction of rows whose prediction equals the label.

    Returns:
        dict with the single key "accuracy".
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    matches = predicted_classes == references
    return {"accuracy": matches.mean()}

def train_bert_model():
    """
    Fine-tune DistilBERT for binary sentiment classification on IMDB.

    Downloads and extracts the aclImdb archive into the working directory if
    aclImdb/train is missing, loads the 'pos'/'neg' reviews, holds out 20% of
    them for evaluation, tokenizes both splits (truncated to 128 tokens),
    trains for 3 epochs with the Hugging Face Trainer (evaluating and
    checkpointing every epoch), and saves the model and tokenizer to
    ./models/bert_sentiment.

    Raises:
        RuntimeError: if the download or the extraction command fails.
    """
    print("--- 1. Loading FULL Data (25,000 Reviews) ---")

    if not os.path.exists("aclImdb/train"):
        print("Downloading dataset...")
        url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        archive = "aclImdb_v1.tar.gz"
        if not os.path.exists(archive):
            if os.system(f"wget {url}") != 0:
                # Remove a partial download so the next run fetches it again
                # instead of trying to extract a truncated archive.
                if os.path.exists(archive):
                    os.remove(archive)
                raise RuntimeError(f"Failed to download {url}")
        if os.system("tar -xzf aclImdb_v1.tar.gz") != 0:
            raise RuntimeError(f"Failed to extract {archive}")

    data_dir = "aclImdb/train"
    data = load_files(data_dir, categories=['pos', 'neg'], encoding='utf-8', decode_error='replace')
    texts = data.data
    labels = data.target

    # All loaded reviews are used; 20% of them are held out for evaluation below.
    print(f"Training on {len(texts)} samples.")

    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

    print("--- 2. Tokenization ---")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=128)

    train_dataset = IMDBDataset(train_encodings, y_train)
    test_dataset = IMDBDataset(test_encodings, y_test)

    print("--- 3. Training with DistilBERT ---")
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    model.to(device)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        logging_dir='./logs',
        logging_steps=100,               # Log less frequently
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    print("--- 4. Saving Model ---")
    model.save_pretrained("./models/bert_sentiment")
    tokenizer.save_pretrained("./models/bert_sentiment")
    print("Model saved to ./models/bert_sentiment")

if __name__ == "__main__":
    train_bert_model()

Writing transformer_model.py


In [9]:
!python transformer_model.py

Using device: cuda
--- 1. Loading FULL Data (25,000 Reviews) ---
Downloading dataset...
--2026-02-05 10:10:41--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2026-02-05 10:10:43 (38.4 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]

Training on 25000 samples.
--- 2. Tokenization ---
tokenizer_config.json: 100% 48.0/48.0 [00:00<00:00, 185kB/s]
vocab.txt: 100% 232k/232k [00:00<00:00, 2.10MB/s]
tokenizer.json: 100% 466k/466k [00:00<00:00, 2.87MB/s]
--- 3. Training with DistilBERT ---
config.json: 100% 483/483 [00:00<00:00, 2.32MB/s]
model.safetensors: 100% 268M/268M [00:03<00:00, 71.3MB/s]
Loading weights: 100% 100/100 [00:00<00:00, 1412.95it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_no

In [10]:
%%writefile text_classifier.py
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import joblib

def train_topic_classifier():
    """
    Train and save a TF-IDF + Multinomial Naive Bayes topic classifier.

    Uses the 'train' and 'test' subsets of the 20 Newsgroups dataset,
    restricted to three categories, fits the pipeline on the train subset,
    prints accuracy on the test subset, and writes the fitted pipeline to
    models/topic_classifier.pkl.
    """
    # Imported locally because this script has no module-level `os` import.
    import os

    print("--- 1. Loading 20 Newsgroups Dataset ---")
    # We pick 3 specific categories to make it clear
    categories = ['sci.space', 'sci.med', 'comp.graphics']

    train_data = fetch_20newsgroups(subset='train', categories=categories)
    test_data = fetch_20newsgroups(subset='test', categories=categories)

    print(f"Categories: {train_data.target_names}")
    print(f"Training samples: {len(train_data.data)}")

    print("--- 2. Building Pipeline (TF-IDF + Naive Bayes) ---")
    # Naive Bayes is excellent for text classification
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())

    print("--- 3. Training ---")
    model.fit(train_data.data, train_data.target)

    print("--- 4. Evaluation ---")
    preds = model.predict(test_data.data)
    acc = accuracy_score(test_data.target, preds)
    print(f"Multi-Class Accuracy: {acc*100:.2f}%")

    # Save (create the folder first so the script also works when it is run
    # outside the notebook that created models/)
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/topic_classifier.pkl')
    print("Topic Classifier saved.")

if __name__ == "__main__":
    train_topic_classifier()

Writing text_classifier.py


In [11]:
!python text_classifier.py

--- 1. Loading 20 Newsgroups Dataset ---
Categories: ['comp.graphics', 'sci.med', 'sci.space']
Training samples: 1771
--- 2. Building Pipeline (TF-IDF + Naive Bayes) ---
--- 3. Training ---
--- 4. Evaluation ---
Multi-Class Accuracy: 94.06%
Topic Classifier saved.


In [12]:
%%writefile inference_api.py
import joblib
from text_preprocessing import clean_text
import sys

def predict_sentiment(text):
    """
    Classify the sentiment of `text` with the saved classic model.

    Loads the Logistic Regression model and TF-IDF vectorizer from models/,
    cleans the text with clean_text, and predicts the class.

    Args:
        text: The raw input string.

    Returns:
        "Sentiment: POSITIVE|NEGATIVE (NN.N%)" on success; a "Model not
        found..." message if either file is missing; otherwise a message
        describing the failure. This function does not raise for ordinary
        errors, it reports them in the returned string.
    """
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    try:
        model = joblib.load('models/classic_sentiment_model.pkl')
        vectorizer = joblib.load('models/tfidf_vectorizer.pkl')
    except FileNotFoundError:
        return "Model not found. Run sentiment_analyzer.py first."
    except Exception as e:
        return f"Could not load sentiment model: {e}"

    try:
        # Preprocess
        clean = clean_text(text)
        # Vectorize
        vec = vectorizer.transform([clean])
        # Predict: take the most probable class and read its label from
        # classes_, so the confidence lookup does not rely on the label value
        # doubling as a column index.
        prob = model.predict_proba(vec)[0]
        best = prob.argmax()
        pred = model.classes_[best]

        label = "POSITIVE" if pred == 1 else "NEGATIVE"
        confidence = prob[best]

        return f"Sentiment: {label} ({confidence*100:.1f}%)"
    except Exception as e:
        # Keep the best-effort contract (always return a string) but report
        # what actually went wrong instead of blaming a missing model.
        return f"Sentiment prediction failed: {e}"

def predict_topic(text):
    """
    Classify the topic of `text` with the saved topic pipeline.

    Args:
        text: The raw input string (passed to the pipeline as-is).

    Returns:
        "Topic: <name>" on success; "Topic Model not found." if the model
        file is missing; otherwise a message describing the failure.
    """
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    try:
        model = joblib.load('models/topic_classifier.pkl')
    except FileNotFoundError:
        return "Topic Model not found."
    except Exception as e:
        return f"Could not load topic model: {e}"

    try:
        categories = ['Graphics', 'Medicine', 'Space'] # The order matches target_names

        pred_idx = model.predict([text])[0]
        return f"Topic: {categories[pred_idx]}"
    except Exception as e:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still propagate.
        return f"Topic prediction failed: {e}"

if __name__ == "__main__":
    # Demo: run one sentence through both predictors, sentiment first.
    print("--- AI Text Analyst ---")
    demo_input = "The doctor said the surgery was successful."
    print(f"Input: {demo_input}")
    for predictor in (predict_sentiment, predict_topic):
        print(predictor(demo_input))

Writing inference_api.py


In [13]:
!python inference_api.py

--- AI Text Analyst ---
Input: The doctor said the surgery was successful.
Sentiment: NEGATIVE (74.7%)
Topic: Medicine


In [14]:
%%writefile README.md
# Day 5: NLP & Text Analysis

## Project Overview
This project applies Natural Language Processing (NLP) techniques to analyze text data. It includes a classic TF-IDF + Logistic Regression sentiment model, a fine-tuned DistilBERT sentiment model, and a multi-class topic classifier.

## Deliverables
1. **text_preprocessing.py**: Pipeline for cleaning and lemmatizing text.
2. **sentiment_analyzer.py**: Logistic Regression model achieving ~85% on IMDB data.
3. **transformer_model.py**: DistilBERT model for deep context understanding.
4. **text_classifier.py**: Classification system for News topics (Space, Med, Graphics).
5. **inference_api.py**: Simple script to test the models with new inputs.

## How to Run
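1. Install dependencies: `pip install -r requirements.txt`, then download the NLTK data used for preprocessing: `python -m nltk.downloader stopwords wordnet omw-1.4`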
2. Train Sentiment Model: `python sentiment_analyzer.py`
3. Train BERT Model: `python transformer_model.py`
4. Train Topic Classifier: `python text_classifier.py`
5. Test Predictions: `python inference_api.py`

Writing README.md
