##Loading the dataset

In [1]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "IMDB Dataset.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
First 5 records:                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [2]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
sample_text= df['review'][0]
print(sample_text)

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

##preprocessing

In [4]:
df = df.head(100)

# =========================================================
# Preprocessing
# =========================================================
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# =========================================================
# Download NLTK Resources
# =========================================================
nltk.download('punkt')
nltk.download('stopwords')

# =========================================================
# Define text cleaning + tokenization function
# =========================================================
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Clean text
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.strip().lower()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and non-alphabetic tokens
    tokens = [word for word in tokens if word not in stop_words and word.isalpha()]

    # Join back to a clean string
    return ' '.join(tokens)

# =========================================================
# Apply preprocessing to all reviews
# =========================================================
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Show example
print("Original text:\n", df['review'][0])
print("\nCleaned and tokenized text:\n", df['cleaned_review'][0])


Original text:
 One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show i

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# =========================================================
# Feature Extraction
# =========================================================
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_review'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (100, 4834)


In [6]:
# =========================================================
# Splitting Data
# =========================================================
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42
)

print("Shape of training data (X_train):", len(X_train))
print("Shape of testing data (X_test):", len(X_test))
print("Shape of training labels (y_train):", y_train.shape)
print("Shape of testing labels (y_test):", y_test.shape)


Shape of training data (X_train): 80
Shape of testing data (X_test): 20
Shape of training labels (y_train): (80,)
Shape of testing labels (y_test): (20,)


## step 2

In [7]:
# =========================================================
# Random Search Optimization (Faster Alternative)
# =========================================================
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from scipy.stats import uniform
import numpy as np

# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svc', SVC())
])

# Define random search space
param_distributions = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svc__C': uniform(1, 5),
    'svc__kernel': ['linear']
}

# Random search (n_iter=2 for speed; increase to 20–50 later)
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=2,
    scoring='accuracy',
    cv=3,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

print("Running RandomizedSearchCV...")
random_search.fit(X_train, y_train)


Running RandomizedSearchCV...
Fitting 3 folds for each of 2 candidates, totalling 6 fits


## Step-3

In [8]:
import random

# Define a discrete search space for SVM + TF-IDF
search_space = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'ngram_range': [(1, 1), (1, 2)]
}

# Function to generate a random combination
def random_hyperparams():
    return {
        'C': random.choice(search_space['C']),
        'kernel': random.choice(search_space['kernel']),
        'ngram_range': random.choice(search_space['ngram_range'])
    }

# Example
initial_params = random_hyperparams()
print("Initial random parameters:", initial_params)


Initial random parameters: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)}


In [9]:
## helper functions
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

def evaluate_model(params, X_train, y_train, X_val, y_val):
    """Train model with given params and return validation accuracy."""
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=params['ngram_range'])),
        ('svc', SVC(C=params['C'], kernel=params['kernel']))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    return accuracy_score(y_val, y_pred)


In [10]:
## hill climbing
# Split training data into training + validation sets
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize with a random configuration
current_params = random_hyperparams()
current_score = evaluate_model(current_params, X_subtrain, y_subtrain, X_val, y_val)

print("Initial parameters:", current_params)
print("Initial validation accuracy:", current_score)

# Hill climbing loop
improved = True
iteration = 0

while improved:
    improved = False
    neighbors = []

    # Generate neighbors by changing one hyperparameter at a time
    for key in search_space:
        for value in search_space[key]:
            if value != current_params[key]:
                neighbor = current_params.copy()
                neighbor[key] = value
                neighbors.append(neighbor)

    # Evaluate neighbors
    best_neighbor = None
    best_score = current_score

    for neighbor in neighbors:
        score = evaluate_model(neighbor, X_subtrain, y_subtrain, X_val, y_val)
        print(f"Neighbor {neighbor} --> Accuracy: {score:.4f}")
        if score > best_score:
            best_score = score
            best_neighbor = neighbor

    # Move to the neighbor if it improves performance
    if best_neighbor:
        current_params = best_neighbor
        current_score = best_score
        improved = True
        print(f"\nIteration {iteration}: Improved --> {current_params} | Score: {current_score:.4f}\n")
    else:
        print("\nNo improvement found. Stopping search.\n")

    iteration += 1

print("Final best parameters:", current_params)
print("Final validation accuracy:", current_score)


Initial parameters: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)}
Initial validation accuracy: 0.8125
Neighbor {'C': 0.1, 'kernel': 'linear', 'ngram_range': (1, 1)} --> Accuracy: 0.8125
Neighbor {'C': 1, 'kernel': 'linear', 'ngram_range': (1, 1)} --> Accuracy: 0.8125
Neighbor {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 2)} --> Accuracy: 0.8125

No improvement found. Stopping search.

Final best parameters: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)}
Final validation accuracy: 0.8125


In [11]:
## Eval
# Train final model using best found parameters
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=current_params['ngram_range'])),
    ('svc', SVC(C=current_params['C'], kernel=current_params['kernel']))
])

final_pipeline.fit(X_train, y_train)
y_pred_test = final_pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test set accuracy with Hill Climbing optimized model:", test_accuracy)


Test set accuracy with Hill Climbing optimized model: 0.45


# step-5

In [12]:
# =========================================================
# Step 5: Improved Hill Climbing Optimization
# =========================================================
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a discrete hyperparameter search space
search_space = {
    'C': [0.1, 1, 5, 10],
    'kernel': ['linear'],
    'ngram_range': [(1, 1), (1, 2)]
}

# Generate a random hyperparameter combination
def random_hyperparams():
    return {
        'C': random.choice(search_space['C']),
        'kernel': random.choice(search_space['kernel']),
        'ngram_range': random.choice(search_space['ngram_range'])
    }

# Model evaluation function
def evaluate_model(params, X_train, y_train, X_val, y_val):
    """Train model with given params and return validation accuracy."""
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=params['ngram_range'], max_features=5000, stop_words='english')),
        ('svc', SVC(C=params['C'], kernel=params['kernel']))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    return accuracy_score(y_val, y_pred)

# =========================================================
# Train/Validation split
# =========================================================
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# =========================================================
# Improved Hill Climbing (First-Ascent + Random Restarts)
# =========================================================
num_restarts = 5              # how many random restarts
best_overall_params = None
best_overall_score = 0.0

for restart in range(num_restarts):
    print(f"\n========== Restart {restart+1}/{num_restarts} ==========")

    current_params = random_hyperparams()
    current_score = evaluate_model(current_params, X_train_sub, y_train_sub, X_val, y_val)
    print(f"Initial params: {current_params} | Score: {current_score:.4f}")

    improved = True
    iteration = 0

    while improved:
        improved = False

        # Generate all neighbors
        for key in search_space:
            for value in search_space[key]:
                if value != current_params[key]:
                    neighbor = current_params.copy()
                    neighbor[key] = value

                    score = evaluate_model(neighbor, X_train_sub, y_train_sub, X_val, y_val)
                    print(f"Neighbor {neighbor} --> {score:.4f}")

                    # ✅ First-Ascent: Accept the *first* better neighbor
                    if score > current_score:
                        current_params = neighbor
                        current_score = score
                        improved = True
                        print(f"--> Moved to better neighbor: {current_params} | Score: {current_score:.4f}")
                        break  # first-ascent → stop after first improvement
            if improved:
                break  # break outer loop too

        iteration += 1

    # Track the best solution across all restarts
    if current_score > best_overall_score:
        best_overall_score = current_score
        best_overall_params = current_params

    print(f"Best from this restart: {current_params} | Score: {current_score:.4f}")

# =========================================================
# Final best parameters
# =========================================================
print("\n=========== Overall Best Result ===========")
print("Best Parameters:", best_overall_params)
print("Best Validation Accuracy:", best_overall_score)

# =========================================================
# Final evaluation on test data
# =========================================================
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=best_overall_params['ngram_range'], max_features=5000, stop_words='english')),
    ('svc', SVC(C=best_overall_params['C'], kernel=best_overall_params['kernel']))
])

final_pipeline.fit(X_train, y_train)
y_pred_test = final_pipeline.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)
print("\nTest Accuracy with Improved Hill Climbing:", test_acc)



Initial params: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)} | Score: 0.8125
Neighbor {'C': 0.1, 'kernel': 'linear', 'ngram_range': (1, 1)} --> 0.8125
Neighbor {'C': 1, 'kernel': 'linear', 'ngram_range': (1, 1)} --> 0.8125
Neighbor {'C': 5, 'kernel': 'linear', 'ngram_range': (1, 1)} --> 0.8125
Neighbor {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 2)} --> 0.8125
Best from this restart: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)} | Score: 0.8125

Initial params: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)} | Score: 0.8125
Neighbor {'C': 0.1, 'kernel': 'linear', 'ngram_range': (1, 1)} --> 0.8125
Neighbor {'C': 1, 'kernel': 'linear', 'ngram_range': (1, 1)} --> 0.8125
Neighbor {'C': 5, 'kernel': 'linear', 'ngram_range': (1, 1)} --> 0.8125
Neighbor {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 2)} --> 0.8125
Best from this restart: {'C': 10, 'kernel': 'linear', 'ngram_range': (1, 1)} | Score: 0.8125

Initial params: {'C': 1, 'kernel': 'linear', 'ngram_range':