In [None]:
import os
import kagglehub
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fetch the IMDB 50k movie-review dataset through KaggleHub and read it
# into a DataFrame. The handle and CSV name are kept as named constants so
# they are easy to spot and adjust.
DATASET_HANDLE = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
CSV_FILENAME = "IMDB Dataset.csv"  # Adjust filename if needed

# dataset_download returns the local directory holding the dataset files
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

# Read the reviews CSV from the downloaded directory
dataset_path = os.path.join(path, CSV_FILENAME)
df = pd.read_csv(dataset_path)

# Quick look at the first rows to confirm the expected columns are present
print(df.head())

# Download only the NLTK resources this notebook actually uses.
# (Fetching the 'all' collection pulls every corpus and model NLTK ships,
# which is slow, large, and unnecessary here.)
#   punkt / punkt_tab - sentence/word tokenizer data used by word_tokenize;
#                       newer NLTK releases look for 'punkt_tab', older ones
#                       for 'punkt', so both are requested.
#   stopwords         - English stop-word list.
#   wordnet           - lexical database used by WordNetLemmatizer.
#   omw-1.4           - NOTE(review): some NLTK versions need this alongside
#                       wordnet; harmless if unused - confirm for your version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Preprocessing function
# Patterns are compiled once; preprocess_text is applied to every review.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')   # e.g. "<br />" (text is lowercased first)
_URL_RE = re.compile(r'http\S+|www\S+')
_NON_WORD_RE = re.compile(r'\W+')

# Lazily-filled cache so the stop-word set and lemmatizer are built a single
# time instead of once per review.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip HTML tags, strip URLs, replace runs of non-word
    characters with a space, tokenize, drop English stop words, lemmatize.

    Args:
        text: The raw review. Any non-string value (e.g. NaN, None) is
            treated as missing.

    Returns:
        The cleaned text, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Remove markup before anything else: otherwise "<br />" survives the
    # punctuation pass as a bogus "br" token, and a URL directly followed by
    # a tag would swallow part of the tag.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub('', text)          # Remove URLs
    text = _NON_WORD_RE.sub(' ', text)    # Remove punctuation

    stop_words, lemmatizer = _get_text_resources()
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return " ".join(words)

# Apply preprocessing
df['processed_text'] = df['review'].apply(preprocess_text)

# Convert labels ('positive' -> 1, 'negative' -> 0)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
# .map() yields NaN for any value outside the mapping; fail early with a
# clear message instead of a cryptic error at fit time.
if df['label'].isna().any():
    raise ValueError("Unexpected values in 'sentiment'; expected only 'positive' or 'negative'.")
y = df['label']

# Split the *text* first so the vectorizer never sees the test reviews.
# (Same random_state and sample count as before, so the partition of rows
# is unchanged.)
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Convert text into TF-IDF features.
# The vocabulary and IDF weights are learned from the training split only;
# fitting on the full dataset would leak test-set statistics into training
# and inflate the reported accuracy.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)  # Use bigrams, limit features
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train Random Forest with fixed, hand-picked hyperparameters
# (no search is performed here). n_jobs=-1 builds trees on all CPU cores;
# random_state keeps the result reproducible.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Evaluate Model on the held-out reviews
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.4f}")


Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 56.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downlo

Optimized Model Accuracy: 0.8550


In [None]:
import joblib

# Persist both fitted artifacts to disk: the classifier and the TF-IDF
# vectorizer it was trained with (both are needed to score new text later).
for artifact, filename in (
    (model, "random_forest_imdb.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


In [None]:
# Load the model and vectorizer saved by this notebook.
# NOTE: joblib files are pickle-based; only load files you created or trust.
loaded_model = joblib.load("random_forest_imdb.pkl")
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Example: Predict sentiment of a new review
new_review = ["This movie was an absolute masterpiece with brilliant storytelling."]

# The vectorizer was fit on the output of preprocess_text, so new text must
# go through the same cleaning (stop-word removal, lemmatization, ...) or
# its features will not line up with what the model was trained on.
new_review_processed = [preprocess_text(review) for review in new_review]
new_review_transformed = loaded_vectorizer.transform(new_review_processed)

# Predict sentiment (1 -> positive, 0 -> negative, matching the training labels)
predicted_sentiment = loaded_model.predict(new_review_transformed)
print("Predicted Sentiment:", "Positive" if predicted_sentiment[0] == 1 else "Negative")

Predicted Sentiment: Positive
