# In [None]:
# Part 1: Save the model and vectorizer
# Add this to your existing script to save the trained model

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

# Create the output directory for the pickled artifacts. exist_ok=True makes
# re-running the script safe when the directory is already present.
os.makedirs('models', exist_ok=True)

# Download the NLTK resources used by the preprocessing step:
# - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize. Recent NLTK
#   releases load 'punkt_tab' instead of the pickled 'punkt' package, so both
#   are requested to keep the script working across NLTK versions.
# - 'stopwords': the English stop-word list.
# - 'wordnet' / 'omw-1.4': data for WordNetLemmatizer (some NLTK releases also
#   require 'omw-1.4' when WordNet is loaded).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load your dataset
# Swap this sample data for your real data; each entry pairs a review with
# its label (1 for positive, 0 for negative).
_SAMPLE_ROWS = (
    ("This movie was excellent! The acting was superb.", 1),
    ("Terrible film, I hated every minute of it.", 0),
    ("Great movie with amazing special effects and plot.", 1),
    ("Boring and predictable, waste of time.", 0),
    ("I loved the characters and the storyline was intriguing.", 1),
)
_SAMPLE_REPEATS = 20  # the five rows are repeated to get 100 samples

reviews = [review for review, _ in _SAMPLE_ROWS] * _SAMPLE_REPEATS
sentiments = [label for _, label in _SAMPLE_ROWS] * _SAMPLE_REPEATS

# Create DataFrame with one row per (text, sentiment) sample
df = pd.DataFrame(dict(text=reviews, sentiment=sentiments))

# Text Preprocessing function
# The stop-word set and the lemmatizer are built once, on first use, and then
# shared by every call instead of being recreated for each input text.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failure while loading leaves
        # the cache empty and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = stop_words
        _PREPROCESS_RESOURCES['lemmatizer'] = lemmatizer
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize a raw review string into a space-joined string of lemmas.

    Steps, in order: lowercase, strip ``<...>`` tags, drop every character
    that is not an ASCII letter or whitespace, tokenize with
    ``word_tokenize``, remove English stop words, lemmatize each remaining
    token, and join the tokens with single spaces.

    Args:
        text: The review text as a ``str``.

    Returns:
        The processed text as a single ``str`` (empty if no tokens remain).
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    # NOTE(review): tags and punctuation are replaced with '' (not ' '), so
    # words separated only by a tag or symbol are merged into one token.
    # Left as-is so the output matches the existing behaviour.
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, drop stop words, and lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    # Join tokens back into text
    return ' '.join(tokens)

# Apply preprocessing
df['processed_text'] = df['text'].apply(preprocess_text)

# Splitting data
# Split the processed text *before* fitting TF-IDF, so the vocabulary and IDF
# weights are learned from the training rows only. Fitting the vectorizer on
# the full corpus first would let test documents influence the features and
# make the reported accuracy optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], df['sentiment'], test_size=0.2, random_state=42
)

# Feature Engineering: TF-IDF (fit on the training text, reuse on the test text)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Kept so any later code that expects the full-corpus matrix still finds it;
# it is produced with the train-fitted vectorizer.
X_tfidf = tfidf_vectorizer.transform(df['processed_text'])

# Train the random forest with the chosen hyperparameters.
# These are example values: replace them with the best parameters found by
# your own grid search.
best_params = dict(n_estimators=100, max_depth=None)
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train, y_train)

# Score the held-out split and report plain accuracy.
# NOTE(review): with sample data built by repeating the same few reviews, the
# test rows also appear in the training rows, so this number says little about
# generalization; use real data for a meaningful estimate.
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

# Persist the fitted vectorizer and the trained model as pickle files.
# Both are needed at inference time: new text must be transformed by the
# same vectorizer before the model can score it.
_artifacts = (
    ('models/tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('models/sentiment_model.pkl', best_rf),
)
for _artifact_path, _artifact in _artifacts:
    with open(_artifact_path, 'wb') as _out_file:
        pickle.dump(_artifact, _out_file)

print("Model and vectorizer saved successfully!")