In [5]:
import functools

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Fetch the NLTK resources requested by this script: the 'punkt' package
# and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Read the IMDB review CSV into a DataFrame.
DATASET_PATH = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)

# Encode the sentiment column as a binary target: 'positive' -> 1, 'negative' -> 0.
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(SENTIMENT_TO_LABEL)

@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, built only once.

    ``stopwords.words('english')`` is invoked on the first call only; later
    calls reuse the cached set, so applying ``remove_stopwords`` to every row
    of a DataFrame does not rebuild the set per row.
    """
    return frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with NLTK's ``word_tokenize``. Tokens whose lower-cased
    form appears in NLTK's English stop-word list are dropped; the remaining
    tokens keep their original casing and are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    return ' '.join(
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    )

# Replace each review with its stop-word-filtered version.
print("Removing StopWords")
df['review'] = df['review'].apply(remove_stopwords)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
print("Removed Splitting Dataset")
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Two-step model: TF-IDF features (with the vectorizer's own English
# stop-word filter) feeding a 100-tree random forest with a fixed seed.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Fit the vectorizer and the classifier on the training reviews.
print("Training started...")
pipeline.fit(X=X_train, y=y_train)
print("Training completed!")

# Score the held-out reviews with the fitted pipeline.
print("Predicting on the test set...")
predictions = pipeline.predict(X=X_test)
print("Prediction completed!")

# Report the fraction of correctly classified test reviews as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Removing StopWords
Removed Splitting Dataset
Training started...
Training completed!
Predicting on the test set...
Prediction completed!
Accuracy: 85.92%
