In [11]:
import os
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Stop-word corpus reader; the data it reads is fetched by the download calls below.
from nltk.corpus import stopwords

# Fetch the two NLTK resources this notebook reads: the review corpus and
# the stop-word lists.
nltk.download('movie_reviews')
nltk.download('stopwords')

# English stop words, handed to the vectorizer further down so they are dropped.
stop_words = stopwords.words('english')

# One (token_list, label) pair per review file, gathered category by category.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

# X: every review re-assembled into a single space-separated string.
# y: the category label of the review at the same position in X.
X = [' '.join(tokens) for tokens, _ in documents]
y = [label for _, label in documents]

# Hold out 20% of the reviews for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features with stop words removed. The vocabulary is fitted on the
# training reviews only and then re-used, unchanged, for the test reviews.
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    max_df=0.7,
    min_df=10,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Data has been preprocessed with TF-IDF.")



[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/divyadhole/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divyadhole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data has been preprocessed with TF-IDF.
Logistic Regression Accuracy: 83.75%
              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       199
         pos       0.84      0.84      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

SVM Accuracy: 82.25%
              precision    recall  f1-score   support

         neg       0.82      0.82      0.82       199
         pos       0.82      0.83      0.82       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [12]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training are chained.
logreg_classifier = LogisticRegression(max_iter=1000).fit(
    X_train_tfidf,
    y_train,
)

# Label every held-out review with the trained model.
y_pred_logreg = logreg_classifier.predict(X_test_tfidf)

# Report overall accuracy, followed by the per-class precision/recall/F1 table.
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f'Logistic Regression Accuracy: {accuracy_logreg * 100:.2f}%')
print(classification_report(y_test, y_pred_logreg))

Logistic Regression Accuracy: 83.75%
              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       199
         pos       0.84      0.84      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



In [13]:


# Fit a support-vector classifier with a linear kernel on the TF-IDF
# training features. fit() returns the estimator itself, so construction and
# training are chained.
svm_classifier = SVC(kernel='linear').fit(
    X_train_tfidf,
    y_train,
)

# Label every held-out review with the trained model.
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Report overall accuracy, followed by the per-class precision/recall/F1 table.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'SVM Accuracy: {accuracy_svm * 100:.2f}%')
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 82.25%
              precision    recall  f1-score   support

         neg       0.82      0.82      0.82       199
         pos       0.82      0.83      0.82       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400

