## Text Classification Using MNB, Logistic Regression, SVM, and KNN on IMDB Reviews Dataset

In [9]:
# Text classification is the process of categorizing text into organized groups.
# For example, spam filtering, sentiment analysis, and topic labeling are common use cases of text classification.

### Step 1: Importing Required Libraries

In [39]:
# Data manipulation
import pandas as pd
import string

# Text processing
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine learning models
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Evaluation metrics
from sklearn.metrics import classification_report, accuracy_score

# Download necessary NLTK resources
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Step 2:  Load and Preprocess Data

In [63]:
# 1. Load IMDB movie reviews dataset
documents = []
labels = []

for category in movie_reviews.categories():  # "pos" and "neg"
    for fileid in movie_reviews.fileids(category):  # Get review file IDs
        text = movie_reviews.raw(fileid)  # Extract review text
        documents.append(text)
        labels.append('Positive' if category == 'pos' else 'Negative')  # Label as 'pos' or 'neg'

# Create a DataFrame
df = pd.DataFrame({"text": documents, "label": labels})

# Display the first few rows
print(df.head(5))

print(df[df["label"] == "Positive"].head(5))  # Display first 5 positive reviews


                                                text     label
0  plot : two teen couples go to a church party ,...  Negative
1  the happy bastard's quick movie review \ndamn ...  Negative
2  it is movies like these that make a jaded movi...  Negative
3   " quest for camelot " is warner bros . ' firs...  Negative
4  synopsis : a mentally unstable man undergoing ...  Negative
                                                   text     label
1000  films adapted from comic books have had plenty...  Positive
1001  every now and then a movie comes along from a ...  Positive
1002  you've got mail works alot better than it dese...  Positive
1003   " jaws " is a rare film that grabs your atten...  Positive
1004  moviemaking is a lot like being the general ma...  Positive


In [64]:
# 2. Preprocessing Text Data

# Convert text to lowercase
df["text"] = df["text"].str.lower()

# Tokenization (Splitting text into words)
df["text"] = df["text"].apply(lambda x: word_tokenize(x))

# Remove punctuation
df["text"] = df["text"].apply(lambda x: [word for word in x if word not in string.punctuation])

# Remove stopwords
stop_words = set(stopwords.words('english'))  # Get stop words
df["text"] = df["text"].apply(lambda x: [word for word in x if word not in stop_words])

# Convert tokens back into a sentence
df["text"] = df["text"].apply(lambda x: " ".join(x))

# Display the first few rows
print(df.head(5))
print(df[df["label"] == "Positive"].head(5))  # Display first 5 positive reviews

                                                text     label
0  plot two teen couples go church party drink dr...  Negative
1  happy bastard 's quick movie review damn y2k b...  Negative
2  movies like make jaded movie viewer thankful i...  Negative
3  `` quest camelot `` warner bros first feature-...  Negative
4  synopsis mentally unstable man undergoing psyc...  Negative
                                                   text     label
1000  films adapted comic books plenty success wheth...  Positive
1001  every movie comes along suspect studio every i...  Positive
1002  've got mail works alot better deserves order ...  Positive
1003  `` jaws `` rare film grabs attention shows sin...  Positive
1004  moviemaking lot like general manager nfl team ...  Positive


### Step 3: Convert Text Data into Numerical Features

In [70]:
# Convert text into numerical data using TF-IDF (Term Frequency - Inverse Document Frequency) 

tfidf = TfidfVectorizer(max_features = 5000) # maximum feature size (e.g., 5000 features) for efficient processing.
X = tfidf.fit_transform(df['text'])  # Transform text into numerical form
y = df['label']  # Target labels

### Step 4: Split Data into Training and Testing Sets

In [73]:
# 70% training, 30% testing to evaluate model performance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 50)

### Step 5: Train and Evaluate Models

In [74]:
# Define models to train
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter = 1000),
    "SVM": SVC(kernel='linear'),
    "KNN": KNeighborsClassifier(n_neighbors = 5)
}

In [75]:
# Train and evaluate models
accuracies = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict labels for test set
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    accuracies[name] = accuracy  # Store accuracy

# Print accuracy and classification report
    print(f"{name} Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)  # Separator for readability

Training Naive Bayes...
Naive Bayes Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

    Negative       0.78      0.79      0.78       308
    Positive       0.77      0.76      0.77       292

    accuracy                           0.78       600
   macro avg       0.78      0.78      0.78       600
weighted avg       0.78      0.78      0.78       600

--------------------------------------------------
Training Logistic Regression...
Logistic Regression Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

    Negative       0.83      0.78      0.80       308
    Positive       0.78      0.84      0.81       292

    accuracy                           0.81       600
   macro avg       0.81      0.81      0.80       600
weighted avg       0.81      0.81      0.80       600

--------------------------------------------------
Training SVM...
SVM Accuracy: 0.82
Classification Report:
              precisio

### Step 6: Compare Results

In [76]:
# Step 6: Compare Results
# Print final comparison of model accuracies
print("\nModel Comparison:")
for name, acc in accuracies.items():
    print(f"{name}: {acc:.2f}")


Model Comparison:
Naive Bayes: 0.78
Logistic Regression: 0.81
SVM: 0.82
KNN: 0.64


Based on the observed results, the best-performing model is determined by the highest accuracy. Multinomial Naive Bayes (MNB) performed well due to its efficiency in text classification, especially when features are independent. Logistic Regression (LR) was also effective, handling large datasets well, but it struggles with non-linearity. Support Vector Machine (SVM) excelled in high-dimensional spaces but is computationally expensive. K-Nearest Neighbors (KNN) was the least efficient due to its high computational cost and poor scalability for large datasets. Given the accuracy scores, the best-performing model is the most suitable for sentiment analysis in this scenario, providing the best balance between efficiency and accuracy.