In [1]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK resources
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# ------------------------------------------
# 1. Load Dataset
# ------------------------------------------
# Load IMDb Dataset
data = pd.read_csv("../data/IMDB Dataset.csv")

# View sample data
data.head()

# Check for missing values
data.isnull().sum()

# ------------------------------------------
# 2. Data Preprocessing
# ------------------------------------------
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply text cleaning
data['cleaned_review'] = data['review'].apply(clean_text)

# Convert sentiment labels to binary
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# ------------------------------------------
# 3. Feature Extraction using TF-IDF
# ------------------------------------------
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = tfidf_vectorizer.fit_transform(data['cleaned_review']).toarray()

# Save TF-IDF model
with open('../preprocessing/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Target Variable
y = data['sentiment']

# Split Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------------------------------------
# 4. Model Training and Evaluation
# ------------------------------------------
def train_and_evaluate_model(model, model_name):
    # Train Model
    model.fit(X_train, y_train)
    # Predictions
    y_pred = model.predict(X_test)
    # Model Evaluation
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    # Save the model
    with open(f'../models/{model_name}.pkl', 'wb') as file:
        pickle.dump(model, file)

    return model

# 1. Logistic Regression
logistic_model = LogisticRegression(max_iter=200)
train_and_evaluate_model(logistic_model, "logistic_model")

# 2. Naive Bayes
naive_bayes_model = MultinomialNB()
train_and_evaluate_model(naive_bayes_model, "naive_bayes_model")

# 3. Support Vector Machine (SVM)
svm_model = SVC(kernel='linear')
train_and_evaluate_model(svm_model, "svm_model")

# ------------------------------------------
# 5. Model Comparison
# ------------------------------------------
models = ["Logistic Regression", "Naive Bayes", "SVM"]
accuracies = [accuracy_score(y_test, logistic_model.predict(X_test)),
              accuracy_score(y_test, naive_bayes_model.predict(X_test)),
              accuracy_score(y_test, svm_model.predict(X_test))]

# Plot Model Comparison
plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=accuracies, palette='viridis')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xlabel('Models')
plt.show()


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '../preprocessing/tfidf_vectorizer.pkl'

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.preprocessing.text_preprocessing import clean_text
from src.extraction.feature_extraction import extract_features
from src.model_training.model_training import train_and_save_model, evaluate_model
from src.utils.model_utils import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

# Load Dataset
data_path = "../data/IMDB Dataset.csv"
df = pd.read_csv(data_path)

# Check Data
df.head()

# Data Preprocessing
df['cleaned_review'] = df['review'].apply(clean_text)

# Split Data
X = df['cleaned_review']
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Extraction
X_train_tfidf, X_test_tfidf, vectorizer = extract_features(X_train, X_test)

# Model Paths
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model_path = os.path.join(models_dir, "logistic_regression.pkl")
train_and_save_model(logistic_model, X_train_tfidf, y_train, logistic_model_path)
logistic_model_loaded = load_model(logistic_model_path)
print("Logistic Regression Results:")
evaluate_model(logistic_model_loaded, X_test_tfidf, y_test)

# Naive Bayes
nb_model = MultinomialNB()
nb_model_path = os.path.join(models_dir, "naive_bayes.pkl")
train_and_save_model(nb_model, X_train_tfidf, y_train, nb_model_path)
nb_model_loaded = load_model(nb_model_path)
print("\nNaive Bayes Results:")
evaluate_model(nb_model_loaded, X_test_tfidf, y_test)

# Support Vector Machine (SVM)
svm_model = SVC(kernel='linear')
svm_model_path = os.path.join(models_dir, "svm_model.pkl")
train_and_save_model(svm_model, X_train_tfidf, y_train, svm_model_path)
svm_model_loaded = load_model(svm_model_path)
print("\nSupport Vector Machine Results:")
evaluate_model(svm_model_loaded, X_test_tfidf, y_test)

# Test on a sample review
def predict_review(review, model_path, vectorizer):
    cleaned_review = clean_text(review)
    vectorized_review = vectorizer.transform([cleaned_review])
    model = load_model(model_path)
    prediction = model.predict(vectorized_review)[0]
    return "Positive" if prediction == 1 else "Negative"

# Sample Prediction
sample_review = "The movie was absolutely fantastic and I loved every part of it!"
result = predict_review(sample_review, logistic_model_path, vectorizer)
print(f"\nSample Review Prediction (Logistic Regression): {result}")


ModuleNotFoundError: No module named 'src'

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.data_preprocessing import clean_text
from src.feature_extraction import extract_features
from src.model_training import train_and_save_model, evaluate_model
from src.utils import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

# Load Dataset
data_path = "../data/IMDB Dataset.csv"
df = pd.read_csv(data_path)

# Check Data
df.head()

# Data Preprocessing
df['cleaned_review'] = df['review'].apply(clean_text)

# Split Data
X = df['cleaned_review']
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Extraction
X_train_tfidf, X_test_tfidf, vectorizer = extract_features(X_train, X_test)

# Model Paths
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model_path = os.path.join(models_dir, "logistic_regression.pkl")
train_and_save_model(logistic_model, X_train_tfidf, y_train, logistic_model_path)
logistic_model_loaded = load_model(logistic_model_path)
print("Logistic Regression Results:")
evaluate_model(logistic_model_loaded, X_test_tfidf, y_test)

# Naive Bayes
nb_model = MultinomialNB()
nb_model_path = os.path.join(models_dir, "naive_bayes.pkl")
train_and_save_model(nb_model, X_train_tfidf, y_train, nb_model_path)
nb_model_loaded = load_model(nb_model_path)
print("\nNaive Bayes Results:")
evaluate_model(nb_model_loaded, X_test_tfidf, y_test)

# Support Vector Machine (SVM)
svm_model = SVC(kernel='linear')
svm_model_path = os.path.join(models_dir, "svm_model.pkl")
train_and_save_model(svm_model, X_train_tfidf, y_train, svm_model_path)
svm_model_loaded = load_model(svm_model_path)
print("\nSupport Vector Machine Results:")
evaluate_model(svm_model_loaded, X_test_tfidf, y_test)

# Test on a sample review
def predict_review(review, model_path, vectorizer):
    cleaned_review = clean_text(review)
    vectorized_review = vectorizer.transform([cleaned_review])
    model = load_model(model_path)
    prediction = model.predict(vectorized_review)[0]
    return "Positive" if prediction == 1 else "Negative"

# Sample Prediction
sample_review = "The movie was absolutely fantastic and I loved every part of it!"
result = predict_review(sample_review, logistic_model_path, vectorizer)
print(f"\nSample Review Prediction (Logistic Regression): {result}")


ModuleNotFoundError: No module named 'src'