In [2]:
import os
import pandas as pd

# Step 1: Load the dataset
def load_data(neg_folder='neg', pos_folder='pos'):
    """Load review texts from two folders into a labelled DataFrame.

    Each regular file directly inside ``neg_folder`` is read as one negative
    review (label 0) and each regular file directly inside ``pos_folder`` as
    one positive review (label 1).

    Files are read in sorted filename order. ``os.listdir`` makes no ordering
    guarantee, so without sorting the row order (and therefore any seeded
    train/test split computed from the result) could differ between runs,
    filesystems, or platforms.

    Parameters
    ----------
    neg_folder : str
        Path of the folder holding negative reviews, one review per file.
    pos_folder : str
        Path of the folder holding positive reviews, one review per file.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``review`` (file contents) and ``label`` (0 for
        negative, 1 for positive). Negative rows come first.

    Raises
    ------
    FileNotFoundError
        If either folder does not exist.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """

    def _read_reviews(folder):
        # Return the UTF-8 contents of every regular file in `folder`,
        # in sorted filename order.
        reviews = []
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)
            # Sub-directories (e.g. Jupyter's .ipynb_checkpoints) are not
            # reviews, and open() would raise on them.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf-8') as file:
                reviews.append(file.read())
        return reviews

    neg_reviews = _read_reviews(neg_folder)
    pos_reviews = _read_reviews(pos_folder)

    # Labels line up positionally with the concatenated review list.
    return pd.DataFrame({
        'review': neg_reviews + pos_reviews,
        'label': [0] * len(neg_reviews) + [1] * len(pos_reviews),  # 0 for neg, 1 for pos
    })

# Build the labelled review table from the default 'neg' and 'pos' folders
dataset = load_data(neg_folder='neg', pos_folder='pos')
print(dataset.head(n=5))  # Preview the first five rows

                                              review  label
0  plot : two teen couples go to a church party ,...      0
1  the happy bastard's quick movie review \ndamn ...      0
2  it is movies like these that make a jaded movi...      0
3   " quest for camelot " is warner bros . ' firs...      0
4  synopsis : a mentally unstable man undergoing ...      0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

# Step 2: Preprocess and train the model

# Features are the raw review texts; targets are the 0/1 sentiment labels.
X, y = dataset['review'], dataset['label']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training texts only and turn them into
# a bag-of-words count matrix.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

# Fit a multinomial Naive Bayes classifier on the count matrix
# (fit() returns the estimator itself).
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Persist both artifacts: the fitted vectorizer is needed to transform
# new text before the saved model can score it.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']