In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Sentiment-classification pipeline: load a CSV of labelled texts, encode the
# labels, build TF-IDF features, fit a logistic-regression classifier and
# report its performance on a held-out 20% split.

# Step 1: Load the dataset
# Replace with your file path
data = pd.read_csv('C:\\Users\\Dell\\OneDrive\\Desktop\\CODETECH\\sentiment\\movie\\dataset.csv')

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Step 2: Check for missing values
print("\nChecking for missing values...")
print(data.isnull().sum())

# The rest of the script expects 'text' and 'sentiment' columns.
# Replace these column names with the actual ones if different.
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a malformed dataset through.
if 'text' not in data.columns or 'sentiment' not in data.columns:
    raise ValueError("Dataset must have 'text' and 'sentiment' columns")

# Step 3: Preprocess the data
# Remove rows where either the text or its label is missing
data.dropna(subset=['text', 'sentiment'], inplace=True)

# Map sentiment labels to binary values (positive -> 1, negative -> 0).
# Series.map yields NaN for any label missing from the mapping; since the
# missing labels were dropped above, a NaN here means an unexpected label.
# Fail early with the offending values instead of passing NaN targets on.
label_to_int = {'pos': 1, 'neg': 0}
encoded_sentiment = data['sentiment'].map(label_to_int)
unmapped_rows = encoded_sentiment.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(data.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(
        "Unexpected sentiment labels (expected 'pos' or 'neg'): {}".format(unexpected_labels)
    )
data['sentiment'] = encoded_sentiment

# Step 4: Split the dataset
X = data['text']  # Raw review text
y = data['sentiment']  # Binary label: 1 = positive, 0 = negative
# Fixed random_state keeps the 80/20 split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Vectorize the text data using TF-IDF
# Fit on the training texts only, then apply the same fitted vectorizer to the
# test texts, so the test split does not influence the learned features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 7: Evaluate the model on the held-out test split
y_pred = model.predict(X_test_tfidf)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy:", accuracy_score(y_test, y_pred))

Dataset Preview:
                                                text sentiment
0  My daughter liked it but I was aghast, that a ...       neg
1  I... No words. No words can describe this. I w...       neg
2  this film is basically a poor take on the old ...       neg
3  This is a terrible movie, and I'm not even sur...       neg
4  First of all this movie is a piece of reality ...       pos

Checking for missing values...
text         0
sentiment    0
dtype: int64

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       101
           1       0.82      0.85      0.84        99

    accuracy                           0.83       200
   macro avg       0.84      0.84      0.83       200
weighted avg       0.84      0.83      0.83       200


Accuracy: 0.835
