In [69]:
!pip install pandas numpy nltk scikit-learn
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re  # Import the 're' module for regular expressions

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [70]:
df = pd.read_csv("/content/IMDB Dataset.csv", on_bad_lines='skip', quoting=3, escapechar='\\')

Labeling

In [None]:
#  labeling
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

Preprocessing

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Check if the text is a string before processing
    if isinstance(text, str):
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        tokens = word_tokenize(text.lower())  # Correct indentation
        tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    # If not a string (eg NaN) return an empty string
    else:
        return ''

In [None]:
df['cleaned_review'] = df['review'].apply(preprocess_text)

df = df.dropna(subset=['sentiment'])  # drops rows with NaN in sentiment column


Feature extraction by TF-IDF

In [None]:
#tfidf for normalization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment']


Test/Train

In [None]:
#training and testing
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Logistic Regression model

In [76]:
model = LogisticRegression()
model.fit(x_train, y_train)

Function to predict sentiment of input text


In [None]:
def predict_sentiment(text):
    cleaned_text = preprocess_text(text)
    text_vector = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vector)[0]
    return "positive" if prediction == 1 else "negative"

input_text = input("Enter text to analyze: ")

sentiment = predict_sentiment(input_text)
print(f"The sentiment of the input text is: {sentiment}")

#  predictions on test
y_pred = model.predict(x_test)

# evaluating model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Enter text to analyze: An awful film! It must have been up against some real stinkers to be nominated for the Golden Globe. They've taken the story of the first famous female Renaissance painter and mangled it beyond recognition. My complaint is not that they've taken liberties with the facts; if the story were good, that would perfectly fine. But it's simply bizarre -- by all accounts the true story of this artist would have made for a far better film, so why did they come up with this dishwater-dull script? I suppose there weren't enough naked people in the factual version. It's hurriedly capped off in the end with a summary of the artist's life -- we could have saved ourselves a couple of hours if they'd favored the rest of the film with same brevity.
The sentiment of the input text is: negative
Accuracy: 0.5278
Precision: 0.5085
Recall: 0.8571
F1-score: 0.6383
