In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
# Load the raw IMDB reviews; the cells below use the 'review' and 'sentiment' columns.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
df.head(n=5)

Text Cleaning:

Lowercasing: Convert all text to lowercase to ensure consistency.

In [None]:
# Normalise case so that "Good" and "good" are treated as the same word.
df = df.assign(review=df['review'].str.lower())

In [None]:
# Preview the first five rows after lowercasing.
df.head(n=5)

  Remove HTML Tags: Some reviews may contain HTML tags. Remove them using regular expressions.

In [None]:
# Replace every run of HTML tags (e.g. "<br /><br />", a lone "<br />", "<i>...</i>") with a
# single space. Matching only the literal "<br /><br />" leaves other tags behind, and the
# later non-letter stripping then turns them into junk tokens such as "br".
# A tag must start with a letter (optionally preceded by "/") so text like "<3" is left alone.
df['review'] = df['review'].str.replace(r'(?:</?[a-zA-Z][^<>]*>)+', ' ', regex=True)

In [None]:
# Seeded so the same ten rows are shown on every run of the notebook.
df.sample(10, random_state=42)

Remove Special Characters and Numbers: Remove non-alphabetical characters and numbers, as they may not be relevant for sentiment analysis.

In [None]:
# Drop everything that is not an ASCII letter or whitespace.
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern as a literal
# string by default, which would make this line a silent no-op.
# The raw string keeps "\s" from being parsed as an (invalid) Python escape sequence.
df['review'] = df['review'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

In [None]:
# Seeded so the same ten rows are shown on every run of the notebook.
df.sample(10, random_state=42)

Tokenization:

Tokenization is the process of splitting text into individual words or tokens.

In [None]:
import nltk

# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases read the 'punkt'
# package while newer ones look for 'punkt_tab', so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Turn each review string into a list of word tokens.
df['review'] = df['review'].map(word_tokenize)

In [None]:
# Preview the first five rows; 'review' now holds token lists.
df.head(n=5)

Stopword Removal:
Stopwords are common words (e.g., "and," "the," "is") that often don't carry much meaning in sentiment analysis. You can remove them to reduce noise in your data.

In [None]:
# Fetch NLTK's stopword corpus and keep the English list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens of one review that are not English stopwords, order preserved."""
    return [token for token in tokens if token not in stop_words]


df['review'] = df['review'].apply(_without_stopwords)

In [None]:
# Preview the first five rows after stopword removal.
df.head(n=5)

Lemmatization or Stemming (Optional):
Lemmatization and stemming reduce words to their base or root form. This can help in reducing the dimensionality of your data and improving model performance.

Stemming:

Stemming involves chopping off the ends of words to remove prefixes or suffixes.
The goal is to reduce words to their "stem" or "root" form.
For example, the word "jumping" would be stemmed to "jump," and "running" would become "run."
Stemming is a simple and fast method, but it may not always produce real words, and the resulting stems may not be valid in all contexts.

Lemmatization:

Lemmatization is a more sophisticated approach that reduces words to their "lemma" or "base form."
It takes into account the word's grammatical meaning and tries to produce a valid word.
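For example, when the lemmatizer is told each word's part of speech, the adjective "better" would be lemmatized to "good," and the verb "went" would become "go." (NLTK's WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is passed, in which case these two words are returned unchanged.)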
Lemmatization is a bit slower than stemming because it considers the word's context and meaning, but it often produces more accurate results.

In [None]:
import nltk

# WordNet data backs the WordNetLemmatizer used in the next cell.
nltk.download('wordnet')
# Some NLTK releases also load the Open Multilingual Wordnet when WordNet is first used
# and raise LookupError if it is missing, so fetch it as well.
nltk.download('omw-1.4')

In [None]:
# Lemmatization (using WordNet Lemmatizer from NLTK)

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each review is a list of tokens here; lemmatize them one at a time.
# No part-of-speech tag is passed, so the lemmatizer's default is used for every token.
df['review'] = df['review'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [None]:
# Stemming (using Porter Stemmer from NLTK)

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in one review, order preserved."""
    return [stemmer.stem(token) for token in tokens]


df['review'] = df['review'].apply(_stem_tokens)

In [None]:
# Preview the first five rows after lemmatization and stemming.
df.head(n=5)

In [None]:
# Join tokens back into sentences: each token list becomes one space-separated string.

df['review'] = df['review'].map(' '.join)

In [None]:
# Preview the first five rows; 'review' is a plain string again.
df.head(n=5)

In [None]:
# Save the cleaned reviews (without the index column) so later cells can reload them.
df.to_csv(path_or_buf='preprocessed_dataset.csv', index=False)

Splitting the Data:

Divide your dataset into two parts: one for training your sentiment analysis model and the other for testing its performance. A common split is 80% of the data for training and 20% for testing.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)


Feature Extraction
TF-IDF (Term Frequency-Inverse Document Frequency): This technique measures the importance of each word in a document relative to a collection of documents.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed dataset
df = pd.read_csv('preprocessed_dataset.csv')

# A review that was reduced to an empty string during cleaning is written as an empty CSV
# field and read back as NaN, which TfidfVectorizer rejects. Restore such rows to ''.
df['review'] = df['review'].fillna('')

# Option 1: TF-IDF Feature Extraction
# Keep only the 5000 most frequent terms; adjust max_features as needed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['review'])

# X_tfidf now contains one TF-IDF feature vector per review


In [None]:
import joblib

# Persist the fitted vectorizer and the feature matrix so later cells can reload them.
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')
joblib.dump(value=X_tfidf, filename='X_tfidf.pkl')

**MODEL SELECTION**

**Naive Bayes**

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# 80/20 hold-out split of the TF-IDF matrix and sentiment labels, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier; fit() returns the estimator itself.
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict the held-out reviews and score them.
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
report_nb = classification_report(y_test, y_pred_nb)

print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}")
print(report_nb)

**LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the TF-IDF matrix and sentiment labels, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression with scikit-learn's default settings; fit() returns the estimator.
lr_model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out reviews and score them.
y_pred_lr = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)

print(f"Logistic Regression Accuracy: {accuracy_lr:.2f}")
print(report_lr)


In [None]:
import joblib

# Persist the fitted logistic-regression classifier so it can be reloaded without retraining.
joblib.dump(value=lr_model, filename='model.pkl')

Testing the saved model on user-supplied text

In [None]:
import re

import joblib
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the preprocessed dataset and model
df = pd.read_csv('preprocessed_dataset.csv')
# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only load artifacts that were written by this notebook.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')  # fitted TF-IDF vectorizer
X_tfidf = joblib.load('X_tfidf.pkl')  # TF-IDF features of the training corpus
y = df['sentiment']

# Load the trained Logistic Regression model
model = joblib.load('model.pkl')

# Same text-normalisation resources that were used to build the training corpus.
# The NLTK data they need was downloaded by the earlier preprocessing cells.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def preprocess_text(text):
    """Clean one raw string the same way the training reviews were cleaned.

    The vectorizer's vocabulary was built from lowercased, tag-free, letters-only,
    stopword-free, lemmatized and stemmed text. Raw input has to go through the same
    steps, otherwise most of its words never match a vocabulary entry.

    Parameters
    ----------
    text : str
        Raw user-supplied text.

    Returns
    -------
    str
        Space-joined processed tokens ('' when nothing survives cleaning).
    """
    text = text.lower()
    text = re.sub(r'(?:</?[a-zA-Z][^<>]*>)+', ' ', text)  # strip HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep letters and whitespace only
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
    return ' '.join(tokens)


# Input text to be analyzed
input_text = input("Enter your text: ")

# Transform the cleaned input text into TF-IDF features
input_features = tfidf_vectorizer.transform([preprocess_text(input_text)])

# Make a prediction using the trained model
prediction = model.predict(input_features)

print(f"Sentiment Analysis Result: {prediction[0]}")
