In [16]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#libraries for text processing
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')



In [2]:
from google.colab import drive
drive.mount('/content.drive')

Mounted at /content.drive


In [3]:
file_path=('/content/drive/MyDrive/Projects/IMDB Dataset.csv')
df=pd.read_csv(file_path)

In [4]:
#dataset preview
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
#total count
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [7]:
#converted sentiment values into binary
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

**Text Processing**

In [9]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Define a preprocessing function

def preprocess_text(text):
    text = text.lower()  # lowercase
    words = word_tokenize(text)  # # Tokenize into words
      # Remove punctuation and stopwords, and lemmatize

    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word not in string.punctuation]
    return ' '.join(words)

# **PreProcessing**

In [10]:
# Apply preprocessing
df['clean_review'] = df['review'].apply(preprocess_text)
df.head()

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching 1 oz episode '...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically 's family little boy jake think 's z...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei 's `` love time money '' visuall...



# Feature engeneering using TF-IDF

In [11]:
# Convert text to TF-IDF features (limit to 5000 most important words)
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_review'])  # Feature matrix
y = df['sentiment']  # Target labels

# **Spliting data into training and testing**

In [12]:
# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Train a Logistic Regression Model**

In [13]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [14]:
# Make predictions on the test set
y_pred = model.predict(X_test)

Model Evaluation

In [17]:
# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8879

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [18]:
# Define function to predict sentiment of any review
def predict_sentiment(text):
    processed = preprocess_text(text)
    vector = tfidf.transform([processed])
    prediction = model.predict(vector)[0]
    return "Positive 😊" if prediction == 1 else "Negative 😞"

# Try a sample review
predict_sentiment("The movie was absolutely amazing and inspiring!")

'Positive 😊'