In [36]:
import pandas as pd
import nltk 
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Excel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
from datasets import load_dataset

In [38]:
dataset = load_dataset("imdb")
df  = pd.DataFrame(dataset["train"])
df=df[["text","label"]]
df.columns=["reviews","sentiment"]
df.head()

Unnamed: 0,reviews,sentiment
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [39]:
df.shape

(25000, 2)

In [40]:
df.isnull().sum()

reviews      0
sentiment    0
dtype: int64

In [41]:
df["sentiment"].value_counts()

0    12500
1    12500
Name: sentiment, dtype: int64

In [42]:
def cleaned_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '',text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["cleaned_reviews"] = df["reviews"].apply(cleaned_text)
df["sentiment"] = df["sentiment"].apply(lambda x: "positive" if x == 1 else "negative")

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
tfidf = TfidfVectorizer(max_features=5000)

In [45]:
X = tfidf.fit_transform(df["cleaned_reviews"])
y = df["sentiment"]

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.2)

In [48]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [49]:
y_pred = lr.predict(X_test)

In [50]:
confusion_matrix(y_pred,y_test)

array([[2200,  264],
       [ 315, 2221]], dtype=int64)

In [51]:
print("accuracy:",accuracy_score(y_pred,y_test)*100)

accuracy: 88.42


In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      2515
    positive       0.88      0.89      0.88      2485

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [54]:
def predict_sentiment(text):
    cleaned = cleaned_text(text)
    vec = tfidf.transform([cleaned])
    prediction = lr.predict(vec)[0]
    return prediction

print(predict_sentiment("I absolutely loved this movie! Great acting and story."))
print(predict_sentiment("It was boring and too slow. Would not recommend."))


positive
negative


In [55]:
import joblib

In [56]:
joblib.dump(lr, "sentiment_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']