In [79]:
import re
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [80]:
# Fetch the NLTK resources used in this notebook: the stop-word lists
# (read through stopwords.words) and WordNet (needed by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [81]:
# English stop words held in a set, so the `word not in stop_words` check
# in preprocess() is a hash lookup instead of a list scan.
stop_words=set(stopwords.words('english'))
stop_words  # bare expression: displays the set for inspection

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [82]:
from nltk.stem import WordNetLemmatizer

# Lemmatize a few sample words, treating each one as a verb (pos='v').
lemmatizer = WordNetLemmatizer()
t = ['loves', 'loving', 'loved', 'lovely', 'beautiful']
lemmatized_list = [lemmatizer.lemmatize(token, pos='v') for token in t]
lemmatized_list

['love', 'love', 'love', 'lovely', 'beautiful']

In [83]:
# Run the sample words through the Porter stemmer; `stemmer` is reused
# later by preprocess().
stemmer = PorterStemmer()
t = ['loves', 'loving', 'loved', 'lovely', 'beautiful']
stemmed_list = list(map(stemmer.stem, t))
stemmed_list

['love', 'love', 'love', 'love', 'beauti']

In [84]:
# Toy corpus of six short reviews, with one integer label per sentence.
# NOTE(review): labels look like 1 = positive, 0 = negative sentiment — confirm.
text=["I love this product",
    "This is the worst experience",
    "Amazing quality and great support",
    "I hate this so much",
    "Not bad, could be better",
    "Absolutely fantastic"]
labels=[1,0,1,0,0,1]

In [85]:
def preprocess(text):
  """Normalize text into a space-joined string of stemmed, non-stop-word tokens.

  Steps: lower-case the input, replace every character other than a-z and
  whitespace with a space, split on whitespace, drop tokens present in the
  module-level ``stop_words`` set, and stem the rest with the module-level
  ``stemmer``.

  Args:
    text: A single string, or a list/tuple of strings (handled element-wise,
      recursively).

  Returns:
    The cleaned string for a string input, or a list of cleaned strings for
    a list/tuple input.
  """
  if isinstance(text, (list, tuple)):
    return [preprocess(t) for t in text]
  text=text.lower()
  # Substitute a space rather than deleting the character: deleting turned
  # "don't" into "dont" (which is not a stop word) and fused words separated
  # only by punctuation ("well-made" -> "wellmade"). With a space, a
  # contraction splits into fragments such as 'don', 'll', 'd', 'm', which
  # the stop-word list already contains, so they are filtered out below.
  text=re.sub(r'[^a-z\s]',' ', text)
  words=[stemmer.stem(word) for word in text.split() if word not in stop_words]
  return " ".join(words)
preprocess(text)

['love product',
 'worst experi',
 'amaz qualiti great support',
 'hate much',
 'bad could better',
 'absolut fantast']

In [86]:
# Store the preprocessed corpus under its own name so the raw `text` list
# stays available unchanged.
text_cleaned=preprocess(text)
text_cleaned

['love product',
 'worst experi',
 'amaz qualiti great support',
 'hate much',
 'bad could better',
 'absolut fantast']

In [87]:
# Second sample corpus; the last entry is the alphabet written as
# single-letter tokens.
# NOTE(review): `sentences` is not referenced by any later cell visible
# here — confirm whether the CountVectorizer demo was meant to fit on it.
sentences = [
    "I love NLP",
    "I love Python",
    "Python loves data",
    "Today is a very beautiful day",
    "I love beautiful peoples ",
    "we all are a part of family",
    " a b c d e f g h i j k l m n  o p q r s t u v w x y z"
]

In [88]:
# Bag-of-words demo: learn a vocabulary from the raw `text` corpus (not
# `text_cleaned`) and display the token -> column-index mapping.
vectorizer=CountVectorizer()
X = vectorizer.fit_transform(text)
vectorizer.vocabulary_


{'love': 12,
 'this': 20,
 'product': 15,
 'is': 11,
 'the': 19,
 'worst': 21,
 'experience': 7,
 'amazing': 1,
 'quality': 16,
 'and': 2,
 'great': 9,
 'support': 18,
 'hate': 10,
 'so': 17,
 'much': 13,
 'not': 14,
 'bad': 3,
 'could': 6,
 'be': 4,
 'better': 5,
 'absolutely': 0,
 'fantastic': 8}

In [89]:
from datasets import load_dataset
# Load the "imdb" dataset; the repr displayed by the last line lists the
# available splits, their features and row counts.
dataset = load_dataset("imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [90]:
# Pull the review texts and their labels out of the train and test splits.
X_train=dataset["train"]['text']
y_train = dataset["train"]['label']

X_test = dataset["test"]["text"]
y_test = dataset['test']['label']

# Sanity check: each split should have as many labels as texts.
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

25000 25000
25000 25000


In [91]:
import re
def clean_review(text):
  """Strip markup tags from a review and normalize its whitespace.

  Every ``<...>`` tag is replaced by a space, runs of whitespace are
  collapsed to a single space, and leading/trailing whitespace is removed.
  """
  without_tags = re.sub(r'<.*?>', ' ', text)
  single_spaced = re.sub(r'\s+', ' ', without_tags)
  return single_spaced.strip()

In [92]:
# Apply clean_review to every review in both splits.
X_train_clean = list(map(clean_review, X_train))
X_test_clean = list(map(clean_review, X_test))

In [93]:
# TF-IDF features for the IMDB reviews.
# NOTE: this rebinds `vectorizer`, which earlier held the CountVectorizer demo.
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1,2),  # unigrams and bigrams
    min_df=5,  # ignore terms that occur in fewer than 5 documents
    max_df=0.9  # ignore terms that occur in more than 90% of documents
)

# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse the fitted vectorizer to transform the test reviews.
X_train_vec=vectorizer.fit_transform(X_train_clean)
X_test_vec = vectorizer.transform(X_test_clean)

In [94]:
# Fit a logistic-regression classifier on the TF-IDF training features,
# with the solver's iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train);  # trailing ';' suppresses the cell's output

In [95]:
# Score the fitted model on the held-out test split.
predictions = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)

Accuracy: 0.89372


In [96]:
label_map = {0: "Negative", 1: "Positive"}

def predict_label(sentence):
    """Classify one raw sentence and return its sentiment name.

    The sentence is cleaned with clean_review, vectorized with the fitted
    module-level `vectorizer`, scored by the module-level `model`, and the
    predicted class is translated through `label_map`.
    """
    features = vectorizer.transform([clean_review(sentence)])
    predicted_class = model.predict(features)[0]
    return label_map[predicted_class]


In [97]:
# Spot-check the classifier on three hand-written sentences.
print(predict_label("The movie was boring but the ending was great"))
print(predict_label("It was a really an lucky and a very great moment in my life"))
print(predict_label("This was a complete waste of time"))


Negative
Positive
Negative


In [98]:
# Ten hand-written reviews used for a qualitative check of the classifier.
reviews = [
    "I absolutely loved this movie. The performances were brilliant and the story was engaging from start to finish.",
    "This film was a complete waste of time. The plot was boring and the acting was terrible.",
    "The movie was simple and enjoyable. Nothing extraordinary, but it was pleasant to watch.",
    "The idea was interesting, but the execution was poor and the pacing was very slow.",
    "The movie started well, but the second half was disappointing and ruined the experience.",
    "What an amazing film! I was completely absorbed and emotionally invested throughout.",
    "I hated every minute of this movie. It was confusing, dull, and badly written.",
    "The movie was not good and definitely not worth the hype.",
    "Despite a slow start, the film turns into a powerful and memorable experience.",
    "It was okay. Some parts were forgettable, others were interesting."
]

# Print the predicted label next to each review text.
for r in reviews:
    print(predict_label(r), "→", r)


Positive → I absolutely loved this movie. The performances were brilliant and the story was engaging from start to finish.
Negative → This film was a complete waste of time. The plot was boring and the acting was terrible.
Positive → The movie was simple and enjoyable. Nothing extraordinary, but it was pleasant to watch.
Negative → The idea was interesting, but the execution was poor and the pacing was very slow.
Negative → The movie started well, but the second half was disappointing and ruined the experience.
Positive → What an amazing film! I was completely absorbed and emotionally invested throughout.
Negative → I hated every minute of this movie. It was confusing, dull, and badly written.
Negative → The movie was not good and definitely not worth the hype.
Positive → Despite a slow start, the film turns into a powerful and memorable experience.
Negative → It was okay. Some parts were forgettable, others were interesting.
