In [3]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
df = pd.read_csv("Reviews.csv")
print(df.shape)
df.head()


(8187, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
def get_sentiment(rating):
    if rating > 3:
        return "positive"
    elif rating < 3:
        return "negative"
    return "neutral"

df["sentiment"] = df["Score"].map(get_sentiment)
df = df[df["sentiment"] != "neutral"]

In [9]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

def clean_review(review):
    review = review.lower()
    review = re.sub('[^a-z ]', '', review)
    tokens = review.split()
    final_words = []

    for word in tokens:
        if word not in stop_words:
            final_words.append(lemmatizer.lemmatize(word))

    return " ".join(final_words)

df["processed_review"] = df["Text"].apply(clean_review)

In [12]:
sample_df = df.sample(n=3000, random_state=1)

X = sample_df["processed_review"]
y = sample_df["sentiment"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7
)


In [14]:
tfidf_vector = TfidfVectorizer(
    max_features=6000,
    ngram_range=(1, 2)
)

X_train_vec = tfidf_vector.fit_transform(X_train)
X_test_vec = tfidf_vector.transform(X_test)


In [15]:
sentiment_model = LogisticRegression(solver="liblinear")
sentiment_model.fit(X_train_vec, y_train)


In [16]:
predictions = sentiment_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, predictions))
print("\nReport:\n", classification_report(y_test, predictions))


Accuracy: 0.856

Report:
               precision    recall  f1-score   support

    negative       1.00      0.03      0.05       111
    positive       0.86      1.00      0.92       639

    accuracy                           0.86       750
   macro avg       0.93      0.51      0.49       750
weighted avg       0.88      0.86      0.79       750



In [18]:
def analyze_sentiment(text):
    text = clean_review(text)
    text_vec = tfidf_vector.transform([text])
    return sentiment_model.predict(text_vec)[0]

print(analyze_sentiment("Amazing product quality and fast delivery"))
print(analyze_sentiment("Waste of money and very bad experience"))


positive
negative
