# Amazon Reviews Sentiment Analysis
A step-by-step NLP project on a small dummy review dataset: text cleaning and preprocessing, visualization, TF-IDF feature extraction, logistic-regression model training, and evaluation.

In [None]:
# Install libraries (run once if needed)
# !pip install pandas matplotlib scikit-learn nltk wordcloud

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk, re
nltk.download('stopwords')
from nltk.corpus import stopwords

## Create Dummy Dataset

In [None]:
# Each entry pairs a review with its sentiment label so the two stay aligned.
labelled_reviews = [
    ('Very smooth and fast response works perfectly', 'positive'),
    ('Battery life is amazing lasts many days', 'positive'),
    ('Okay product nothing special', 'neutral'),
    ('Stopped working after one week waste of money', 'negative'),
    ('Sound quality is clear and bass is strong', 'positive'),
    ('Comfortable but connection drops sometimes', 'neutral'),
    ('Terrible sound very disappointed', 'negative'),
    ('Charges quickly and build quality is good', 'positive'),
    ('Heating issue while charging', 'negative'),
    ('Works fine does the job', 'neutral'),
]

# Column-oriented view of the same records: one list of texts, one of labels.
data = {
    'review_text': [review for review, _ in labelled_reviews],
    'sentiment': [label for _, label in labelled_reviews],
}

df = pd.DataFrame(data)
df.head()

## Cleaning & Preprocessing

In [None]:
# Build the stop-word lookup once; a set keeps the per-token membership test cheap.
stop_words = set(stopwords.words('english'))

# Normalize the text: lowercase, then drop every character that is not a-z or whitespace.
lowered = df['review_text'].str.lower()
letters_only = lowered.str.replace(r'[^a-z\s]', '', regex=True)

# Rebuild each review from its whitespace-separated tokens, skipping stop words.
df['review_text'] = letters_only.apply(
    lambda review: ' '.join(token for token in review.split() if token not in stop_words)
)
df.head()

## Sentiment Distribution

In [None]:
# Count the reviews per sentiment label and show the counts as a bar chart.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts.plot(kind='bar')
plt.show()

## Word Cloud

In [None]:
# Concatenate all cleaned reviews into one space-separated string for the cloud.
text = ' '.join(df['review_text'].tolist())

# Configure the cloud first, then build it from the combined text.
wc = WordCloud(background_color='white', width=800, height=400)
wc.generate(text)

# Render the cloud as an image with the axes hidden.
plt.imshow(wc)
plt.axis('off')
plt.show()

## TF-IDF Feature Engineering

In [None]:
# Target: the sentiment label of each review.
y = df['sentiment']

# Features: TF-IDF weights learned from, and computed for, every cleaned review in df.
vectorizer = TfidfVectorizer()
cleaned_reviews = df['review_text']
X = vectorizer.fit_transform(cleaned_reviews)

## Train/Test Split & Model

In [None]:
# Split the cleaned review text *before* vectorizing, so that the TF-IDF
# vocabulary and IDF weights are learned from the training reviews only.
# Splitting features produced by a vectorizer fitted on the whole corpus lets
# test-set statistics leak into training. test_size and random_state are
# unchanged, so the same rows end up in the train and test sets.
text_train, text_test, y_train, y_test = train_test_split(
    df['review_text'], y, test_size=0.2, random_state=42
)

# Re-fit the vectorizer on the training text so it matches the model's feature space.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
# Test reviews are only transformed: they reuse the training vocabulary and IDF weights.
X_test = vectorizer.transform(text_test)

# Logistic regression classifier; the solver may run for up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Evaluation

In [None]:
# Predict a sentiment label for every held-out review.
y_pred = model.predict(X_test)

# With a tiny test split some classes have no true or predicted samples, which
# makes their precision/recall undefined; zero_division=0 reports those scores
# as 0 explicitly instead of emitting a warning per class.
print(classification_report(y_test, y_pred, zero_division=0))

# Fix the row/column order so the matrix always covers every known class and
# can be read without guessing which row is which label.
labels = sorted(set(model.classes_) | set(y_test))
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(
    pd.DataFrame(
        cm,
        index=pd.Index(labels, name='actual'),
        columns=pd.Index(labels, name='predicted'),
    )
)