# Import Libraries and Dataset

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 



In [None]:
# Read the tab-separated Amazon Alexa reviews export into a DataFrame.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of the file before running.
file_path = 'C:/Users/ANKITA/Downloads/amazon_alexa.tsv'
reviews_df = pd.read_csv(file_path, delimiter='\t')



In [None]:
# Display the loaded DataFrame (a notebook renders a cell's last expression).
reviews_df



In [None]:
# Column dtypes and non-null counts.
reviews_df.info()

In [None]:
# Summary statistics of the DataFrame's columns.
reviews_df.describe()

In [None]:
# The review text column on its own.
reviews_df['verified_reviews']

# Exploring Dataset

In [None]:
 # Heatmap of the null mask: any marked cell is a missing value.
 # NOTE(review): this cell starts with a stray leading space; IPython tolerates
 # it, but it is an IndentationError if the code is run as a plain script.
 sns.heatmap(reviews_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")

In [None]:
# One histogram per column that DataFrame.hist can plot, 30 bins each.
reviews_df.hist(bins = 30, figsize = (13,5), color = 'r')

In [None]:
reviews_df['verified_reviews'] = reviews_df['verified_reviews'].fillna('').astype(str)


In [None]:
reviews_df['length'] = reviews_df['verified_reviews'].apply(len)

In [None]:
reviews_df.head()

In [None]:
reviews_df['length'].plot(bins=100, kind='hist') 

In [None]:
reviews_df.length.describe()

In [None]:
# Let's see the longest message.
# The maximum length is read from the data instead of being hard-coded, so
# the lookup cannot raise IndexError when the dataset changes.
reviews_df[reviews_df['length'] == reviews_df['length'].max()]['verified_reviews'].iloc[0]

In [None]:
# Let's see the shortest message.
# Missing reviews were replaced by '' earlier in the notebook, so zero-length
# rows are skipped; the shortest real length is derived from the data rather
# than hard-coded.
shortest_length = reviews_df.loc[reviews_df['length'] > 0, 'length'].min()
reviews_df[reviews_df['length'] == shortest_length]['verified_reviews'].iloc[0]

In [None]:
# Let's see the message whose length is closest to the mean length.
# An exact match on a hard-coded value raises IndexError whenever no review
# has precisely that length, so pick the nearest one instead.
mean_length = reviews_df['length'].mean()
reviews_df.loc[(reviews_df['length'] - mean_length).abs().idxmin(), 'verified_reviews']

In [None]:
positive = reviews_df[reviews_df['feedback']==1]
negative = reviews_df[reviews_df['feedback']==0]

In [None]:
positive

In [None]:
negative

In [None]:
 # Number of reviews per feedback value.
 # NOTE(review): stray leading space on this cell, see the same issue on the
 # heatmap cell; harmless in IPython, an IndentationError in a plain script.
 sns.countplot(x='feedback', data=reviews_df)

In [None]:
# Number of reviews per rating value.
sns.countplot(x = 'rating', data = reviews_df)

In [None]:
# The same rating distribution drawn as a 5-bin histogram.
reviews_df['rating'].hist(bins = 5)

In [None]:
# Rating per product variation; the figure is wide so the variation labels
# on the x axis have room.
plt.figure(figsize = (40,15))
sns.barplot(x = 'variation', y='rating', data = reviews_df, palette = 'deep')

In [None]:
# Collect every review into a plain Python list and show how many there are.
sentences = reviews_df['verified_reviews'].tolist()
len(sentences)

In [None]:
# Dump the whole list of reviews.
print(sentences)

In [None]:
# Join all reviews into one space-separated string for the word cloud.
sentences_as_one_string =" ".join(sentences)

In [None]:
sentences_as_one_string

In [None]:
from wordcloud import WordCloud

# Word cloud built from every review.
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(sentences_as_one_string))

In [None]:
# Same steps restricted to the negative (feedback == 0) reviews.
negative_list = negative['verified_reviews'].tolist()

negative_list

In [None]:
negative_sentences_as_one_string = " ".join(negative_list)

In [None]:
# Word cloud built from the negative reviews only.
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(negative_sentences_as_one_string))

# PERFORM DATA CLEANING

In [None]:
# Let's drop the date, rating, length
reviews_df = reviews_df.drop(['date', 'rating', 'length'],axis=1)


In [None]:
reviews_df

In [None]:
variation_dummies = pd.get_dummies(reviews_df['variation'], drop_first = True)
# Avoid Dummy Variable trap which occurs when one variable can be predicted from the other.

In [None]:
variation_dummies

In [None]:
# first let's drop the column
reviews_df.drop(['variation'], axis=1, inplace=True)

In [None]:
# Now let's add the encoded column again
reviews_df = pd.concat([reviews_df, variation_dummies], axis=1)

In [None]:
reviews_df

# REMOVING PUNCTUATION FROM TEXT

In [None]:
import string
# The punctuation characters that the cleaning function below strips out.
string.punctuation

import nltk
from nltk.corpus import stopwords

# NOTE(review): nltk is imported a second time here; the repeat is harmless.
import nltk
# Download the stopword corpus that stopwords.words('english') reads from.
nltk.download('stopwords')


In [None]:
# Let's define a pipeline to clean up all the messages 
# The pipeline performs the following: (1) remove punctuation, (2) remove stopwords

def message_cleaning(message):
    """Strip punctuation from *message* and return its non-stopword tokens.

    Args:
        message: The review text as a ``str``.

    Returns:
        A list of the whitespace-separated words of ``message`` with every
        character from ``string.punctuation`` removed and every English
        stopword (compared case-insensitively) dropped. Original casing of
        the kept words is preserved.
    """
    # Delete all punctuation characters in a single pass, then tokenise.
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = message.translate(punctuation_table).split()
    if not tokens:
        # Nothing to filter, so there is no need to load the stopword list.
        return []

    # The original evaluated stopwords.words('english') once per word and
    # scanned the resulting list linearly; load it once per message into a
    # set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in english_stopwords]

In [None]:
# Let's test the newly added function by running it over every review.
# The result is a Series whose entries are lists of kept words.
reviews_df_clean = reviews_df['verified_reviews'].apply(message_cleaning)

In [None]:
print(reviews_df_clean[3]) # show the cleaned up version

In [None]:
print(reviews_df['verified_reviews'][3]) # show the original version

In [None]:
reviews_df_clean

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Use the cleaning pipeline defined earlier as the analyzer, so each review is
# tokenised by message_cleaning and the tokens are counted.
vectorizer = CountVectorizer(analyzer = message_cleaning)
reviews_countvectorizer = vectorizer.fit_transform(reviews_df['verified_reviews'])

In [None]:
# The vocabulary learned by the vectorizer (one entry per output column).
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

In [None]:
# Dense view of the document-term count matrix.
print(reviews_countvectorizer.toarray())  

In [None]:
# (number of reviews, vocabulary size)
reviews_countvectorizer.shape

In [None]:
reviews_df

In [None]:
# first let's drop the column
reviews_df.drop(['verified_reviews'], axis=1, inplace=True)
reviews = pd.DataFrame(reviews_countvectorizer.toarray())

In [None]:
# Now let's concatenate them together
reviews_df = pd.concat([reviews_df, reviews], axis=1)

In [None]:
reviews_df

In [None]:
# Let's drop the target label coloumns
X = reviews_df.drop(['feedback'],axis=1)

In [None]:
X

In [None]:
y = reviews_df['feedback']

# Train/Test Split

In [None]:
# train_test_split was used without ever being imported in this notebook.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is set, so the split (and every metric
# below) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Naive Bayes Training and Evaluation


In [None]:
# Trains a Naive Bayes model.
# MultinomialNB was used without ever being imported in this notebook.
from sklearn.naive_bayes import MultinomialNB

NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

# Predictions on the training split itself, used for the confusion matrix
# in the next cell.
y_predict_train = NB_classifier.predict(X_train)


In [None]:
# confusion_matrix is only imported in a later cell, so import it here where
# it is first used.
from sklearn.metrics import confusion_matrix

# Confusion matrix for the training predictions (rows: true labels,
# columns: predicted labels). fmt='d' prints the counts as plain integers
# instead of the default scientific-style formatting.
cm = confusion_matrix(y_train, y_predict_train)
sns.heatmap(cm, annot=True, fmt='d')


In [None]:
# Displays a confusion matrix and classification report for the TEST
# predictions of the Naive Bayes model.
# Both metric helpers are only imported in a later cell, so import them here.
from sklearn.metrics import classification_report, confusion_matrix

y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d')
print(classification_report(y_test, y_predict_test))


# Logistic Regression Training and Evaluation



In [None]:
# LogisticRegression was used without ever being imported in this notebook.
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (library defaults) on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Predict feedback labels for the held-out test rows.
y_pred = model.predict(X_test)


In [None]:
# Testing Set Performance
# Show the raw predicted labels.
y_pred

In [None]:
# accuracy_score was used without being imported; add it to the existing import.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Arguments follow the library's (y_true, y_pred) convention.
print('Accuracy {} %'.format( 100 * accuracy_score(y_test, y_pred)))


In [None]:
cm = confusion_matrix(y_pred, y_test)
sns.heatmap(cm, annot = True)

In [None]:
print(classification_report(y_test, y_pred))


## Final Conclusion

This project focused on analyzing Amazon Alexa customer reviews to predict whether customers are satisfied based on their written feedback.

### Key Objectives:
- Determine if a customer is **satisfied (1)** or **not satisfied (0)** using the review text.
- Apply Natural Language Processing (NLP) techniques and machine learning models to build a reliable sentiment classifier.

### What We Did:
- Cleaned and preprocessed the review data by removing punctuation and stopwords
- Explored the dataset through visualizations (word clouds, rating distribution, feedback count)
- Converted text data into numeric features using `CountVectorizer`
- Used one-hot encoding for product variation
- Built and evaluated two models:
  - **Naive Bayes**
  - **Logistic Regression**

### Results:
- **Logistic Regression** performed best with an accuracy of **~95%**
- Most customers in the dataset were **satisfied**, resulting in some class imbalance
- Models struggled slightly with predicting the minority class (unsatisfied reviews)

### Final Insight:
**Yes, the majority of customers were satisfied with Amazon Alexa**, as shown by both the data distribution and model predictions.

This project demonstrates how textual reviews can be effectively used to gain insights into customer sentiment using NLP and machine learning. 

