In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import important libraries 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Install a blanket 'ignore' filter so no warning is shown in later cell output.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning messages;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Let's load the data and have a look at it

In [None]:
# Path of the reviews CSV inside the Kaggle input mount.
reviews_csv_path = '/kaggle/input/amazon-shopping-reviews-daily-updated/amazon_reviews.csv'
# Read the file into the working DataFrame that every later cell operates on.
df = pd.read_csv(reviews_csv_path)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
df

In [None]:
# Report the (rows, columns) dimensions of the loaded frame
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)


In [None]:
# Count the null entries in each column
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

In [None]:
# Show the dtype pandas assigned to each column
column_dtypes = df.dtypes
print("Data types:\n", column_dtypes)

# Pre-processing and Data Cleaning

In [None]:
# Map the original camelCase headers to snake_case names; 'at' becomes 'time'.
snake_case_names = {
    'reviewId': 'review_id',
    'userName': 'username',
    'thumbsUpCount': 'thumbs_up_count',
    'reviewCreatedVersion': 'review_created_version',
    'at': 'time',
    'appVersion': 'app_version',
}
df.rename(columns=snake_case_names, inplace=True)

# Show the resulting column index
print(df.columns)

In [None]:
# Check for and remove duplicate rows in the dataset

# Print the count explicitly: only a cell's last expression is auto-displayed,
# so a bare call in the middle of the cell would show nothing.
duplicates_before = df.duplicated().sum()
print('Number of duplicates before dropping them:', duplicates_before)

# Keep the first occurrence of each fully identical row.
df = df.drop_duplicates()

print('Number of duplicates after droping them:', df.duplicated().sum())

In [None]:
# Handling missing data
# Fill missing username with 'Unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['username']: that chained in-place form acts on an intermediate object, is
# deprecated since pandas 2.2, and leaves df unchanged under Copy-on-Write.
df['username'] = df['username'].fillna('Unknown')

In [None]:
# review_created_version is a version identifier (by its name; presumably strings
# such as '1.2.3' - confirm against the data), not a timestamp. Passing it through
# pd.to_datetime(errors='coerce') would turn every unparseable value into NaT, i.e.
# into a missing value. Store it as pandas' nullable string dtype instead, which
# keeps genuinely missing entries as <NA>.
df['review_created_version'] = df['review_created_version'].astype('string')
# Convert app_version to text the same way. A plain astype(str) would turn missing
# values into the literal 'nan', which isnull()/fillna() do not treat as missing.
df['app_version'] = df['app_version'].astype('string')

In [None]:
# Preview the first five rows after the cleaning steps so far
df.head(n=5)

In [None]:
# Re-count nulls per column to see what is still missing at this point
remaining_nulls = df.isna().sum()
print("Missing values after cleaning:\n", remaining_nulls)

In [None]:
# Fill missing content with 'No content'.
# Results are assigned back: df[col].fillna(..., inplace=True) modifies an
# intermediate object and no longer updates df under pandas Copy-on-Write.
df['content'] = df['content'].fillna('No content')
# Fill missing values in 'app_version' with 'Unknown'. Also catch the literal text
# 'nan', which is what a str cast produces for a missing value.
df['app_version'] = df['app_version'].fillna('Unknown').replace('nan', 'Unknown')

# Drop rows with missing review_created_version as they might be important
df.dropna(subset=['review_created_version'], inplace=True)

In [None]:
# Final per-column null count once the fill and drop steps have run
nulls_after_cleaning = df.isna().sum()
print("Missing values after cleaning:\n", nulls_after_cleaning)

In [None]:
# Summarize the cleaned frame: dimensions first, then per-column dtypes
cleaned_shape = df.shape
cleaned_dtypes = df.dtypes

print("Shape of the dataset after cleaning:", cleaned_shape)

print("Data types after cleaning:\n", cleaned_dtypes)

# Analyzing various factors 
**Starting with sentiment analysis, where each review is categorized by its score:**

- Positive: score 4-5
- Neutral: score 3
- Negative: score 1-2

In [None]:
# Map a numeric review score onto a sentiment label
def categorize_sentiment(score):
    """Return the sentiment label for a review score.

    Scores of 4 or more are 'Positive' and a score equal to 3 is 'Neutral'.
    Every other value is 'Negative'; this includes NaN, because both
    comparisons below evaluate to False for it.
    """
    if score >= 4:
        return 'Positive'
    if score == 3:
        return 'Neutral'
    return 'Negative'


In [None]:
# Label every review by passing its score through categorize_sentiment
df['sentiment'] = df['score'].map(categorize_sentiment)
# Tally how many reviews landed in each sentiment label
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

#  Sentiment Distribution by Score

Visualizing the distribution of sentiments based on the score involves creating a visual representation that shows how reviews are categorized into different sentiment groups (e.g., Positive, Neutral, Negative) based on their numerical scores.

In [None]:
# Plot sentiment distribution
sentiment_counts = df['sentiment'].value_counts()

# value_counts() orders labels by frequency, so a fixed positional color list can
# pair the wrong color with a label (e.g. green on 'Negative' when it is the most
# common). Look the color up by label instead; unexpected labels fall back to gray.
sentiment_palette = {'Positive': 'green', 'Neutral': 'orange', 'Negative': 'red'}
bar_colors = [sentiment_palette.get(label, 'gray') for label in sentiment_counts.index]

plt.figure(figsize=(8, 5))
sentiment_counts.plot(kind='bar', color=bar_colors)
plt.title('Sentiment Distribution by Score')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

#  Review Content Analysis
- Most Common Keywords in Reviews

Objective: Identify common keywords in reviews to gain insight into frequently mentioned issues or praises.

Solution: Use text processing to extract and visualize common keywords.

In [None]:
from wordcloud import WordCloud

In [None]:
# Join every non-null review body into a single space-separated string
review_texts = df['content'].dropna()
text = ' '.join(review_texts)

# Build an 800x400 word cloud on a white background from the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

In [None]:
# Render the generated cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Common Keywords in Reviews')
plt.show()

#  Review Length Analysis
- Review Length vs. Rating

Objective: Investigate the relationship between review length and rating.

Solution: Calculate the length of each review (in words) and plot it against its rating.

In [None]:
# Word count per review: whitespace-separated tokens, 0 when the content is missing
def _word_count(review_text):
    """Return the number of whitespace-separated tokens, or 0 for a null value."""
    if pd.notnull(review_text):
        return len(review_text.split())
    return 0


df['review_length'] = df['content'].apply(_word_count)


In [None]:
# Scatter of review word count (x) against star rating (y)
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df['review_length'], df['score'], alpha=0.5)
ax.set_title('Review Length vs. Rating')
ax.set_xlabel('Review Length (Number of Words)')
ax.set_ylabel('Rating')
ax.grid(True)
plt.show()

#  Thumbs Count Analysis
Investigate if the number of thumbs up influences review scores.

Plot review scores against the number of thumbs up to analyze any correlation. This can be visualized using a scatter plot.

In [None]:
# Scatter of thumbs-up count (x) against review score (y), drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='thumbs_up_count', y='score', alpha=0.5, ax=ax)
ax.set_title('Review Score vs. Thumbs Up Count')
ax.set_xlabel('Thumbs Up Count')
ax.set_ylabel('Review Score')
ax.grid(True)
plt.show()


# Correlation Analysis
Correlation Between Review Score and Thumbs Up Count

Determine if there is a correlation between review scores and the number of thumbs up.
Approach: Calculate and visualize the correlation.

In [None]:
# Pairwise correlation between review score and thumbs-up count
score_and_thumbs = df[['score', 'thumbs_up_count']]
correlation = score_and_thumbs.corr()

# Annotated heatmap with the color scale pinned to the full [-1, 1] range
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


# Conclusion
In this project, we analyzed Amazon shopping reviews to uncover insights into customer sentiments, review patterns, and engagement metrics. Here's a summary of what was accomplished:

**Data Cleaning and Preparation:**

Handled missing values and data inconsistencies to ensure robust analysis.
Renamed columns for clarity and standardized data formats.
**Exploratory Data Analysis (EDA):**

**Score Distribution:** Visualized the distribution of review scores to understand the prevalence of different ratings.
**Sentiment Analysis:** Analyzed sentiment trends and distribution, revealing patterns in customer satisfaction.
**Correlation and Trends:**

**Review Length vs. Rating:** Investigated the relationship between the length of reviews and their ratings.
**Thumbs Up Count:** Analyzed the correlation between thumbs-up counts and review scores.
**Sentiment vs. Thumbs Up:** Explored how sentiment scores relate to thumbs-up counts, providing insights into customer feedback.
**Keyword Analysis:**

Identified and visualized the most common keywords in positive and negative reviews, offering insights into frequent themes and issues.
**Key Findings:**
**Review Score Distribution:** The distribution of review scores indicates that negative reviews are more prevalent than positive reviews. This suggests that customers have more dissatisfaction than satisfaction, which could be an area of concern for the company.
**Sentiment Trends:** Due to the daily update frequency of the data, a comprehensive trend analysis over time was not conducted. Instead, the focus was on the overall sentiment distribution and patterns.
**Review Content Analysis (Most Common Keywords in Reviews):** The analysis reveals that users frequently discuss their overall experience with Amazon, app performance, the ordering process, account management, customer service, and the improvements they would like to see.
**Review Length Impact:** Longer reviews tend to have lower ratings, which might reflect a more balanced view where detailed feedback includes both positive and negative aspects. Customers who write longer reviews often highlight both strengths and weaknesses, leading to an overall lower rating.
Thumbs-up counts are positively correlated with higher ratings, suggesting that reviews which receive more thumbs-up are generally rated higher. This indicates that reviews which resonate well with other users are often perceived as more positive and valuable.
**Future Work:**
Incorporate Product Information: Including product-specific data could provide deeper insights into review patterns and sentiments.
Extended Sentiment Analysis: Using advanced sentiment analysis techniques or sentiment lexicons might yield more nuanced results.
Trend Analysis: Investigating seasonal or event-driven trends could reveal additional patterns in customer feedback.
**Acknowledgments:**
**Data Source:**  Kaggle
**Libraries and Tools:** The analysis was conducted using Python, with key libraries including Pandas for data manipulation and cleaning, NumPy for numerical operations, Matplotlib and Seaborn for data visualization, and WordCloud for keyword visualization. Jupyter Notebook was used as the development environment, and the project was executed on Kaggle Kernels for ease of sharing and collaboration.

Thank you for reviewing this analysis. I hope you found it insightful!








