In [10]:
# eda

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
# Load the dataset
df = pd.read_excel("C:/Users/adedi/OneDrive/Projects/G-customer-review-sentiment/cleaned_reviews.xlsx") 
df.head()

Unnamed: 0,brand,model,price,Ratings,processed_reviews,Sentiment
0,Hey Dude,Wally Linen Natural,$64.99,5,bought son love wear almost everi day definit ...,Positive
1,Hey Dude,Wally Linen Natural,$64.99,5,get lot compliment,Positive
2,Hey Dude,Wally Linen Natural,$64.99,5,love dude,Positive
3,Hey Dude,Wally Linen Natural,$64.99,5,love,Positive
4,Hey Dude,Wally Linen Natural,$64.99,5,probabl favorit,Positive


In [12]:


# Set visualization style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Create the DataFrame from the provided data


# Basic data preparation
# Convert price to numeric
df['price_numeric'] = df['price'].str.replace('$', '').astype(float)

# Check for missing values 
print("Dataset Shape:", df.shape)
print("\nDataset Information:")
print(df.info())

print("\nSummary Statistics:")
print(df.describe(include='all'))

print("\nMissing Values:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0] if any(missing_values > 0) else "No missing values")

# Add word count in reviews
df['word_count'] = df['processed_reviews'].str.split().str.len()

# Data Analysis
print("\n--- Distribution Analyses ---")

# Rating Distribution
print("\nRating Distribution:")
rating_counts = df['Ratings'].value_counts().sort_index()
print(rating_counts)

plt.figure(figsize=(10, 6))
sns.countplot(x='Ratings', data=df, order=sorted(df['Ratings'].unique()))
plt.title('Distribution of Ratings')
plt.xlabel('Rating (1-5 stars)')
plt.ylabel('Count')
plt.savefig('rating_distribution.png')
plt.close()

# Price Distribution
plt.figure(figsize=(12, 6))
sns.histplot(df['price_numeric'], bins=10, kde=True)
plt.title('Price Distribution')
plt.xlabel('Price ($)')
plt.ylabel('Frequency')
plt.savefig('price_distribution.png')
plt.close()

print(f"\nPrice Range: ${df['price_numeric'].min()} - ${df['price_numeric'].max()}")
print(f"Average Price: ${df['price_numeric'].mean():.2f}")
print(f"Median Price: ${df['price_numeric'].median():.2f}")

# Brand Analysis
print("\nBrands by Frequency:")
brand_counts = df['brand'].value_counts()
print(brand_counts)

plt.figure(figsize=(10, 6))
brand_counts.plot(kind='bar')
plt.title('Brands by Frequency')
plt.xlabel('Brand')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('brands_frequency.png')
plt.close()

# Model Analysis
print("\nModels by Frequency:")
model_counts = df['model'].value_counts()
print(model_counts)

plt.figure(figsize=(12, 6))
model_counts.head(10).plot(kind='bar')
plt.title('Models by Frequency')
plt.xlabel('Model')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('models_frequency.png')
plt.close()

# Rating by Brand Analysis
print("\nAverage Rating by Brand:")
avg_rating_by_brand = df.groupby('brand')['Ratings'].mean().sort_values(ascending=False)
print(avg_rating_by_brand)

plt.figure(figsize=(12, 8))  # Increased figure size for better readability 
sns.barplot(x=avg_rating_by_brand.head(20).index, y=avg_rating_by_brand.head(20).values)
plt.title('Average Rating by Top 20 Brands')
plt.xlabel('Brand')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('avg_rating_by_brand.png')
plt.close()

# Price by Brand Analysis
print("\nAverage Price by Brand:")
avg_price_by_brand = df.groupby('brand')['price_numeric'].mean().sort_values(ascending=False)
print(avg_price_by_brand)

plt.figure(figsize=(12, 8))  # Increased figure size
sns.barplot(x=avg_price_by_brand.head(20).index, y=avg_price_by_brand.head(20).values)
plt.title('Average Price by Top 20 Brands')
plt.xlabel('Brand')
plt.ylabel('Average Price ($)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('avg_price_by_brand.png')
plt.close()

# Price by Model Analysis
print("\nAverage Price by Model:")
avg_price_by_model = df.groupby('model')['price_numeric'].mean().sort_values(ascending=False)
print(avg_price_by_model)

plt.figure(figsize=(14, 8))  # Even larger figure for model names
sns.barplot(x=avg_price_by_model.head(20).index, y=avg_price_by_model.head(20).values)
plt.title('Average Price by Top 20 Models')
plt.xlabel('Model')
plt.ylabel('Average Price ($)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('avg_price_by_model.png')
plt.close()

# Correlation between price and rating
print("\nCorrelation between Price and Rating:")
corr = df[['price_numeric', 'Ratings']].corr()
print(corr)

plt.figure(figsize=(10, 6))
sns.scatterplot(x='price_numeric', y='Ratings', data=df, alpha=0.6)
plt.title('Price vs. Rating')
plt.xlabel('Price ($)')
plt.ylabel('Rating')
plt.savefig('price_vs_rating.png')
plt.close()

# Review Text Analysis
print("\nReview Text Analysis:")
print("Average word count in reviews:", df['word_count'].mean())

plt.figure(figsize=(10, 6))
sns.histplot(df['word_count'], bins=10, kde=True)
plt.title('Word Count Distribution in Reviews')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.savefig('word_count_distribution.png')
plt.close()

# Word frequency analysis
all_reviews = ' '.join(df['processed_reviews'])
words = all_reviews.split()
word_counts = Counter(words).most_common(20)

print("\nMost common words in reviews:")
print(word_counts)

# Create DataFrame for word count visualization
word_df = pd.DataFrame(word_counts, columns=['word', 'count'])

plt.figure(figsize=(12, 6))
sns.barplot(x='word', y='count', data=word_df)
plt.title('Most Common Words in Reviews')
plt.xlabel('Word')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('common_words.png')
plt.close()

# Sentiment Analysis (since all are positive in this dataset, just showing distribution)
print("\nSentiment Distribution:")
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('sentiment_distribution.png')
plt.close()

# Correlation Matrix for Numeric Features
numeric_cols = ['Ratings', 'price_numeric', 'word_count']
numeric_df = df[numeric_cols]

plt.figure(figsize=(8, 6))
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

print("\nCorrelation Matrix:")
print(correlation_matrix)

# Advanced Analysis: Term Frequency Analysis by Brand
print("\nTop terms by brand:")
vectorizer = CountVectorizer(max_features=20)

for brand in df['brand'].unique():
    brand_reviews = ' '.join(df[df['brand'] == brand]['processed_reviews'])
    
    # Skip if no reviews
    if not brand_reviews:
        continue
        
    X = vectorizer.fit_transform([brand_reviews])
    terms = vectorizer.get_feature_names_out()
    frequencies = X.toarray()[0]
    
    # Create DataFrame for term frequencies
    term_freq = pd.DataFrame({'term': terms, 'frequency': frequencies})
    term_freq = term_freq.sort_values('frequency', ascending=False)
    
    print(f"\nTop 10 terms for {brand}:")
    print(term_freq.head(10))
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x='term', y='frequency', data=term_freq.head(10))
    plt.title(f'Top 10 Terms for {brand}')
    plt.xlabel('Term')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'{brand}_top_terms.png')
    plt.close()

# Advanced Analysis: Rating vs Word Count
plt.figure(figsize=(10, 6))
sns.boxplot(x='Ratings', y='word_count', data=df)
plt.title('Word Count by Rating')
plt.xlabel('Rating')
plt.ylabel('Word Count')
plt.savefig('word_count_by_rating.png')
plt.close()

# Model comparison by brand
print("\nModel distribution by brand:")
model_by_brand = pd.crosstab(df['brand'], df['model'])
print(model_by_brand)

# Grouped bar chart for price by model and brand
plt.figure(figsize=(14, 8))
sns.barplot(x='model', y='price_numeric', hue='brand', data=df)
plt.title('Price by Model and Brand')
plt.xlabel('Model')
plt.ylabel('Price ($)')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Brand')
plt.tight_layout()
plt.savefig('price_by_model_brand.png')
plt.close()

# Print summary of findings
print("\n=== Summary of Findings ===")
print(f"1. Dataset contains information on {df.shape[0]} shoe reviews.")
print(f"2. Average rating across all shoes is {df['Ratings'].mean():.2f} out of 5 stars.")
print(f"3. Average price is ${df['price_numeric'].mean():.2f}.")

if not brand_counts.empty:
    print(f"4. Most popular brand is {brand_counts.index[0]} with {brand_counts.iloc[0]} products.")

if not avg_price_by_brand.empty:
    print(f"5. Most expensive brand on average is {avg_price_by_brand.index[0]} at ${avg_price_by_brand.iloc[0]:.2f}.")

if not avg_rating_by_brand.empty:
    print(f"6. All brands maintain high ratings with most at {avg_rating_by_brand.iloc[0]:.2f} stars.")

print(f"7. Correlation between price and rating is {corr.loc['price_numeric', 'Ratings']:.3f}.")
print(f"8. Average word count in reviews is {df['word_count'].mean():.1f} words.")
print(f"9. All reviews in the dataset have a {sentiment_counts.index[0]} sentiment.")
print(f"10. Most common words in reviews include: {', '.join([word for word, count in word_counts[:5]])}")

print("\nEDA complete. Visualizations saved to files.")

Dataset Shape: (2959, 7)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2959 entries, 0 to 2958
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   brand              2959 non-null   object 
 1   model              2959 non-null   object 
 2   price              2959 non-null   object 
 3   Ratings            2959 non-null   int64  
 4   processed_reviews  2959 non-null   object 
 5   Sentiment          2959 non-null   object 
 6   price_numeric      2959 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 161.9+ KB
None

Summary Statistics:
           brand      model   price      Ratings processed_reviews Sentiment  \
count       2959       2959    2959  2959.000000              2959      2959   
unique       109        757     383          NaN              2832         3   
top     SKECHERS  Authentic  $79.95          NaN        great shoe  Positive   
freq 

  plt.tight_layout()
  plt.tight_layout()
  plt.savefig('price_by_model_brand.png')



=== Summary of Findings ===
1. Dataset contains information on 2959 shoe reviews.
2. Average rating across all shoes is 4.34 out of 5 stars.
3. Average price is $97.40.
4. Most popular brand is SKECHERS with 325 products.
5. Most expensive brand on average is To Boot New York at $399.00.
6. All brands maintain high ratings with most at 5.00 stars.
7. Correlation between price and rating is -0.020.
8. Average word count in reviews is 19.0 words.
9. All reviews in the dataset have a Positive sentiment.
10. Most common words in reviews include: shoe, comfort, size, great, wear

EDA complete. Visualizations saved to files.


In [13]:
import nltk
nltk.download('stopwords')
# Word cloud of reviews
try:
    from wordcloud import WordCloud
    import re
    from collections import Counter
    
    all_reviews = ' '.join(df['processed_reviews'])
    
    # Remove common English stopwords if nltk is installed
    try:
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        
        def clean_text(text):
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            words = text.split()
            words = [word for word in words if word not in stop_words]
            return ' '.join(words)
        
        cleaned_reviews = clean_text(all_reviews)
    except:
        # If nltk is not available, just do basic cleaning
        cleaned_reviews = all_reviews.lower()
        cleaned_reviews = re.sub(r'[^\w\s]', '', cleaned_reviews)
    
    wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(cleaned_reviews)
    
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Reviews')
    plt.savefig('review_wordcloud.png')
    plt.close()
    
    # Most common words
    words = cleaned_reviews.split()
    word_counts = Counter(words).most_common(20)
    
    print("\nMost common words in reviews:")
    print(word_counts)
    
    # Create DataFrame for word count visualization
    word_df = pd.DataFrame(word_counts, columns=['word', 'count'])
    
    plt.figure(figsize=(12, 6))
    sns.barplot(x='word', y='count', data=word_df)
    plt.title('Most Common Words in Reviews')
    plt.xlabel('Word')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('common_words.png')
    plt.close()
except ImportError:
    print("WordCloud not installed. Skipping word cloud generation.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adedi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Most common words in reviews:
[('shoe', 2866), ('comfort', 1187), ('size', 928), ('great', 846), ('wear', 820), ('fit', 796), ('look', 783), ('pair', 637), ('like', 612), ('love', 576), ('feet', 439), ('good', 425), ('feel', 336), ('order', 335), ('walk', 331), ('wide', 323), ('would', 319), ('day', 315), ('get', 310), ('im', 309)]
