In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Read the raw review export into a DataFrame
data = pd.read_csv("/content/7817_1.csv")

# Preview the first rows so the column layout can be checked by eye
print("First 5 rows of the dataset:", data.head(), sep="\n")

print("\nData Cleaning and Preparation")

# Selecting relevant columns
selected_columns = ["reviews.text", "reviews.rating", "reviews.doRecommend", "reviews.date", "categories", "name"]
data_cleaned = data[selected_columns].copy()

# Display head of cleaned data
print(data_cleaned.head())

# Converting ratings to numeric.
# This runs BEFORE the missing-value filter: errors="coerce" turns any rating
# that cannot be parsed into NaN, and those rows must be dropped as well.
data_cleaned["reviews.rating"] = pd.to_numeric(data_cleaned["reviews.rating"], errors="coerce")

# Handling missing values (including ratings that were coerced to NaN above)
data_cleaned = data_cleaned.dropna(subset=["reviews.text", "reviews.rating", "categories"])

print("\nCleaned Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data_cleaned.info()

# Sentiment Analysis
print("\nSentiment Analysis")


def analyze_sentiment(review):
    """Label a review as "Positive", "Negative" or "Neutral".

    The label follows the sign of the polarity score that TextBlob computes
    for ``review``: above zero is "Positive", below zero is "Negative", and
    anything else is "Neutral".
    """
    score = TextBlob(review).sentiment.polarity
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Label every review text with its sentiment class
data_cleaned["sentiment"] = data_cleaned["reviews.text"].map(analyze_sentiment)

# Number of reviews per sentiment class
sentiment_distribution = data_cleaned["sentiment"].value_counts()
print("\nSentiment Distribution:", sentiment_distribution, sep="\n")

# Pie chart of the class shares, drawn on the figure created just before it
plt.figure(figsize=(6, 6))
pie_ax = sentiment_distribution.plot.pie(
    autopct="%.1f%%",
    startangle=90,
    labels=sentiment_distribution.index,
)
pie_ax.set_title("Sentiment Distribution")
pie_ax.set_ylabel("")  # blank out the y-axis label on the pie
plt.show()

# Review counts per (category, sentiment) pair, pivoted so that each sentiment
# class becomes a column; combinations that never occur are filled with 0
category_sentiment_counts = data_cleaned.groupby(["categories", "sentiment"]).size()
sentiment_by_category = category_sentiment_counts.unstack(fill_value=0)

# One stacked bar per category
category_ax = sentiment_by_category.plot.bar(stacked=True, figsize=(12, 6))
category_ax.set_title("Sentiment Distribution by Category")
category_ax.set_xlabel("Categories")
category_ax.set_ylabel("Number of Reviews")
category_ax.legend(title="Sentiment")
plt.tight_layout()
plt.show()

# Mean rating of each category, ordered from lowest to highest
ratings_grouped = data_cleaned.groupby("categories")["reviews.rating"]
avg_rating_by_category = ratings_grouped.mean().sort_values()

# Horizontal bars, one per category
rating_ax = avg_rating_by_category.plot.barh(figsize=(8, 6), color="skyblue")
rating_ax.set_title("Average Ratings by Category")
rating_ax.set_xlabel("Average Rating")
rating_ax.set_ylabel("Categories")
plt.tight_layout()
plt.show()

# Parse the review dates in place; values that cannot be parsed become NaT
data_cleaned["reviews.date"] = pd.to_datetime(data_cleaned["reviews.date"], errors="coerce")

# Count reviews per (calendar month, sentiment) and pivot sentiment into columns
review_month = data_cleaned["reviews.date"].dt.to_period("M")
monthly_sentiment_counts = data_cleaned.groupby([review_month, "sentiment"]).size()
sentiment_trends = monthly_sentiment_counts.unstack(fill_value=0)

# One line per sentiment class across the months
trend_ax = sentiment_trends.plot.line(figsize=(12, 6), marker="o")
trend_ax.set_title("Sentiment Trends Over Time")
trend_ax.set_xlabel("Date")
trend_ax.set_ylabel("Number of Reviews")
trend_ax.legend(title="Sentiment")
trend_ax.grid()
plt.tight_layout()
plt.show()

# Summary printout: the category/sentiment table first
print("\nKey Insights", "\nSentiment by Category:", sentiment_by_category, sep="\n")

# Categories at the two extremes of the average rating
highest_satisfaction = avg_rating_by_category.idxmax()
lowest_satisfaction = avg_rating_by_category.idxmin()
print(
    f"Category with Highest Satisfaction: {highest_satisfaction}",
    f"Category with Lowest Satisfaction: {lowest_satisfaction}",
    sep="\n",
)

# Feature Importance using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification  # not used in this section; kept in case later code relies on it

# Data Preparation for Random Forest
# Nothing earlier in this script adds "review_length" or "sentiment_score" to
# `data`, so selecting them directly raises KeyError. Both are derived from the
# review text here instead.
rf_rows = data.dropna(subset=["reviews.text"]).copy()
rf_rows["reviews.rating"] = pd.to_numeric(rf_rows["reviews.rating"], errors="coerce")
# Rows without a usable rating have no label: (NaN > 3) evaluates to False and
# would silently assign them to class 0, so they are removed.
rf_rows = rf_rows.dropna(subset=["reviews.rating"])

rf_text = rf_rows["reviews.text"].astype(str)
X = pd.DataFrame(index=rf_rows.index)
X["review_length"] = rf_text.str.len()
X["sentiment_score"] = rf_text.apply(lambda text: TextBlob(text).sentiment.polarity)

# The remaining features are read from the CSV as-is. They are coerced to
# numbers and only used when at least one value parses.
# NOTE(review): "prices" may hold non-numeric text in this export - confirm.
for raw_feature in ["reviews.numHelpful", "prices"]:
    if raw_feature not in rf_rows.columns:
        print(f"Skipping feature '{raw_feature}': column not present in the dataset")
        continue
    numeric_feature = pd.to_numeric(rf_rows[raw_feature], errors="coerce")
    if numeric_feature.notna().any():
        # Fill gaps with the median so rows with a missing value stay usable;
        # older scikit-learn releases reject NaN in the feature matrix.
        X[raw_feature] = numeric_feature.fillna(numeric_feature.median())
    else:
        print(f"Skipping feature '{raw_feature}': no numeric values found")

# Binary target: 1 when the rating is above 3, otherwise 0
y = (rf_rows["reviews.rating"] > 3).astype(int)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X, y)

# Get feature importances
importances = rf_model.feature_importances_
feature_names = X.columns

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, palette="viridis")
plt.title("Feature Importance in Random Forest Model")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

# Example Data Preparation
# NOTE: `ratings` and `review_lengths` are synthetic samples drawn from a seeded
# RNG. The scatterplot below does not read them; it uses data_cleaned columns.
np.random.seed(42)
ratings = np.random.choice([1, 2, 3, 4, 5], size=500)
review_lengths = np.random.exponential(scale=300, size=500)

# Character count of every review text
data_cleaned["review_length"] = data_cleaned["reviews.text"].str.len()

# Scatterplot for review length by rating
length_values = data_cleaned["review_length"]
rating_values = data_cleaned["reviews.rating"]
plt.scatter(length_values, rating_values, s=10, alpha=0.5, color="orange")
plt.xlabel("Review Length")
plt.ylabel("Rating")
plt.title("Review Length vs Rating")
plt.show()

# Correlation Heatmap
# Only numeric columns can be correlated. Calling data.corr() on the raw frame
# raises on text columns in pandas >= 2.0 (numeric_only defaults to False
# there), so the frame is restricted to numeric dtypes first.
correlation = data.select_dtypes(include="number").corr()

if correlation.empty:
    # Without any numeric column there is nothing for the heatmap to draw.
    print("No numeric columns available for the correlation heatmap")
else:
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title("Correlation Heatmap of Key Features")
    plt.show()
