# Amazon Sales Data

**Description:**

This dataset contains information on 1K+ Amazon products, including their ratings, reviews, and other details.

In [None]:
import pandas as pd

# Load the Amazon product data into `df`, which the following cells reuse.
# The path is relative (no leading "/") so it matches every other read of
# this file in the notebook; a leading slash would point at the filesystem root.
df = pd.read_csv('amazon.csv')

1. What is the average rating for each product category?

In [None]:
# Turn the rating column into numbers; anything that cannot be parsed becomes
# NaN and is therefore left out of the per-category mean below.
numeric_rating = pd.to_numeric(df['rating'], errors='coerce')
df['rating'] = numeric_rating

# Mean rating per product category (grouping the Series by the category column).
average_ratings = df['rating'].groupby(df['category']).mean()

print("Average Rating by Product Category:")
print(average_ratings.round(2))

3. What is the distribution of discounted prices vs. actual prices?

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Scatter plot of discounted price against actual price for every product.
df = pd.read_csv('amazon.csv')

# Quick data-quality check: number of missing values in each column.
print(df.isnull().sum())

# Plotting text values would give categorical axes instead of a numeric scale,
# so strip everything except digits and the decimal point and convert to
# numbers (unparseable entries become NaN and are simply not drawn).
# NOTE(review): presumably the raw price columns carry a currency symbol and
# thousands separators - confirm against the CSV.
price_columns = ['actual_price', 'discounted_price']
prices = df[price_columns].apply(
    lambda col: pd.to_numeric(
        col.astype(str).str.replace(r'[^0-9.]', '', regex=True),
        errors='coerce',
    )
)

plt.figure(figsize=(12, 8))
plt.scatter(
    prices['actual_price'],
    prices['discounted_price'],
    alpha=0.7,
    c='blue',
    edgecolors='black',
)
plt.title('Distribution of Discounted Prices vs. Actual Prices')
# NOTE(review): the "(USD)" unit is not established anywhere in this notebook -
# confirm the dataset's currency.
plt.xlabel('Actual Price (USD)', fontsize=12)
plt.ylabel('Discounted Price', fontsize=12)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()


4. How does the average discount percentage vary across categories?

In [None]:
import pandas as pd

# Average discount percentage for each product category.
#
# The conversion goes through astype(str) + to_numeric so the cell is
# idempotent: on the first run it strips the trailing "%" from the text
# values, and on a re-run (when the column is already numeric) it is a no-op
# instead of failing because `.str` is unavailable on a float column.
# Values that cannot be parsed become NaN and are ignored by mean().
df['discount_percentage'] = pd.to_numeric(
    df['discount_percentage'].astype(str).str.rstrip('%'),
    errors='coerce',
)

average_discount_per_category = df.groupby('category')['discount_percentage'].mean()

# One-column frame so the result prints as a labelled table.
result_df = average_discount_per_category.to_frame('average_discount')

print(result_df)

5. What are the most popular product names?

In [None]:
# Count how often each product name occurs (value_counts sorts by frequency,
# highest first) and keep the ten most frequent names.
name_counts = df['product_name'].value_counts()
popular_products = name_counts.iloc[:10]
print(popular_products)

6.  What are the most popular product keywords?

In [None]:
# Most frequent keywords across all product names.
#
# Counting whole product names only repeats the "popular product names"
# result, so each name is split into lower-cased words and the individual
# words are counted instead.
filler_words = {'and', 'for', 'with', 'the', 'from', 'your'}

keywords = (
    df['product_name']
    .dropna()
    .astype(str)
    .str.lower()
    .str.findall(r'[a-z]+')   # list of alphabetic words per product name
    .explode()                # one row per word
)

# Drop very short tokens and generic filler words so the ranking reflects
# descriptive terms rather than connectives.
is_keyword = (keywords.str.len() > 2) & ~keywords.isin(filler_words)
popular_product_keywords = keywords[is_keyword].value_counts().head(10)
print(popular_product_keywords)

7.  What are the most popular product reviews?

In [None]:
# Product / review_id combinations that occur most often in the data.
df = pd.read_csv('amazon.csv')

# value_counts() on a groupby orders the counts inside each product group,
# with the groups themselves ordered by product name. Taking head(5) directly
# would therefore return the first products alphabetically, so sort by count
# across all groups before keeping the top five.
most_popular_reviews = (
    df.groupby('product_name')['review_id']
    .value_counts()
    .sort_values(ascending=False)
    .head(5)
)
print(most_popular_reviews)

8. What is the correlation between discounted_price and rating?

In [None]:
import pandas as pd
import numpy as np

# Pearson correlation between discounted price and rating.
df = pd.read_csv('amazon.csv')

# Remove everything except digits and the decimal point before converting, so
# formatted prices are parsed instead of being coerced to NaN (which would
# make the correlation itself NaN).
# NOTE(review): presumably the raw prices include a currency symbol and
# thousands separators - confirm against the CSV.
df['discounted_price'] = pd.to_numeric(
    df['discounted_price'].astype(str).str.replace(r'[^0-9.]', '', regex=True),
    errors='coerce',
)
# Non-numeric rating entries become NaN; corr() skips pairs containing NaN.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

correlation_pandas = df['discounted_price'].corr(df['rating'])

print(f"Correlation using pandas: {correlation_pandas}")

9. What are the Top 5 categories based on the highest ratings?

In [None]:
import pandas as pd
import numpy as np

# Top five categories ranked by their average rating.
df = pd.read_csv('amazon.csv')

# Ratings must be numeric before they can be averaged; unparseable entries
# become NaN and are ignored by mean().
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Average the rating per category and keep the five highest averages.
# (Counting rating values per category and taking the first rows would only
# list the alphabetically first categories, not the best-rated ones.)
top_5_categories_highest_rating = (
    df.groupby('category')['rating']
    .mean()
    .nlargest(5)
)
print(top_5_categories_highest_rating)

10. Identify any potential areas for improvement or optimization based on the data analysis.

#

# Spotify Data

**Description of the Dataset:**

*The dataset titled "Spotify Data: Popular Hip-hop Artists and Tracks" provides a curated collection of approximately 500 entries showcasing the vibrant realm of hip-hop music. These entries meticulously compile the most celebrated hip-hop tracks and artists, reflecting their significant influence on the genre's landscape. Each entry not only highlights the popularity and musical composition of the tracks but also underscores the creative prowess of the artists and their profound impact on global listeners.*

1. Load the dataframe and ensure data quality by checking for missing values and duplicate rows. Handle missing values and remove duplicate rows if necessary.

In [None]:
import pandas as pd

# Load the Spotify data and report basic data-quality figures.
df = pd.read_csv('spotify.csv')

# Missing values per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Number of rows that are exact duplicates of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()


2. What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Histogram of track popularity, drawn through the explicit Axes interface.
df = pd.read_csv('spotify.csv')

popularity = df['Popularity']

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(popularity, bins=10, edgecolor='black')
ax.set_xlabel('Popularity')
ax.set_ylabel('Number of Tracks')
ax.set_title('Distribution of Popularity in the Dataset')
# Horizontal guide lines only, so bar heights are easy to read off.
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

3. Is there any relationship between the popularity and the duration of tracks? Explore this using a scatter plot.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Scatter plot of track popularity against track duration.
df = pd.read_csv('spotify.csv')

popularity = df['Popularity']
duration_ms = df['Duration (ms)']

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(duration_ms, popularity, alpha=0.7)

# Axis labels and title
ax.set_xlabel('Duration (ms)')
ax.set_ylabel('Popularity')
ax.set_title('Popularity vs. Duration of Tracks')

ax.grid(True)

plt.show()


4. Which artist has the highest number of tracks in the dataset? Display the count of tracks for each artist using a countplot.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Bar chart of the number of tracks per artist, with the maximum highlighted.
df = pd.read_csv('spotify.csv')

# Track count for every artist, most frequent first.
artist_counts = df['Artist'].value_counts()

most_tracks_artist = artist_counts.idxmax()
number_of_tracks = artist_counts.max()

fig, ax = plt.subplots(figsize=(8, 5))

artist_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Artist')
ax.set_ylabel('Track Count')
ax.set_title('Track Count per Artist')

# Slanted tick labels so long artist names stay readable.
plt.xticks(rotation=45, ha='right')

# Dashed reference line at the highest track count.
ax.axhline(
    y=number_of_tracks,
    color='red',
    linestyle='--',
    label=f'{number_of_tracks} Tracks (Most)',
)

ax.legend()

fig.tight_layout()
plt.show()

print(f"Artist with the highest number of tracks: {most_tracks_artist} ({number_of_tracks} tracks)")


5. What are the top 5 least popular tracks in the dataset? Provide the artist name and track name for each.

In [None]:
import pandas as pd
# List the five tracks with the lowest popularity as "artist - track name".
df = pd.read_csv('spotify.csv')
least_popular = df.nsmallest(5, 'Popularity')

artist_names = least_popular['Artist'].tolist()
track_names = least_popular['Track Name'].tolist()

print("Top 5 Least Popular Tracks:")
# Iterate over the rows actually returned instead of a fixed range(5), so a
# dataset with fewer than five tracks does not raise an IndexError.
for rank, (artist, track) in enumerate(zip(artist_names, track_names), start=1):
    print(f"{rank}. {artist} - {track}")


6. Among the top 5 most popular artists, which artist has the highest popularity on average? Calculate and display the average popularity for each artist.

In [None]:
import pandas as pd

# Average popularity for each of the five most popular artists.
df = pd.read_csv('spotify.csv')

# Select five ARTISTS, ranked by the popularity of their best track.
# (Taking the five most popular tracks instead could return fewer than five
# artists and would average over those few rows only.)
top_artists = (
    df.groupby('Artist')['Popularity']
    .max()
    .sort_values(ascending=False)
    .head(5)
    .index
)

# Mean popularity over ALL tracks of each selected artist, highest first.
average_popularity = (
    df[df['Artist'].isin(top_artists)]
    .groupby('Artist')['Popularity']
    .mean()
    .sort_values(ascending=False)
)

highest_avg_artist = average_popularity.idxmax()
highest_avg_popularity = average_popularity.max()

print("Average Popularity for Top 5 Artists:")
print(average_popularity)
print(f"\nArtist with Highest Average Popularity: {highest_avg_artist} ({highest_avg_popularity:.2f})")


7. For the top 5 most popular artists, what are their most popular tracks? List the track name for each artist.

In [None]:
import pandas as pd

# For the five artists with the highest peak popularity, find each artist's
# single most popular track.
df = pd.read_csv('spotify.csv')

# Peak popularity per artist, then the five artists with the highest peaks.
peak_by_artist = df.groupby('Artist')['Popularity'].max()
top_artists = peak_by_artist.sort_values(ascending=False).head(5).index

# Map artist -> name of the row holding that artist's maximum popularity.
most_popular_tracks = {
    artist: df.loc[
        df.loc[df['Artist'] == artist, 'Popularity'].idxmax(),
        'Track Name',
    ]
    for artist in top_artists
}

most_popular_tracks



8. Visualize relationships between multiple numerical variables simultaneously using a pair plot.

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Pair plot of the numerical columns of the Spotify data.
df = pd.read_csv('spotify.csv')

# Keep only numeric columns. select_dtypes(include='number') needs no numpy
# import in this cell.
numeric_df = df.select_dtypes(include='number')

# A pair plot needs at least two numeric variables to show a relationship.
# Report the problem instead of calling exit(), which would stop the kernel.
if numeric_df.shape[1] < 2:
    print("Error: Pair plots require at least two numerical columns.")
else:
    sns.pairplot(numeric_df)
    plt.show()


9. Does the duration of tracks vary significantly across different artists? Explore this visually using a box plot or violin plot.

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Box plot of track duration for each artist (means shown as markers).
df = pd.read_csv('spotify.csv')

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x="Artist", y="Duration (ms)", showmeans=True, ax=ax)
ax.set_xlabel("Artist")
ax.set_ylabel("Duration (ms)")
ax.set_title("Distribution of Track Duration Across Artists (Box Plot)")
# Slanted tick labels so artist names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()



10. How does the distribution of track popularity vary for different artists? Visualize this using a swarm plot or a violin plot.

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Swarm plot of track popularity for each artist, coloured by artist.
df = pd.read_csv('spotify.csv')

fig, ax = plt.subplots(figsize=(8, 6))
sns.swarmplot(data=df, x="Artist", y="Popularity", hue="Artist", size=8, ax=ax)
ax.set_xlabel("Artist")
ax.set_ylabel("Popularity")
ax.set_title("Distribution of Track Popularity Across Artists (Swarm Plot)")
# Slanted tick labels so artist names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()