In [23]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Set display and style
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# 1. Load dataset
df = pd.read_csv("vgsales.csv")
print(f"Original shape: {df.shape}")

# 2. Drop rows with missing critical data
df_clean = df.dropna(subset=["Name", "Platform", "Year", "Genre", "Publisher"])
df_clean["Year"] = df_clean["Year"].astype(int)
print(f"Cleaned shape: {df_clean.shape}")

# 3. Save cleaned dataset for dashboard
df_clean.to_csv("cleaned_vgsales.csv", index=False)
print("Saved cleaned_vgsales.csv")

# 4. Summary statistics
print(df_clean.describe())

# 5. Value counts
print("\nTop Platforms:\n", df_clean["Platform"].value_counts().head())
print("\nTop Genres:\n", df_clean["Genre"].value_counts().head())
print("\nTop Publishers:\n", df_clean["Publisher"].value_counts().head())

# 6. Global Sales Over Time
sales_per_year = df_clean.groupby("Year")["Global_Sales"].sum().reset_index()
sns.lineplot(data=sales_per_year, x="Year", y="Global_Sales", marker="o")
plt.title("Global Video Game Sales Over Years")
plt.xlabel("Year")
plt.ylabel("Sales (Millions)")
plt.tight_layout()
plt.savefig("global_sales_over_years.png")
plt.clf()

# 7. Top Genres by Global Sales
genre_sales = df_clean.groupby("Genre")["Global_Sales"].sum().sort_values(ascending=False).reset_index()
sns.barplot(data=genre_sales, x="Global_Sales", y="Genre", palette="viridis")
plt.title("Top Genres by Global Sales")
plt.xlabel("Sales (Millions)")
plt.ylabel("Genre")
plt.tight_layout()
plt.savefig("genre_sales_bar.png")
plt.clf()

# 8. Platform Popularity (Top 10)
platform_counts = df_clean["Platform"].value_counts().head(10).reset_index()
platform_counts.columns = ["Platform", "Count"]
sns.barplot(data=platform_counts, x="Platform", y="Count", palette="pastel")
plt.title("Top 10 Platforms by Game Count")
plt.tight_layout()
plt.savefig("platform_counts.png")
plt.clf()

# 9. Correlation Heatmap
numeric_cols = ["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]
corr = df_clean[numeric_cols].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm")
plt.title("Sales Correlation Heatmap")
plt.tight_layout()
plt.savefig("correlation_heatmap.png")
plt.clf()

# 10. Boxplot: Global Sales by Genre
sns.boxplot(data=df_clean, x="Genre", y="Global_Sales", palette="Set2")
plt.xticks(rotation=45)
plt.title("Global Sales Distribution by Genre")
plt.tight_layout()
plt.savefig("boxplot_genre_sales.png")
plt.clf()

# 11. Plotly Interactive Bar Chart (if you want to launch in browser)
fig = px.bar(
    genre_sales,
    x="Genre",
    y="Global_Sales",
    title="Interactive: Global Sales by Genre",
    color="Global_Sales",
    color_continuous_scale="Blues"
)
fig.write_html("interactive_genre_sales.html")
print("Saved interactive chart: interactive_genre_sales.html")


Original shape: (16598, 11)
Cleaned shape: (16291, 11)
Saved cleaned_vgsales.csv
               Rank          Year      NA_Sales      EU_Sales      JP_Sales  \
count  16291.000000  16291.000000  16291.000000  16291.000000  16291.000000   
mean    8290.190228   2006.405561      0.265647      0.147731      0.078833   
std     4792.654450      5.832412      0.822432      0.509303      0.311879   
min        1.000000   1980.000000      0.000000      0.000000      0.000000   
25%     4132.500000   2003.000000      0.000000      0.000000      0.000000   
50%     8292.000000   2007.000000      0.080000      0.020000      0.000000   
75%    12439.500000   2010.000000      0.240000      0.110000      0.040000   
max    16600.000000   2020.000000     41.490000     29.020000     10.220000   

        Other_Sales  Global_Sales  
count  16291.000000  16291.000000  
mean       0.048426      0.540910  
std        0.190083      1.567345  
min        0.000000      0.010000  
25%        0.000000      0.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




Saved interactive chart: interactive_genre_sales.html
