# In [None]:
# ============================================
# Google Play Store Data Analytics Project
# Level 2 – Project 4
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --------------------------------------------
# 1. Load Datasets
# --------------------------------------------
# Read the app catalogue and the user-review table from CSV files in the
# current working directory, then report each table's (rows, columns).
apps, reviews = (
    pd.read_csv(csv_name) for csv_name in ("apps.csv", "user_reviews.csv")
)

print("Apps dataset shape:", apps.shape)
print("Reviews dataset shape:", reviews.shape)

# --------------------------------------------
# 2. Data Cleaning
# --------------------------------------------

# Clean Installs column: drop the "+" suffix and "," separators so the
# text parses as a number; anything still non-numeric becomes NaN.
apps['Installs'] = pd.to_numeric(
    apps['Installs'].astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

# Clean Price column: drop the "$" sign, then parse; failures become NaN.
apps['Price'] = pd.to_numeric(
    apps['Price'].astype(str).str.replace('$', '', regex=False),
    errors='coerce',
)

# Clean Size column and normalise it to megabytes.
# Sizes carry an "M" or "k" suffix. Stripping both suffixes and keeping the
# bare number would store kilobyte values as if they were megabytes, so the
# k-suffixed rows are remembered and rescaled (1 MB = 1024 kB).
# Non-numeric text such as "Varies with device" fails to parse and is
# coerced to NaN.
size_text = apps['Size'].astype(str).str.strip()
size_in_kb = size_text.str.endswith('k')
size_number = pd.to_numeric(
    size_text.str.replace('M', '', regex=False)
    .str.replace('k', '', regex=False),
    errors='coerce',
)
apps['Size'] = size_number.where(~size_in_kb, size_number / 1024)

# Drop missing values.
# NOTE: this removes every row with a NaN in *any* column, which includes
# all rows whose Size could not be parsed above.
apps.dropna(inplace=True)

print("Cleaned dataset shape:", apps.shape)

# --------------------------------------------
# 3. Category Exploration
# --------------------------------------------
# Bar chart of the ten categories that contain the most apps.
top_categories = apps['Category'].value_counts().iloc[:10]

plt.figure()
top_categories.plot.bar()
plt.xlabel("Category")
plt.ylabel("Number of Apps")
plt.title("Top 10 App Categories")
plt.show()

# --------------------------------------------
# 4. Metrics Analysis
# --------------------------------------------

# Ratings distribution: a single box plot of the Rating column, drawn on
# the axes of a freshly created figure.
rating_axes = plt.figure().gca()
apps.boxplot(column='Rating', ax=rating_axes)
rating_axes.set_title("App Ratings Distribution")
rating_axes.set_ylabel("Rating")
plt.show()

# Average installs by category: compute the mean install count per
# category and chart the ten categories with the highest mean.
mean_installs = apps.groupby('Category')['Installs'].mean()
top_mean_installs = mean_installs.sort_values(ascending=False).iloc[:10]

plt.figure()
top_mean_installs.plot.bar()
plt.xlabel("Category")
plt.ylabel("Average Installs")
plt.title("Average Installs by Category")
plt.show()

# --------------------------------------------
# 5. Sentiment Analysis
# --------------------------------------------
# Bar chart of how many reviews carry each Sentiment label
# (value_counts skips missing labels).
sentiment_counts = reviews['Sentiment'].value_counts()

plt.figure()
sentiment_counts.plot.bar()
plt.xlabel("Sentiment")
plt.ylabel("Number of Reviews")
plt.title("User Review Sentiment Analysis")
plt.show()

# --------------------------------------------
# 6. Key Insights Summary
# --------------------------------------------
# Headline figures computed from the cleaned apps table.
category_counts = apps['Category'].value_counts()
mean_rating = apps['Rating'].mean()
# Fraction of rows whose Type is 'Free'; 0 when that label never occurs.
free_fraction = apps['Type'].value_counts(normalize=True).get('Free', 0)

summary = {
    "Total Apps Analyzed": len(apps),
    "Most Popular Category": category_counts.idxmax(),
    "Average Rating": round(mean_rating, 2),
    "Free Apps Percentage": round(free_fraction * 100, 2),
}

# Print one "name: value" line per summary entry, in insertion order.
print("\nProject Summary:")
for key, value in summary.items():
    print(f"{key}: {value}")
