# 📱 Google Play Store Data Analysis
This project analyzes free Android apps available on the Google Play Store to identify app profiles that are more likely to attract users. This helps developers at an ad-based company make data-driven decisions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style globally so the plots below share one look.
sns.set(style='whitegrid')

## 📥 Load and Inspect Data

In [None]:
# Relative locations of the two raw CSV files.
store_csv_path = '../data/googleplaystore.csv'
reviews_csv_path = '../data/googleplaystore_user_reviews.csv'

# Read the app listings and the user reviews into separate DataFrames.
store_df = pd.read_csv(store_csv_path)
reviews_df = pd.read_csv(reviews_csv_path)

# Preview the first rows of the app listings.
store_df.head()

## 🧹 Data Cleaning
We'll clean the dataset to focus only on free apps with valid ratings, installs, and reviews.

In [None]:

# Build a cleaned table without modifying store_df: exact duplicate rows are
# removed, rows missing any required field are dropped, and only apps whose
# Type is 'Free' are kept.
required_columns = ['Rating', 'Reviews', 'Installs', 'Type']
store_clean = store_df.drop_duplicates().dropna(subset=required_columns)
# .copy() makes the filtered frame independent before columns are reassigned.
store_clean = store_clean[store_clean['Type'] == 'Free'].copy()

# Installs is stored as text; '+' and ',' are stripped before parsing.
# Both count columns are parsed with errors='coerce' so an unparseable value
# becomes NaN and its row is dropped, instead of raising halfway through.
installs_text = store_clean['Installs'].str.replace('[+,]', '', regex=True)
store_clean['Installs'] = pd.to_numeric(installs_text, errors='coerce')
store_clean['Reviews'] = pd.to_numeric(store_clean['Reviews'], errors='coerce')
store_clean = store_clean.dropna(subset=['Installs', 'Reviews'])

# Explicit 64-bit integers so the counts (and later sums over them) do not
# depend on the platform's default integer width.
store_clean = store_clean.astype({'Installs': 'int64', 'Reviews': 'int64'})
store_clean = store_clean.reset_index(drop=True)

# Print a summary of the cleaned columns and their dtypes.
store_clean.info()


## 📊 Top Categories by Total Installs

In [None]:

# Total installs per category, ranked from most to least installed.
installs_by_category = store_clean.groupby('Category')['Installs'].sum()
top_categories = installs_by_category.sort_values(ascending=False).reset_index()

# Show the ten leading categories.
top_categories.head(10)


In [None]:

# Bar chart of the ten categories with the highest install totals,
# drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top_categories.head(10),
    x='Installs',
    y='Category',
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Categories by Installs')
ax.set_xlabel('Total Installs')
ax.set_ylabel('Category')
fig.tight_layout()
plt.show()


## 🌟 Ratings vs Installs
Are highly rated apps downloaded more?

In [None]:

# Scatter of each app's rating against its install count; points are
# semi-transparent (alpha=0.5) so overlapping apps remain visible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=store_clean, x='Rating', y='Installs', alpha=0.5, ax=ax)
ax.set_title('Rating vs Installs')
ax.set_xlabel('App Rating')
ax.set_ylabel('Number of Installs')
fig.tight_layout()
plt.show()


## 🎮 Genre-wise Average Installs

In [None]:

# Mean installs per genre, ranked from highest to lowest average.
mean_installs_by_genre = store_clean.groupby('Genres')['Installs'].mean()
genre_avg = mean_installs_by_genre.sort_values(ascending=False).reset_index()

# Show the ten genres with the highest average installs.
genre_avg.head(10)
