In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/google-play-store-apps/googleplaystore.csv
/kaggle/input/google-play-store-apps/license.txt
/kaggle/input/google-play-store-apps/googleplaystore_user_reviews.csv


**Step 1: Select a real-world dataset**

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("/kaggle/input/google-play-store-apps/googleplaystore.csv")  # Load dataset

In [4]:
df.head()  # Display first 5 rows

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


**Step 2: Perform data preparation & cleaning**

In [5]:
df.shape  # Shape of dataset (rows, cols)

(10841, 13)

In [6]:
df.info()  # Info about datatypes & missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [7]:
df.describe()  # Summary statistics for numeric columns

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [8]:
df.isnull().sum()  # Missing values count

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [9]:
df.drop_duplicates(inplace=True)  # Drop duplicates

In [10]:
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")  # Cleaning 'Rating' column & converting to numeric

In [11]:
df["Installs"] = (
    df["Installs"]
    .str.replace(r"[+,]", "", regex=True)   # remove + and ,
    .str.replace("Free", "", regex=False)   # remove 'Free'
    .str.strip()                            # remove extra spaces
)

In [12]:
df["Installs"] = pd.to_numeric(df["Installs"], errors="coerce")

In [13]:
df["Price"] = df["Price"].str.replace("$", "", regex=False).str.strip()  # Remove $ sign and strip spaces

df["Price"] = pd.to_numeric(df["Price"], errors="coerce")  # Convert to numeric safely (non-numeric like 'Everyone' → NaN)

In [14]:
def size_to_mb(x):  # Clean 'Size' column → convert to MB
    if isinstance(x, str):
        if "M" in x:
            return float(x.replace("M",""))
        elif "k" in x:
            return float(x.replace("k",""))/1024
        elif x == "Varies with device":
            return None
    return x

df["Size_MB"] = df["Size"].apply(size_to_mb)

In [15]:
df["Size_MB"] = pd.to_numeric(df["Size_MB"], errors="coerce")  # Convert Size_MB to numeric, turn invalid strings into NaN

df["Size_MB"] = df["Size_MB"].fillna(df["Size_MB"].mean())  # Now fill missing with mean

In [16]:
df["Rating"].fillna(df["Rating"].mean(), inplace=True)  # Fill missing values (Rating with mean, others with mode)
df["Size_MB"].fillna(df["Size_MB"].mean(), inplace=True)
df["Type"].fillna(df["Type"].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Rating"].fillna(df["Rating"].mean(), inplace=True)  # Fill missing values (Rating with mean, others with mode)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Size_MB"].fillna(df["Size_MB"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method 

**Step 3: Perform exploratory analysis & visualization Matplotlib - Resources**

In [17]:
plt.figure(figsize=(6,4))
sns.histplot(df["Rating"], bins=20, kde=True, color="blue")
plt.title("Distribution of App Ratings")
plt.show()

NameError: name 'plt' is not defined

In [None]:
top_categories = df["Category"].value_counts().head(10)
sns.barplot(y=top_categories.index, x=top_categories.values, palette="viridis")
plt.title("Top 10 App Categories")
plt.xlabel("Number of Apps")
plt.ylabel("Category")
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.scatterplot(x="Installs", y="Rating", data=df, alpha=0.5)
plt.xscale("log")
plt.title("App Installs vs Ratings")
plt.show()

In [None]:
df["Type"].value_counts().plot(kind="pie", autopct="%1.1f%%", startangle=90, colors=["skyblue","orange"])
plt.title("Free vs Paid Apps")
plt.ylabel("")
plt.show()

**Step 4: Ask & answer questions about the data**

**Q1: Which category has the most apps?**

In [None]:
most_apps = df["Category"].value_counts().idxmax()
print("Category with most apps:", most_apps)

**Q2: What is the average rating of apps?**

In [None]:
avg_rating = df["Rating"].mean()
print(f"Average App Rating: {avg_rating:.2f}")

**Q3: Do paid apps have higher ratings than free apps?**

In [None]:
sns.boxplot(x="Type", y="Rating", data=df, palette="Set2")
plt.title("App Ratings: Free vs Paid")
plt.show()

print(df.groupby("Type")["Rating"].mean())

**Q4: Which apps have the highest installs?**

In [None]:
top_installs = df[["App","Installs"]].sort_values(by="Installs", ascending=False).head(5)
print("Top Installed Apps:")
print(top_installs)

**Step 5: Summarize your inferences & write a conclusion**

Many columns have missing or inconsistent values. Size column has mixed formats like MB, KB, or “Varies with device”.
Installs column contains symbols like “+” and “,” that need cleaning. Price column sometimes has invalid entries mixed in. Most apps target “Everyone” as content rating.Productivity, Lifestyle, and Communication are also popular categories.
Only a small fraction of apps have ratings above 4.5. A few popular apps dominate download counts, while most apps have under 1M installs. Free apps rely heavily on ads and in-app purchases for revenue.