In [8]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# loading data
try:
    data_path = '../data/googleplaystore.csv'
    print(f"Attempting to load data from: {data_path}")

    # --- Load the data ---
    df = pd.read_csv(data_path)

    # --- ESSENTIAL: Perform initial checks ---
    print("\n✅ Data loaded successfully!")
    print(f"Shape of the data (rows, columns): {df.shape}")

    print("\n--- First 5 Rows ---")
    display(df.head())

    print("\n--- Data Info (Column Names, Non-Null Counts, Data Types) ---")
    df.info()

except FileNotFoundError:
    print(f"❌ Error: The file was not found at the path: {data_path}")
    print("Please check that the file exists and the path is correct.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")


Attempting to load data from: ../data/googleplaystore.csv

✅ Data loaded successfully!
Shape of the data (rows, columns): (10841, 13)

--- First 5 Rows ---


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up



--- Data Info (Column Names, Non-Null Counts, Data Types) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [11]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [13]:
df.shape

(10841, 13)

In [14]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [15]:
# Create a copy to perform our cleaning operations on
df_cleaned = df.copy()

In [16]:
df_cleaned.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [18]:
median_rating=df_cleaned['Rating'].median()
df_cleaned['Rating']=df_cleaned['Rating'].fillna(median_rating)

In [19]:
df_cleaned.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

In [20]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [24]:
# Installs
df_cleaned['Installs'] = df_cleaned['Installs'].str.replace('+', '', regex=False).str.replace(',', '', regex=False)
# --- FIX: Handle the "Free" string before converting to numeric ---
df_cleaned['Installs'] = df_cleaned['Installs'].replace('Free', '0')
df_cleaned['Installs'] = pd.to_numeric(df_cleaned['Installs'])

In [26]:
# Size (using the recommended MB standardization)
def convert_size_to_mb(size):
    if isinstance(size, str):
        if 'M' in size:
            return float(size.replace('M', ''))
        if 'k' in size:
            return float(size.replace('k', '')) / 1024.0
    return np.nan

df_cleaned['Size'] = df_cleaned['Size'].apply(convert_size_to_mb)
median_size_mb = df_cleaned['Size'].median()
df_cleaned['Size'] = df_cleaned['Size'].fillna(median_size_mb)

In [27]:
df_cleaned = df_cleaned.drop_duplicates()


In [29]:
df_cleaned.tail(400)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10440,Forgotten Hill: Puppeteer,GAME,4.5,2917,22.0,100000,Free,0,Teen,Adventure,"October 30, 2017",2.6,2.3 and up
10441,Airplane Fly Hawaii,FAMILY,4.1,83427,16.0,1000000,Free,0,Everyone,Simulation,"January 30, 2015",2.6,4.1 and up
10442,FiSwitch,TOOLS,4.8,615,2.0,10000,Paid,1.99,Everyone,Tools,"July 24, 2018",5.2,5.0 and up
10443,Signal Info,TOOLS,4.6,424,3.5,10000,Free,0,Everyone,Tools,"December 19, 2017",0.11,6.0 and up
10444,Signal Spy - Monitor Signal Strength & Data Usage,TOOLS,4.4,875,8.5,100000,Free,0,Everyone,Tools,"December 31, 2017",1.9.9.5,5.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,4.3,3,9.5,1000,Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,13.0,1000,Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [30]:
text_columns = ['App', 'Category', 'Genres', 'Content Rating']

for col in text_columns:
    # 1. Convert to lowercase
    df_cleaned[col] = df_cleaned[col].str.lower()
    # 2. Strip leading/trailing whitespace
    df_cleaned[col] = df_cleaned[col].str.strip()
    # 3. Replace internal multiple spaces with a single space
    df_cleaned[col] = df_cleaned[col].str.replace(r'\s+', ' ', regex=True)

print("--- Text Columns After Cleaning ---")
display(df_cleaned[['App', 'Category', 'Genres']].head())

--- Text Columns After Cleaning ---


Unnamed: 0,App,Category,Genres
0,photo editor & candy camera & grid & scrapbook,art_and_design,art & design
1,coloring book moana,art_and_design,art & design;pretend play
2,"u launcher lite – free live cool themes, hide ...",art_and_design,art & design
3,sketch - draw & paint,art_and_design,art & design
4,pixel draw - number art coloring book,art_and_design,art & design;creativity


In [31]:
df_cleaned['Last Updated'] = pd.to_datetime(df_cleaned['Last Updated'])

# Get the current date
from datetime import datetime
today_date = datetime.now()

# Create the new feature
df_cleaned['Days Since Last Updated'] = (today_date - df_cleaned['Last Updated']).dt.days

# Let's see the new column
print("\n--- New Feature: Days Since Last Updated ---")
display(df_cleaned[['App', 'Last Updated', 'Days Since Last Updated']].head())

ValueError: time data "1.0.19" doesn't match format "%B %d, %Y", at position 1348. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
df_cleaned.loc[]

App               pregnancy & baby tracker
Category                health_and_fitness
Rating                                 4.6
Reviews                              48286
Size                                  13.0
Installs                           1000000
Type                                  Free
Price                                    0
Content Rating                    everyone
Genres                    health & fitness
Last Updated                 July 27, 2018
Current Ver             Varies with device
Android Ver             Varies with device
Name: 1348, dtype: object