<p style="text-align:center; color:#0000ff; font-weight:bold; font-size:30px;">Google Play Store Analysis</p>

### **Importing Essential Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### **Reading CSV Files**

In [2]:
df = pd.read_csv('googleplaystore.csv')
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### **1. Summary of the Data**

In [3]:
#shape of the store data
df.shape

(10841, 13)

In [4]:
#concise summary of the store data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


### **2. Handling Null Values**

In [16]:
# Replace 'Varies with device' in df['size'] with NaN
df['Size'] = df['Size'].replace('Varies with device', pd.NA)

# Replace 'Unrated' in df['Content Rating'] with NaN
df['Content Rating'] = df['Content Rating'].replace('Unrated', pd.NA)

# Replace 'Varies with device' in df['Current Ver'] with NaN
df['Current Ver'] = df['Current Ver'].replace('Varies with device', pd.NA)

# Replace 'Varies with device' in df['Android Ver'] with NaN
df['Android Ver'] = df['Android Ver'].replace('Varies with device', pd.NA)

In [17]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size              1695
Installs             0
Type                 1
Price                0
Content Rating       3
Genres               0
Last Updated         0
Current Ver       1467
Android Ver       1365
dtype: int64

In [18]:
# Drop rows with null values
df.dropna(inplace=True)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up


In [19]:
df.shape

(7636, 13)

### **3. Correcting the Data types**

In [20]:
df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [21]:
# Convert 'Reviews' column to numeric
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')

# Remove the '+' and ',' from the 'Installs' column and convert it to numeric
df['Installs'] = df['Installs'].str.replace('+', '').str.replace(',', '').astype(int)

# Convert 'Price' column to numeric after removing the '$' sign
df['Price'] = df['Price'].str.replace('$', '').astype(float)

# Convert 'Last Updated' column to datetime format
df['Last Updated'] = pd.to_datetime(df['Last Updated'], format='%B %d, %Y')

##### Converting the size of all App's  to kilobytes

In [22]:
#Define a function to convert size from M to K
def convert_to_k(size):
    if 'M' in size:
        return float(size.replace('M', '')) * 1024
    elif 'k' in size:
        return float(size.replace('k', ''))

In [23]:
# Apply the conversion function to the 'Size' column
df['Size'] = df['Size'].apply(convert_to_k)

# Rename the 'Size' column to 'Size_KB'
df.rename(columns={'Size': 'Size_KB'}, inplace=True)

In [24]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size_KB,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19456.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14336.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8908.8,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2867.2,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5734.4,50000,Free,0.0,Everyone,Art & Design,2017-03-26,1.0,2.3 and up


In [25]:
df.dtypes

App                       object
Category                  object
Rating                   float64
Reviews                    int64
Size_KB                  float64
Installs                   int32
Type                      object
Price                    float64
Content Rating            object
Genres                    object
Last Updated      datetime64[ns]
Current Ver               object
Android Ver               object
dtype: object

### **4. Handling Duplicate Values**

In [26]:
df[df.duplicated()]

Unnamed: 0,App,Category,Rating,Reviews,Size_KB,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
256,ZOOM Cloud Meetings,BUSINESS,4.4,31614,37888.0,10000000,Free,0.0,Everyone,Business,2018-07-20,4.1.28165.0716,4.0 and up
266,Zenefits,BUSINESS,4.2,296,14336.0,50000,Free,0.0,Everyone,Business,2018-06-15,3.2.1,4.1 and up
267,Google Ads,BUSINESS,4.3,29313,20480.0,5000000,Free,0.0,Everyone,Business,2018-07-30,1.12.0,4.0.3 and up
270,FreshBooks Classic,BUSINESS,4.1,1802,26624.0,100000,Free,0.0,Everyone,Business,2018-04-18,1.7.14,4.2 and up
271,Insightly CRM,BUSINESS,3.8,1383,52224.0,100000,Free,0.0,Everyone,Business,2018-07-12,3.24.1,5.0 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7885,CT Scan Cross Sectional Anatomy,MEDICAL,4.3,10,47104.0,100,Free,0.0,Everyone,Medical,2018-07-19,5.0.16,4.1 and up
8632,Notepad & To do list,PRODUCTIVITY,4.3,226295,4300.8,10000000,Free,0.0,Everyone,Productivity,2018-06-06,4.3.19,2.3.3 and up
8635,"Polaris Office - Word, Docs, Sheets, Slide, PDF",PRODUCTIVITY,4.3,549900,61440.0,10000000,Free,0.0,Everyone,Productivity,2018-07-18,7.3.30,4.1 and up
10049,Airway Ex - Intubate. Anesthetize. Train.,MEDICAL,4.3,123,88064.0,10000,Free,0.0,Everyone,Medical,2018-06-01,0.6.88,5.0 and up


In [27]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)

In [28]:
df.shape

(7336, 13)

In [29]:
df['App'].value_counts().reset_index()

Unnamed: 0,App,count
0,ROBLOX,9
1,8 Ball Pool,7
2,Bubble Shooter,6
3,Helix Jump,6
4,Zombie Catchers,6
...,...,...
6947,Drag'n'Boom,1
6948,N-Com Wizard,1
6949,Tap N Pay,1
6950,Rope'n'Fly 3 - Dusk Till Dawn,1


In [30]:
df[df['App'] == 'ROBLOX']

Unnamed: 0,App,Category,Rating,Reviews,Size_KB,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1653,ROBLOX,GAME,4.5,4447388,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1701,ROBLOX,GAME,4.5,4447346,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1748,ROBLOX,GAME,4.5,4448791,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1841,ROBLOX,GAME,4.5,4449882,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
1870,ROBLOX,GAME,4.5,4449910,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
2016,ROBLOX,FAMILY,4.5,4449910,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
2088,ROBLOX,FAMILY,4.5,4450855,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
2206,ROBLOX,FAMILY,4.5,4450890,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up
4527,ROBLOX,FAMILY,4.5,4443407,68608.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up


   The rows above remain duplicates, differing solely in the 'reviews' column. To resolve this, we'll keep only the rows with the **highest number of reviews for 'App' count greater than 1** and **all the rows with count 1**.

In [31]:
#Filter the DataFrame for apps with counts greater than 1
app_counts = df['App'].value_counts()
duplicates = app_counts[app_counts >1].index

#Find the row index corresponding to the maximum review count for each app
max_review_indices = df[df['App'].isin(duplicates)].groupby('App')['Reviews'].idxmax()

#Create a list of row indices to retain
indices_to_keep = list(max_review_indices)

#Filter the original DataFrame to retain only the rows with maximum reviews for each app
df_max_reviews = df.loc[indices_to_keep]

df_max_reviews

Unnamed: 0,App,Category,Rating,Reviews,Size_KB,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
3083,365Scores - Live Scores,SPORTS,4.6,666521,25600.0,10000000,Free,0.0,Everyone,Sports,2018-07-29,5.5.9,4.1 and up
1871,8 Ball Pool,GAME,4.5,14201891,53248.0,100000000,Free,0.0,Everyone,Sports,2018-07-31,4.0.0,4.0.3 and up
662,95Live -SG#1 Live Streaming App,DATING,4.1,4954,15360.0,1000000,Free,0.0,Teen,Dating,2018-08-01,8.7.2,4.2 and up
4991,A&E - Watch Full Episodes of TV Shows,FAMILY,4.0,29708,19456.0,1000000,Free,0.0,Teen,Entertainment,2018-07-16,3.1.4,4.4 and up
3799,AC - Tips & News for Android™,NEWS_AND_MAGAZINES,4.2,23292,14336.0,1000000,Free,0.0,Everyone 10+,News & Magazines,2018-05-24,3.1.12,4.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,"ooVoo Video Calls, Messaging & Stories",SOCIAL,4.3,1157004,34816.0,50000000,Free,0.0,Everyone,Social,2017-10-16,4.2.1,4.3 and up
3334,osmino Wi-Fi: free WiFi,TOOLS,4.2,134203,4198.4,10000000,Free,0.0,Everyone,Tools,2018-08-06,6.06.14,4.4 and up
565,stranger chat - anonymous chat,DATING,3.5,13204,6246.4,1000000,Free,0.0,Mature 17+,Dating,2018-07-07,2.4.1,4.1 and up
2637,textPlus: Free Text & Calls,SOCIAL,4.1,382121,28672.0,10000000,Free,0.0,Everyone,Social,2018-07-26,7.3.1,4.1 and up


In [32]:
#Filter the DataFrame for apps with counts equal to 1
app_counts_one = df['App'].value_counts()
unique_apps = app_counts_one[app_counts_one == 1].index

#Filter the original DataFrame to retain only the rows with unique apps
df_unique_apps = df[df['App'].isin(unique_apps)]
df_unique_apps

Unnamed: 0,App,Category,Rating,Reviews,Size_KB,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19456.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8908.8,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2867.2,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5734.4,50000,Free,0.0,Everyone,Art & Design,2017-03-26,1.0,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19456.0,50000,Free,0.0,Everyone,Art & Design,2018-04-26,1.1,4.0.3 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10832,FR Tides,WEATHER,3.8,1195,582.0,100000,Free,0.0,Everyone,Weather,2014-02-16,6.0,2.1 and up
10833,Chemin (fr),BOOKS_AND_REFERENCE,4.8,44,619.0,1000,Free,0.0,Everyone,Books & Reference,2014-03-23,0.8,2.2 and up
10834,FR Calculator,FAMILY,4.0,7,2662.4,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,54272.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up


In [33]:
# Concatenate df_max_reviews with df_unique_apps
df_final = pd.concat([df_max_reviews, df_unique_apps], ignore_index=True)

# Display the resulting DataFrame
df_final

Unnamed: 0,App,Category,Rating,Reviews,Size_KB,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,365Scores - Live Scores,SPORTS,4.6,666521,25600.0,10000000,Free,0.0,Everyone,Sports,2018-07-29,5.5.9,4.1 and up
1,8 Ball Pool,GAME,4.5,14201891,53248.0,100000000,Free,0.0,Everyone,Sports,2018-07-31,4.0.0,4.0.3 and up
2,95Live -SG#1 Live Streaming App,DATING,4.1,4954,15360.0,1000000,Free,0.0,Teen,Dating,2018-08-01,8.7.2,4.2 and up
3,A&E - Watch Full Episodes of TV Shows,FAMILY,4.0,29708,19456.0,1000000,Free,0.0,Teen,Entertainment,2018-07-16,3.1.4,4.4 and up
4,AC - Tips & News for Android™,NEWS_AND_MAGAZINES,4.2,23292,14336.0,1000000,Free,0.0,Everyone 10+,News & Magazines,2018-05-24,3.1.12,4.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6947,FR Tides,WEATHER,3.8,1195,582.0,100000,Free,0.0,Everyone,Weather,2014-02-16,6.0,2.1 and up
6948,Chemin (fr),BOOKS_AND_REFERENCE,4.8,44,619.0,1000,Free,0.0,Everyone,Books & Reference,2014-03-23,0.8,2.2 and up
6949,FR Calculator,FAMILY,4.0,7,2662.4,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up
6950,Sya9a Maroc - FR,FAMILY,4.5,38,54272.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up


In [34]:
# Save the cleaned DataFrame to a CSV file
df_final.to_csv('cleaned_data.csv', index=False)