In [1]:
import os
import pandas as pd # For handling the data
import numpy as np # For calculations
import matplotlib.pyplot as plt # For making charts
import seaborn as sns # For making fancy charts

In [2]:
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /Users/amarachiordor/Documents/HNG Data Analysis/Datasets


In [3]:
files = os.listdir()
print("Files in Directory:", files)

Files in Directory: ['googleplaystore_user_reviews.csv', 'googleplaystore.csv', 'Google Play Store.ipynb', 'appleAppData.csv', '.ipynb_checkpoints', 'Google-Playstore.csv']


In [4]:
pip install pandas googletrans==4.0.0-rc1 langdetect

Note: you may need to restart the kernel to use updated packages.


In [None]:
df = pd.read_csv("Google-playstore.csv")

# Display basic info
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Check for duplicates
print("Duplicate rows:", df.duplicated().sum())

# Display first few rows
df.head()

In [6]:
df.shape

(2312944, 24)

In [7]:
df.info

<bound method DataFrame.info of                                                   App Name  \
0                                                  Gakondo   
1                                      Ampere Battery Info   
2                                                   Vibook   
3        Smart City Trichy Public Service Vehicles 17UC...   
4                                                  GROW.me   
...                                                    ...   
2312939                                           大俠客—熱血歸來   
2312940                                         ORU Online   
2312941                                     Data Structure   
2312942                                        Devi Suktam   
2312943                       Biliyor Musun - Sonsuz Yarış   

                                       App Id       Category  Rating  \
0                         com.ishakwe.gakondo      Adventure     0.0   
1                  com.webserveis.batteryinfo          Tools     4.4   
2      

In [8]:
df.duplicated().sum()

0

In [9]:
print(df.isnull().sum())

App Name                  5
App Id                    0
Category                  0
Rating                22883
Rating Count          22883
Installs                107
Minimum Installs        107
Maximum Installs          0
Free                      0
Price                     0
Currency                135
Size                    196
Minimum Android        6530
Developer Id             33
Developer Website    760835
Developer Email          31
Released              71053
Last Updated              0
Content Rating            0
Privacy Policy       420953
Ad Supported              0
In App Purchases          0
Editors Choice            0
Scraped Time              0
dtype: int64


In [10]:
missing_percentage = df.isnull().sum() / len(df) * 100
print(missing_percentage)

App Name              0.000216
App Id                0.000000
Category              0.000000
Rating                0.989345
Rating Count          0.989345
Installs              0.004626
Minimum Installs      0.004626
Maximum Installs      0.000000
Free                  0.000000
Price                 0.000000
Currency              0.005837
Size                  0.008474
Minimum Android       0.282324
Developer Id          0.001427
Developer Website    32.894657
Developer Email       0.001340
Released              3.071972
Last Updated          0.000000
Content Rating        0.000000
Privacy Policy       18.199879
Ad Supported          0.000000
In App Purchases      0.000000
Editors Choice        0.000000
Scraped Time          0.000000
dtype: float64


In [11]:
df.drop(columns=["Developer Website", "Privacy Policy"], inplace=True)

In [12]:
df.loc[:, "Released"] = df["Released"].fillna(df["Released"].mode()[0])
df.loc[:, "Minimum Android"] = df["Minimum Android"].fillna(df["Minimum Android"].mode()[0])
df.loc[:, "Rating"] = df["Rating"].fillna(df["Rating"].median())
df.loc[:, "Rating Count"] = df["Rating Count"].fillna(df["Rating Count"].median())
df.loc[:, "Currency"] = df["Currency"].fillna("USD")

In [13]:
print(df.isnull().sum())

App Name              5
App Id                0
Category              0
Rating                0
Rating Count          0
Installs            107
Minimum Installs    107
Maximum Installs      0
Free                  0
Price                 0
Currency              0
Size                196
Minimum Android       0
Developer Id         33
Developer Email      31
Released              0
Last Updated          0
Content Rating        0
Ad Supported          0
In App Purchases      0
Editors Choice        0
Scraped Time          0
dtype: int64


In [14]:
df.loc[:, "Developer Id"] = df["Developer Id"].fillna("Unknown Developer Id")
df.loc[:, "Developer Email"] = df["Developer Email"].fillna("Unknown Email")

In [15]:
df.dropna(inplace=True)

In [16]:
print(df.isnull().sum())

App Name            0
App Id              0
Category            0
Rating              0
Rating Count        0
Installs            0
Minimum Installs    0
Maximum Installs    0
Free                0
Price               0
Currency            0
Size                0
Minimum Android     0
Developer Id        0
Developer Email     0
Released            0
Last Updated        0
Content Rating      0
Ad Supported        0
In App Purchases    0
Editors Choice      0
Scraped Time        0
dtype: int64


In [17]:
pip install fasttext

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install langid

Note: you may need to restart the kernel to use updated packages.


In [None]:
from langdetect import detect
from googletrans import Translator
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Initialize translator
translator = Translator()

# Function to detect and translate app names safely
def detect_language(text):
    try:
        if not isinstance(text, str) or len(text.strip()) < 3:  # Ignore empty or very short strings
            return "en"
        return detect(text)
    except:
        return "en"  # Default to English if detection fails

def translate_app_name(app_name):
    try:
        detected_lang = detect_language(app_name)  # Detect language safely
        if detected_lang == "en":
            return app_name  # Already English
        translated = translator.translate(app_name, src=detected_lang, dest="en")
        return translated.text
    except:
        return app_name  # Return original if error occurs

# Detect non-English app names first
df["Detected_Lang"] = df["App Name"].astype(str).apply(detect_language)

# Filter only non-English app names to translate
non_english_apps = df[df["Detected_Lang"] != "en"].copy()

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=10) as executor:
    non_english_apps["App Name Translated"] = list(executor.map(translate_app_name, non_english_apps["App Name"]))

# Merge translations back into original dataframe
df = df.merge(non_english_apps[["App Name", "App Name Translated"]], on="App Name", how="left")

# Fill missing values where translation was not needed
df["App Name Translated"].fillna(df["App Name"], inplace=True)

# Drop the temporary language detection column
df.drop(columns=["Detected_Lang"], inplace=True)

print(df[["App Name", "App Name Translated"]].head())
