In [1]:
!pip install google_play_scraper

Collecting google_play_scraper
  Downloading google_play_scraper-1.2.4-py3-none-any.whl (28 kB)
Installing collected packages: google_play_scraper
Successfully installed google_play_scraper-1.2.4


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Ins

In [5]:
import os
import pandas as pd
import subprocess
import json
from google_play_scraper import app
import time
import concurrent.futures
# from datasets import load_dataset
from transformers import AutoTokenizer
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification

In [29]:
class AndrozooProcessor:
    def __init__(self, config_path):
        with open(config_path, 'r') as config_file:
            self.config = json.load(config_file)
        self.app_id_list = None
        self.dataset_df = None

    def __init__(self):
        self.config = None
        self.app_id_list = None
        self.dataset_df = None

    def build_dataset(self):
        apps_not_found = []
        apps_added_count = 0
        apps_not_found_count = 0
        total_apps = len(self.app_id_list)

        df = pd.DataFrame(columns=['app_id', 'description'])
        print(f'Adding apps: {len(self.app_id_list)}')

        def fetch_app_details(app_name):
            nonlocal apps_added_count, apps_not_found_count
            # if apps_added_count >= 100000:
            #     return None
            try:
                result_app_details = app(
                    app_name,
                    lang='en',
                    country='us'
                )
            except:
                apps_not_found.append(app_name)
                apps_not_found_count += 1
                return None

            apps_added_count += 1
            if apps_added_count % 1000 == 0:
                print(f'{apps_added_count}/{total_apps} apps processed ({apps_added_count / total_apps * 100}% complete)')

            description = result_app_details['description']
            url = result_app_details['url']

            return [app_name, description]

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(fetch_app_details, self.app_id_list))

        for result in results:
            if result is not None:
                df.loc[len(df.index)] = result



        self.dataset_df = df
        print(self.dataset_df)
        self.dataset_df.to_csv('apps_df.csv')
    def parse_and_clean(self):
        # input_file_path = self.config.get('input_file_path')

        # # Parse androzoo dataset for potential apps
        # command = (
        #     f"cat {input_file_path} | grep -v ',snaggamea' | "
        #     "awk -F, 'BEGIN {{print \"app_name,date\"}} {{if ($11 ~ /play\.google\.com/ && $4 > \"2020-11-1\") print \"\\\"\" $6 \"\\\",\" $4}}' | "
        #     "head -n 500000 > androzoo.csv"
        # )

        # subprocess.call(command, shell=True)

        # Clean the generated CSV by removing "" from the beginning of app_names
        df = pd.read_csv("/content/filtered_androzoo_csv_file.csv")
        print(df)
        # df["app_name"] = df["app_name"].str.strip('"')

        # Save to csv for later reference
        # df.to_csv("androzoo.csv", index=False)

        app_id_list = df['app_name'].tolist()
        self.app_id_list = app_id_list[:200000]
        print('len',len(self.app_id_list))
        return app_id_list

    def parse_apps(self):
        self.mhealth_apps = []
        print('Parsing apps')
        id2label = {0: "NON-MHEALTH", 1: "MHEALTH"}

        # Check if GPU is available
        if torch.cuda.is_available():
            device = 'cuda'
            print('Using GPU.')
        else:
            device = 'cpu'
            print('GPU not available. Using CPU.')

        tokenizer = AutoTokenizer.from_pretrained('etham13/MHealth_app_classifier')

        model = AutoModelForSequenceClassification.from_pretrained('etham13/MHealth_app_classifier')
        model.to(device)  # Move the model to GPU if available

        # Iteration of generated dataset of apps and descriptions
        for index, row in self.dataset_df.iterrows():
            if 'description' in row and pd.notna(row['description']):
                # print(row)
                encoding = tokenizer(row['description'], return_tensors="pt", max_length=512, truncation=True, padding=True)
                encoding = {k: v.to(device) for k, v in encoding.items()}

                outputs = model(**encoding)
                predictions = outputs.logits.squeeze().cpu()

                sigmoid = torch.nn.Sigmoid()
                probs = sigmoid(torch.Tensor(predictions))
                predictions[np.where(probs >= 0.5)] = 1
                predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
                # print(predicted_labels[0])
                if predicted_labels and predicted_labels[0] == 'MHEALTH':
                    # print(predicted_labels)
                    self.mhealth_apps.append(row['app_id'])

                    if len(self.mhealth_apps) > 1 and len(self.mhealth_apps) % 100 == 0:
                        print(f'{len(self.mhealth_apps)} / {index} mhealth apps processed ({len(self.mhealth_apps) / index * 100:.3}% of proccessed apps)')
                        print(f'{index / len(self.dataset_df)* 100:.3}% complete')


In [22]:
# processor = AndrozooProcessor()
# app_id_list = processor.parse_and_clean()
# processor.build_dataset()

In [30]:
processor = AndrozooProcessor()
processor.dataset_df = pd.read_csv('/content/apps_df.csv')
processor.dataset_df = processor.dataset_df.drop(columns=['Unnamed: 0'])
processor.dataset_df

Unnamed: 0,app_id,description
0,us.textr.Anonytext,"<b>Send video, picture, audio and text messagi..."
1,com.minimalist.bricks,Get ready for the ultimate test of your reflex...
2,com.doko.android,Doko c’est quoi ?\r\n\r\nDoko du japonais part...
3,com.fitivity.shadowboxing,Fitivity gets you better. Looks like <b>you’re...
4,com.liderapp,"LíderApp is an application, of the Huellas You..."
...,...,...
102543,com.blurams.ipc,The blurams app is home monitoring Wi-Fi video...
102544,com.dinogo.catarmy,Your kingdom are invaded by the monsters. Grow...
102545,com.solitaire.daily.challenge.card.game,"★<b>Do you like playing Classic Solitaire, Klo..."
102546,com.rodeodigital.talhamas,"Talha stores is a variety center, it deals wit..."


In [None]:
processor.parse_apps()

Parsing apps
Using GPU.
100 / 1672 mhealth apps processed (5.98% of proccessed apps)
1.63% complete
200 / 4111 mhealth apps processed (4.86% of proccessed apps)
4.01% complete
300 / 6610 mhealth apps processed (4.54% of proccessed apps)
6.45% complete
400 / 8578 mhealth apps processed (4.66% of proccessed apps)
8.36% complete
500 / 10186 mhealth apps processed (4.91% of proccessed apps)
9.93% complete
600 / 12095 mhealth apps processed (4.96% of proccessed apps)
11.8% complete
700 / 14189 mhealth apps processed (4.93% of proccessed apps)
13.8% complete
800 / 16500 mhealth apps processed (4.85% of proccessed apps)
16.1% complete
900 / 18554 mhealth apps processed (4.85% of proccessed apps)
18.1% complete
1000 / 20532 mhealth apps processed (4.87% of proccessed apps)
20.0% complete
1100 / 22750 mhealth apps processed (4.84% of proccessed apps)
22.2% complete
1200 / 24308 mhealth apps processed (4.94% of proccessed apps)
23.7% complete
1300 / 26259 mhealth apps processed (4.95% of procces

In [None]:
len(processor.mhealth_apps)

2209

In [None]:
with open('apps_k.txt', 'w') as file:
            for app_id in processor.mhealth_apps:
                file.write(f"{app_id}\n")

In [None]:
processor.dataset_df

Unnamed: 0,app_id,description
0,air.theflash.f2game.prettygirl66,Cute pretty girl to Cinderella Style\r\nDecora...
1,br.com.toquefacil,Enough of the paper folders! Use the same reso...
2,com.wemademax.riseofstars,"MINE, CONQUER, EARN\r\nParticipate in the reso..."
3,org.example.CalculatorSimple,Calculator is a powerful and smartest Android ...
4,com.yemenmazad.www,Find jobs. Hire employees. Post your resume. P...
...,...,...
45614,fr.sospets.app,FREE application for DAILY and EMERGENCY MEDIA...
45615,co.nick.sfvjt,Chemisyry Classes Jaipur - If you're a chemist...
45616,com.asteracu.appstore.cardsvcs,Enjoy easy and on-the-go management of your cr...
45617,com.umerapps.paracollection,Quran Para 1 to 30 - Urdu Book is an app that ...


In [None]:
def read_app_ids(file_path):
    with open(file_path, 'r') as file:
        app_ids = file.read().splitlines()
    return set(app_ids)

def write_app_ids(file_path, app_ids):
    with open(file_path, 'w') as file:
        file.write('\n'.join(app_ids))

# File paths for the two text files
file_path_2 = '/content/apps_k.txt'
file_path_1 = '/content/apps_6k.txt'

# Read app IDs from both files
app_ids_set_1 = read_app_ids(file_path_1)
app_ids_set_2 = read_app_ids(file_path_2)
print(len(app_ids_set_2))

# Find common app IDs (duplicates)
common_app_ids = app_ids_set_1.intersection(app_ids_set_2)

# Remove duplicates from the 'apps.txt' file
all_app_ids = read_app_ids(file_path_2)
unique_app_ids = all_app_ids - common_app_ids
print(len(unique_app_ids))
# Write the unique app IDs back to the 'apps.txt' file
write_app_ids(file_path_2, unique_app_ids)

print("Duplicates removed from 'apps.txt'")


2209
2172
Duplicates removed from 'apps.txt'
