In [95]:
import re
import string
from pathlib import Path

import dateparser
import numpy as np
import pandas as pd
import scipy.sparse
import tldextract
import unidecode
from cleanco import prepare_terms, basename
from sklearn.feature_extraction.text import TfidfVectorizer

In [96]:
# Load the raw matched-apps table. low_memory=False makes pandas read the file
# in one pass, and the explicit line terminator is passed through to the parser.
df = pd.read_csv(
    "../data/apps_matching_80%.csv",
    low_memory=False,
    lineterminator='\n',
)
df.head()

Unnamed: 0,store,id,title,author,agerating,coverimage,description,devsite,devmail,address,...,screenshots,localdescs,similarapps,os,lastseen,ratinghistogram,currentrating,currentratingcount,valid,id_matched
0,0,com.emotorwerks.juicebox,EV JuiceNet,Enel X North America,Everyone,https://play-lh.googleusercontent.com/BuDGhoJQ...,Main features:\n- Monitor & control smart char...,https://evcharging.enelx.com/,android@emotorwerks.com,,...,[https://play-lh.googleusercontent.com/1P9J4JK...,[Main features:\n- Monitor & control smart cha...,"[at.bluesource.summitlynx.activities, at.codep...",4.0 and up,1611879000000.0,"[80.0, 36.0, 49.0, 55.0, 55.0]",0.0,0.0,True,1106989154
1,0,de.westermann.StarteMit4Zeit,Starte mit! Zeit,Westermann Digital GmbH,Everyone,https://play-lh.googleusercontent.com/Ufircb3J...,Die DaZ-App „Starte mit! Zeit“ liest alle im A...,http://verlage.westermanngruppe.de,apps@westermanngruppe.de,Georg-Westermann-Allee 66 38104 Braunschweig,...,[https://play-lh.googleusercontent.com/vyWs-hG...,[Die DaZ-App „Starte mit! Zeit“ liest alle im ...,"[com.ztoapps.tafelsoefenen, com.kvartgroup.ani...",4.1 and up,1611879000000.0,"[-1.0, -1.0, -1.0, -1.0, -1.0]",0.0,0.0,True,1385454157
2,0,com.subsplash.thechurchapp.s_XSSHFV,Somebody Loves You,Subsplash Inc,Teen,https://play-lh.googleusercontent.com/fL2DeiiK...,The official Somebody Loves You app features v...,http://www.subsplash.com/,appsupport@subsplash.com,"3257 16th Ave W #200, Seattle, WA 98119",...,[https://play-lh.googleusercontent.com/WIuPmw5...,[The official Somebody Loves You app features ...,"[uk.adolo.joycemeyer, com.subsplash.thechurcha...",6.0 and up,1611879000000.0,"[4.0, 1.0, 1.0, 2.0, 131.0]",0.0,0.0,True,904460406
3,0,com.tinder,"Tinder - Dating, Make Friends and Meet New People",Tinder,Mature 17+,https://play-lh.googleusercontent.com/fDpoqIbZ...,"🔥You can be here for a good time, and a long t...",http://www.gotinder.com,help@gotinder.com,"Tinder 8833 W. Sunset Blvd. West Hollywood, CA...",...,[https://play-lh.googleusercontent.com/Tujxh7B...,"[🔥You can be here for a good time, and a long ...","[mingle.android.mingle2, com.azarlive.android,...",6.0 and up,1612225000000.0,"[1290073.0, 186145.0, 267487.0, 531097.0, 1886...",0.0,0.0,True,547702041
4,0,br.com.icatuseguros.target,Target,Icatu Seguros,Everyone,https://play-lh.googleusercontent.com/LEB6pkQi...,O Target é um simulador que vai ajudar você a ...,http://www.icatuseguros.com.br,icatuseguros@gmail.com,,...,[https://play-lh.googleusercontent.com/VhaKMUg...,[O Target é um simulador que vai ajudar você a...,"[com.herzick.houseparty, com.snapchat.android,...",Varies with device,1513182000000.0,"[8.0, 2.0, 4.0, 4.0, 31.0]",0.0,0.0,True,662285505


In [97]:
 # Remove every row whose description is missing (dropped by index label),
 # then show the resulting (rows, columns) shape.
 df = df.drop(index=df.index[df["description"].isnull()])
 df.shape

(155156, 33)

In [100]:
class Preprocessing:
    """Turn the raw matched-apps table into cleaned train/test CSV datasets.

    The input frame holds rows from both stores (``store == 0`` is treated as
    Google Play, ``store == 1`` as the App Store).  ``id_matched`` is joined
    against ``id`` to pair an app with its counterpart in the other store.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw apps table.  The pipeline reads the columns ``id``, ``store``,
        ``id_matched``, ``maincategory``, ``title``, ``author``, ``devsite``
        and ``description``.
    random_state : int, optional
        Seed for the shuffle in ``train_test_split`` and the sampling in
        ``create_false_data``.  ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global random state.
    """

    # Fraction of all rows drawn as test seeds; their matches are added on top.
    TEST_FRACTION = 0.1
    # Five mismatched pairs are requested for every two App Store rows.
    FALSE_PAIRS_PER_TWO_APPS = 5

    MAINCATEGORY_PATH = 'maincategory.json'
    TFIDF_PATH = '../data/tfidf/embeddings.npz'
    OUTPUT_DIR = "../data/preprocessed"

    # Attributes written by save_csvs(), in order; each one is stored as
    # OUTPUT_DIR/<attribute name>.csv.
    SAVED_DATASETS = (
        "matched_test_data",
        "matched_train_data",
        "google_play_test_data",
        "google_play_train_data",
        "app_store_test_data",
        "app_store_train_data",
        "false_train_data",
        "false_test_data",
    )

    def __init__(self, df, random_state=None):
        self.df_apps_match = df
        self.df_after_preprocessing = pd.DataFrame()
        self.random_state = random_state

    def pipeline(self):
        """Run every preprocessing stage in order and write the CSV outputs."""
        self.add_non_processed()
        self.preprocessing_maincategory()
        self.preprocessing_titles()
        self.preprocessing_author()
        self.preprocessing_devsite()
        self.preprocessing_description()
        # preprocessing_releasedate() is deliberately not part of the default
        # run; call it explicitly before train_test_split() if it is needed.
        self.train_test_split()
        self.create_false_data()

        self.save_csvs()
        print('pipeline done')

    def add_non_processed(self):
        """Copy the columns that need no cleaning into the output frame."""
        print('add_non_processed')

        for column in ("id", "store", "id_matched"):
            self.df_after_preprocessing[column] = self.df_apps_match[column]

    def preprocessing_maincategory(self):
        """Add ``apple_maincategory`` and ``google_maincategory`` columns.

        Each column is computed only from the rows of its own store, so after
        index alignment the rows of the other store hold NaN in it.  The
        replacement lists come from ``maincategory.json``.
        """
        print('preprocessing_maincategory')

        maincategory = pd.read_json(self.MAINCATEGORY_PATH)
        source = self.df_apps_match

        # Change App Store category ids into category labels.
        self.df_after_preprocessing["apple_maincategory"] = (
            source[source["store"] == 1]
            .loc[:, "maincategory"]
            .replace(
                maincategory['apple']['numbered'],
                maincategory['apple']['labeled'],
            )
        )

        # Change Google Play category names into their App Store counterparts.
        self.df_after_preprocessing["google_maincategory"] = (
            source[source["store"] == 0]
            .loc[:, "maincategory"]
            .replace(
                maincategory['google']['upper'],
                maincategory['google']['lower'],
            )
        )

    def preprocessing_titles(self):
        """Reduce every title to its lower-cased leading token.

        The title is lower-cased and stripped, then cut at the first ``:``,
        the first ``-`` and finally the first space.  Missing (non-string)
        titles become an empty string instead of raising.
        """
        print('preprocessing_titles')

        # TODO: confirm whether the author column needs the same splitting.
        def leading_token(title):
            if not isinstance(title, str):
                return ""
            token = title.lower().strip()
            for separator in (":", "-", " "):
                token = token.partition(separator)[0]
            return token

        self.df_after_preprocessing["title"] = [
            leading_token(title) for title in self.df_apps_match["title"]
        ]

    def preprocessing_author(self):
        """Strip legal-entity terms from author names and keep the first word.

        Missing (non-string) authors become an empty string instead of raising.
        """
        print('preprocessing_author')

        terms = prepare_terms()

        def strip_terms(name):
            return basename(name, terms, prefix=True, middle=True, suffix=True)

        def clean(author):
            if not isinstance(author, str):
                return ""
            name = author.lower().strip()
            if not name:
                return ""
            # Two passes so stacked endings such as "Co., Ltd." are both removed.
            return strip_terms(strip_terms(name)).partition(" ")[0]

        self.df_after_preprocessing["author"] = [
            clean(author) for author in self.df_apps_match["author"]
        ]

    def preprocessing_devsite(self):
        """Replace every developer site URL by its registered domain name.

        Missing sites become an empty string.  They were previously converted
        to the text ``"nan"``, which made all missing sites share one domain.
        """
        print('preprocessing_devsite')

        def domain_of(devsite):
            if not isinstance(devsite, str):
                return ""
            return tldextract.extract(devsite.lower().strip()).domain

        self.df_after_preprocessing["devsite"] = [
            domain_of(devsite) for devsite in self.df_apps_match["devsite"]
        ]

    def preprocessing_releasedate(self):
        """Parse free-text release dates into a ``releasedate`` datetime column.

        Not called by ``pipeline``.  Assumes the input frame has a
        ``releasedate`` column -- TODO confirm before enabling.  Values that
        are not strings or cannot be parsed end up as NaT.
        """
        print('preprocessing_releasedate')

        def parse_date(date):
            if not isinstance(date, str):
                # Non-string values are always NaN here.
                return None
            return dateparser.parse(date)

        # The previous version read and wrote attributes that are never
        # defined on this class; use the frames the other stages use.
        self.df_after_preprocessing["releasedate"] = pd.to_datetime(
            self.df_apps_match["releasedate"].apply(parse_date), errors="coerce"
        )

    def preprocessing_description(self):  # TODO: improve the text cleaning
        """Normalise descriptions and save their TF-IDF matrix to ``TFIDF_PATH``.

        Digits are removed, text is transliterated with ``unidecode``,
        lower-cased, stripped of ASCII punctuation and trimmed.  Missing
        (non-string) descriptions become an empty string.
        """
        print('preprocessing_description')

        # Built once instead of once per description.
        drop_punctuation = str.maketrans("", "", string.punctuation)

        def clean(description):
            if not isinstance(description, str):
                return ""
            without_digits = re.sub(r"\d+", "", description)
            return (
                unidecode.unidecode(without_digits)
                .lower()
                .translate(drop_punctuation)
                .strip()
            )

        self.df_after_preprocessing["description"] = [
            clean(description) for description in self.df_apps_match["description"]
        ]

        embeddings = TfidfVectorizer().fit_transform(
            self.df_after_preprocessing["description"]
        )
        Path(self.TFIDF_PATH).parent.mkdir(parents=True, exist_ok=True)
        scipy.sparse.save_npz(self.TFIDF_PATH, embeddings)

    def train_test_split(self):
        """Split the cleaned rows into train/test sets without splitting pairs.

        ``TEST_FRACTION`` of the rows is drawn at random as test seeds; every
        row whose ``id_matched`` points at a seed joins the test set as well.
        All remaining ids form the train set.  Sets the attributes
        ``test_data``, ``train_data``, the four per-store frames and
        ``matched_test_data`` / ``matched_train_data``.

        Each row appears at most once in ``test_data`` (rows keep their
        original order).  The previous concat-based version listed a pair
        twice whenever both of its members were drawn as seeds.
        """
        print('train_test_split')

        data = self.df_after_preprocessing

        shuffled = data.sample(frac=1, random_state=self.random_state)
        test_size = int(self.TEST_FRACTION * len(data))
        seed_rows = shuffled.iloc[:test_size]

        is_seed = data.index.isin(seed_rows.index)
        matches_seed = data["id_matched"].isin(seed_rows["id"]).to_numpy()

        self.test_data = data[is_seed | matches_seed]
        self.train_data = data[~data["id"].isin(self.test_data["id"])]

        self.google_play_test_data = self._single_store(
            self.test_data, 0, 'google_maincategory', 'apple_maincategory'
        )
        self.google_play_train_data = self._single_store(
            self.train_data, 0, 'google_maincategory', 'apple_maincategory'
        )
        self.app_store_test_data = self._single_store(
            self.test_data, 1, 'apple_maincategory', 'google_maincategory'
        )
        self.app_store_train_data = self._single_store(
            self.train_data, 1, 'apple_maincategory', 'google_maincategory'
        )

        self.matched_test_data = self._matched_pairs(self.test_data)
        self.matched_train_data = self._matched_pairs(self.train_data)

    @staticmethod
    def _single_store(data, store, own_category, other_category):
        """Return one store's rows with its category column named ``maincategory``."""
        return (
            data[data["store"] == store]
            .rename(columns={own_category: 'maincategory'})
            .drop(columns=['store', other_category])
            .reset_index(drop=True)
        )

    @staticmethod
    def _matched_pairs(data):
        """Self-join ``data`` on ``id`` == ``id_matched``, one row per pair."""
        pairs = data.merge(data, how="inner", left_on="id", right_on="id_matched")
        # Every pair shows up once per direction; drop the rows whose left side
        # is the App Store (store 1) app.
        pairs = pairs[pairs["store_x"] != 1]
        # TODO: check how come we have unmatched apps
        return pairs.dropna(subset=["id_y"]).reset_index(drop=True)

    def create_false_data(self):
        """Build mismatched Google Play / App Store pairs as negative examples.

        Sets ``false_train_data`` and ``false_test_data``.  Columns of the
        Google Play row get the suffix ``_x``, those of the App Store row
        ``_y``.
        """
        print('create_false_data')

        # One generator shared by all draws so that the Google and App Store
        # samples are not identical position lists when a seed is given.
        rng = None
        if self.random_state is not None:
            rng = np.random.RandomState(self.random_state)

        self.false_train_data = self._false_pairs(
            self.app_store_train_data, self.google_play_train_data, rng
        )
        self.false_test_data = self._false_pairs(
            self.app_store_test_data, self.google_play_test_data, rng
        )

    def _false_pairs(self, apple_data, google_data, rng):
        """Pair randomly drawn rows of both stores, dropping genuine matches."""
        pair_count = (len(apple_data) // 2) * self.FALSE_PAIRS_PER_TWO_APPS

        if pair_count == 0 or google_data.empty:
            # Nothing to pair; keep the column layout of a regular result.
            google_rows = google_data.iloc[:0]
            apple_rows = apple_data.iloc[:0]
        else:
            # Each frame is sampled over its own length.  The old code used
            # the App Store length for both, which fails or under-samples when
            # the two frames differ in size.
            google_rows = google_data.sample(
                n=pair_count, replace=True, random_state=rng
            )
            apple_rows = apple_data.sample(
                n=pair_count, replace=True, random_state=rng
            )

        # Kept as attributes because the previous implementation exposed them.
        self.google_random_rows = google_rows
        self.apple_random_rows = apple_rows

        pairs = pd.concat(
            [
                google_rows.reset_index(drop=True).add_suffix("_x"),
                apple_rows.reset_index(drop=True).add_suffix("_y"),
            ],
            axis=1,
        )

        # A random draw can hit a real pair; those must not be labelled false.
        is_true_match = (
            pairs["id_matched_x"].astype(str) == pairs["id_y"].astype(str)
        ) | (pairs["id_x"].astype(str) == pairs["id_matched_y"].astype(str))
        return pairs[~is_true_match].reset_index(drop=True)

    def save_csvs(self):
        """Write every dataset in ``SAVED_DATASETS`` to ``OUTPUT_DIR``."""
        print('save_csvs')

        output_dir = Path(self.OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)

        for name in self.SAVED_DATASETS:
            getattr(self, name).to_csv(
                output_dir / f"{name}.csv", index=False, header=True
            )

In [101]:
# Wrap the filtered frame in the pipeline object, then run all stages;
# each stage prints its own name as it starts.
preprocessing = Preprocessing(df=df)
preprocessing.pipeline()

add_non_processed
preprocessing_maincategory
preprocessing_titles
preprocessing_author
preprocessing_devsite
preprocessing_description
train_test_split
create_false_data
save_csvs
pipeline done


In [102]:
# Read back one of the generated files to eyeball the result.
# Note: this rebinds `df`, which until here referred to the raw apps table.
df = pd.read_csv(
    "../data/preprocessed/false_test_data.csv",
    low_memory=False,
    lineterminator='\n',
)
df.head()

Unnamed: 0,id_x,id_matched_x,maincategory_x,title_x,author_x,devsite_x,description_x,id_y,id_matched_y,maincategory_y,title_y,author_y,devsite_y,description_y
0,com.wildfoot.pony.multiplayer,1215229399,ADVENTURE,pony,wild,wildfootgames,pony multiplayer enter this epic d world as a...,1097158507,com.instantencore.cccband_5151891,Entertainment,coastal,instantencore.com,instantencore,the coastal communities concert band fan app i...
1,com.lemondoo.crackyourscreenfree,448925179,Entertainment,crack,lemondo,lemondo,crack your screen have you ever dreamed to cra...,1042314648,com.dotemu.ihnmaims,Games,i,dotemu,dotemu,i have no mouth and i must scream is a classic...
2,com.wildfoot.pony.multiplayer,1215229399,ADVENTURE,pony,wild,wildfootgames,pony multiplayer enter this epic d world as a...,1434175518,jp.co.medicalprinciple.myikyoku,Medical,my医局,株式会社メディカル・プリンシプル社,doctor-agent,zui xin niyusushao jie \nye jie niyusuyasemina...
3,com.techwin.wisenetmobile.android,1158492654,Photo Video,wisenet,hanwha,hanwha-security,wisenet mobile is a free application designed ...,594635766,com.thirtySouth.BikeTrackerFree,Health and Fitness,bike,30,30-south,ride tracker is a gps based tracking app for p...
4,com.tgoma.trampolineapp,1069247173,Games,tgoma,tgoma,tgoma,tgomar turns your springfree trampoline into t...,784387893,se.mittmedia.emaginapp.sundsvallstidning.Sunds...,News,sundsvalls,mittmedia,st,sundsvalls tidning\nfran mittmedia ab\n\nlas n...
