In [1]:
import pandas as pd
import numpy as np
import tldextract
import dateparser
from cleanco import prepare_terms, basename
import unidecode
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

In [2]:
# Load the matched-apps export and keep only the rows that have a description.
df = pd.read_csv("../data/apps_matching_80%.csv", low_memory=False, lineterminator='\n')
df = df.dropna(subset=["description"])
df.shape

(155330, 33)

In [3]:
class Preprocessing:
    """Clean the raw apps table and, in training mode, build the train/test CSVs.

    Store code 0 is handled as Google Play and store code 1 as the Apple
    App Store throughout the class.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw apps table. The steps read the columns "id", "store",
        "maincategory", "title", "author", "devsite" and "description",
        plus "id_matched" when ``isTrain`` is true.
    isTrain : bool
        When true, ``pipeline`` also splits the data into train/test,
        samples random cross-store pairs and writes everything to CSV.
    random_state : int, optional
        Seed for the shuffle in ``train_test_split`` and the sampling in
        ``create_false_data``. ``None`` (the default) keeps drawing from
        NumPy's global random state, exactly as before.
    """

    # Fallback so instances created without __init__ still use the global RNG.
    _rng = None

    def __init__(self, df, isTrain, random_state=None):
        self.df_apps_match = df
        self.df_after_preprocessing = pd.DataFrame()
        self.isTrain = isTrain
        self._rng = None if random_state is None else np.random.RandomState(random_state)

    @staticmethod
    def _is_missing(value):
        """Return True for None/NaN/NA cells, False for any real string."""
        return value is None or (not isinstance(value, str) and bool(pd.isna(value)))

    def pipeline(self):
        """Run every preprocessing step; in training mode also split, sample and save."""
        self.add_non_processed()
        self.preprocessing_maincategory()
        self.preprocessing_titles()
        self.preprocessing_author()
        self.preprocessing_devsite()
        self.preprocessing_description()
        # self.preprocessing_releasedate()  # still disabled, as before

        if self.isTrain:
            self.train_test_split()
            self.create_false_data()
            self.save_csvs()

        # Printed in both modes (training mode used to return before this line).
        print('pipeline done')

    def add_non_processed(self):
        """Copy the columns that need no cleaning ("id", "store", and "id_matched" when training)."""
        print('add_non_processed')

        self.df_after_preprocessing["id"] = self.df_apps_match["id"]
        self.df_after_preprocessing["store"] = self.df_apps_match["store"]

        if self.isTrain:
            self.df_after_preprocessing["id_matched"] = self.df_apps_match["id_matched"]

    def preprocessing_maincategory(self):
        """Translate each store's "maincategory" using the lookups in maincategory.json.

        Writes "apple_maincategory" (filled only for store == 1 rows) and
        "google_maincategory" (filled only for store == 0 rows); rows of the
        other store are left as NaN by index alignment.
        """
        print('preprocessing_maincategory')

        maincategory = pd.read_json('maincategory.json')
        store = self.df_apps_match["store"]
        raw_category = self.df_apps_match["maincategory"]

        # Apple rows: numeric category ids -> category labels.
        self.df_after_preprocessing["apple_maincategory"] = raw_category[store == 1].replace(
            maincategory['apple']['numbered'],
            maincategory['apple']['labeled'],
        )

        # Google Play rows: Google Play category names -> the Apple-style names.
        self.df_after_preprocessing["google_maincategory"] = raw_category[store == 0].replace(
            maincategory['google']['upper'],
            maincategory['google']['lower'],
        )

    def preprocessing_titles(self):
        """Reduce each title to its lower-cased first word.

        The title is cut at the first ":" and then the first "-" before the
        first word is taken. Missing titles become "" instead of raising.
        """
        print('preprocessing_titles')

        def first_word(title):
            # todo: ask davis if need it also for author
            if self._is_missing(title):
                return ""
            head = str(title).lower().strip()
            for separator in (":", "-", " "):
                head = head.partition(separator)[0]
            return head

        self.df_after_preprocessing["title"] = [
            first_word(title) for title in self.df_apps_match["title"]
        ]

    def preprocessing_author(self):
        """Strip company terms from each author with cleanco and keep the first word.

        Missing authors become "" instead of raising.
        """
        print('preprocessing_author')

        terms = prepare_terms()

        def clean(author):
            if self._is_missing(author):
                return ""
            name = basename(
                str(author).lower().strip(), terms, prefix=True, middle=True, suffix=True
            )
            # Second pass removes a further ending, e.g. "Co., Ltd.".
            name = basename(name, terms, prefix=True, middle=True, suffix=True)
            return name.partition(" ")[0]

        self.df_after_preprocessing["author"] = [
            clean(author) for author in self.df_apps_match["author"]
        ]

    def preprocessing_devsite(self):
        """Keep only the registered domain part of each developer site URL."""
        print('preprocessing_devsite')

        # NOTE(review): astype(str) turns a missing devsite into the text "nan"
        # before extraction -- confirm that is the intended placeholder.
        devsites = self.df_apps_match["devsite"].values.astype(str)

        self.df_after_preprocessing["devsite"] = [
            tldextract.extract(devsite.lower().strip()).domain
            for devsite in devsites
        ]

    def preprocessing_releasedate(self):
        """Parse the free-text "releasedate" column into datetimes.

        Not called by ``pipeline``. Non-string cells and unparseable values
        end up as NaT.
        """
        print('preprocessing_releasedate')

        def parse_date(date):
            if not isinstance(date, str):
                # always nan values
                return

            return dateparser.parse(date)

        # Previously read/wrote attributes that this class never defines.
        self.df_after_preprocessing["releasedate"] = pd.to_datetime(
            self.df_apps_match["releasedate"].apply(parse_date), errors="coerce"
        )

    def preprocessing_description(self):  # todo: make it better..
        """Normalise descriptions and save their TF-IDF matrix.

        Each description has digit runs removed, is transliterated with
        unidecode, lower-cased, stripped of ASCII punctuation and trimmed.
        Missing descriptions become "" instead of raising.
        """
        print('preprocessing_description')

        digit_runs = re.compile(r"\d+")
        drop_punctuation = str.maketrans("", "", string.punctuation)

        def clean(description):
            if self._is_missing(description):
                return ""
            text = unidecode.unidecode(digit_runs.sub("", str(description)))
            return text.lower().translate(drop_punctuation).strip()

        def save_tfidf_embeddings(documents):
            # NOTE(review): the fitted vectorizer is discarded and the same
            # file is overwritten on every run, training or not -- confirm.
            vectorizer = TfidfVectorizer()
            embeddings = vectorizer.fit_transform(documents)
            scipy.sparse.save_npz('../data/tfidf/embeddings.npz', embeddings)

        self.df_after_preprocessing["description"] = [
            clean(description) for description in self.df_apps_match["description"]
        ]
        save_tfidf_embeddings(self.df_after_preprocessing["description"])

    @staticmethod
    def _store_frame(data, store, own_category, other_category):
        """Rows of one store, with that store's category column renamed to "maincategory"."""
        return (
            data[data["store"] == store]
            .rename(columns={own_category: 'maincategory'})
            .drop(columns=['store', other_category])
            .reset_index(drop=True)
        )

    @staticmethod
    def _matched_pairs(data):
        """Self-join rows to their matches, keeping one row per pair (Google Play on the _x side)."""
        pairs = data.merge(
            data, how="inner", left_on="id", right_on="id_matched"
        ).reset_index(drop=True)

        # Each pair appears once per direction; keep only the non-Apple left side.
        pairs = pairs[pairs["store_x"] != 1]

        # remove unmatched apps
        # TODO: check how come we have unmatched apps
        return pairs.dropna(subset=["id_y"])

    def train_test_split(self):
        """Split the preprocessed rows into train and test, keeping matched apps together.

        10% of the rows are drawn at random as test seeds; every row whose
        "id_matched" points at a seed joins the test set as well. Each row
        lands in exactly one of ``test_data`` / ``train_data`` (previously a
        row could be added to the test set twice when it and its match were
        both sampled). Per-store and matched-pair frames are derived from
        both sets.
        """
        print('train_test_split')

        data = self.df_after_preprocessing

        # get 10% of data
        test_size = int(0.1 * len(data))

        # Shuffle dataset and take the first test_size ids as seeds.
        seed_ids = data.sample(frac=1, random_state=self._rng)["id"].iloc[:test_size]

        in_test = data["id"].isin(seed_ids) | data["id_matched"].isin(seed_ids)
        self.test_data = data[in_test]
        self.train_data = data[~in_test]

        self.google_play_test_data = self._store_frame(
            self.test_data, 0, 'google_maincategory', 'apple_maincategory')
        self.google_play_train_data = self._store_frame(
            self.train_data, 0, 'google_maincategory', 'apple_maincategory')
        self.app_store_test_data = self._store_frame(
            self.test_data, 1, 'apple_maincategory', 'google_maincategory')
        self.app_store_train_data = self._store_frame(
            self.train_data, 1, 'apple_maincategory', 'google_maincategory')

        self.matched_test_data = self._matched_pairs(self.test_data)
        self.matched_train_data = self._matched_pairs(self.train_data)

    def create_false_data(self):
        """Build random Google Play / App Store row pairs for train and test.

        For each split, 5/2 times the (even-rounded) number of App Store rows
        are drawn with replacement from each store and placed side by side:
        Google Play columns get the "_x" suffix, App Store columns "_y".
        ``google_random_rows`` / ``apple_random_rows`` hold the rows of the
        most recent draw.
        """
        print('create_false_data')

        rng = np.random if self._rng is None else self._rng

        def get_false_data(apple_train_data, google_train_data):
            num_of_matches = len(apple_train_data) - len(apple_train_data) % 2  # keeping it even
            num_of_pairs = (num_of_matches * 5) // 2

            if num_of_pairs == 0 or len(google_train_data) == 0:
                # Nothing to sample from: return an empty frame with the usual columns.
                google_rand_indexes = []
                apple_rand_indexes = []
            else:
                # Each store is sampled within its own length so the positions
                # are always valid, whatever the relative sizes of the frames.
                google_rand_indexes = rng.randint(len(google_train_data), size=num_of_pairs)
                apple_rand_indexes = rng.randint(len(apple_train_data), size=num_of_pairs)

            # NOTE(review): a random pair can coincide with a true match; such
            # rows are not filtered out here.
            self.google_random_rows = google_train_data.iloc[google_rand_indexes]
            self.apple_random_rows = apple_train_data.iloc[apple_rand_indexes]

            return pd.concat(
                [
                    self.google_random_rows.reset_index(drop=True).add_suffix("_x"),
                    self.apple_random_rows.reset_index(drop=True).add_suffix("_y"),
                ],
                axis=1,
            ).reset_index(drop=True)

        self.false_train_data = get_false_data(self.app_store_train_data, self.google_play_train_data)
        self.false_test_data = get_false_data(self.app_store_test_data, self.google_play_test_data)

    def save_csvs(self):
        """Write every train/test frame to ../data/preprocessed/<attribute name>.csv."""
        print('save_csvs')

        frame_names = (
            "matched_test_data",
            "matched_train_data",
            "google_play_test_data",
            "google_play_train_data",
            "app_store_test_data",
            "app_store_train_data",
            "false_train_data",
            "false_test_data",
        )
        for frame_name in frame_names:
            getattr(self, frame_name).to_csv(
                "../data/preprocessed/" + frame_name + ".csv", index=False, header=True
            )

In [4]:
%%time

preprocessing = Preprocessing(df, True)
preprocessing.pipeline()

add_non_processed
preprocessing_maincategory
preprocessing_titles
preprocessing_author


In [None]:
"""
select * from app_info where store = '0' and title is not null and title <> '' and author is not null and author <> '' and description is not null and description <> '' and devsite is not null and devsite <> '' limit 1000;
"""

In [None]:
# Read the two 1k sample exports with identical parser options.
apple_1k, android_1k = (
    pd.read_csv(sample_path, low_memory=False, lineterminator='\n')
    for sample_path in ("../data/1k_apple.csv", "../data/1k_android.csv")
)

In [None]:
%%time

preprocessing_apple = Preprocessing(apple_1k, False)
preprocessing_apple.pipeline()

In [None]:
%%time

preprocessing_android = Preprocessing(android_1k, False)
preprocessing_android.pipeline()

In [None]:
# Pair every cleaned android_1k row with every cleaned apple_1k row.
android_1k_processes = preprocessing_android.df_after_preprocessing
apple_1k_processes = preprocessing_apple.df_after_preprocessing
crossed_all_data = pd.merge(android_1k_processes, apple_1k_processes, how="cross")

crossed_all_data.to_csv(
    "../data/preprocessed/crossed_all_data.csv",
    index=False,
    header=True,
)

In [None]:
# Preview only: the cross join holds one row per android/apple combination.
crossed_all_data.head()