# Scrapy Scraper

Below is our web scraper on Google Play built with the help of the Scrapy library. It works by first visting pages containing free top-selling apps for different categories and then extract their urls. Then it visits all app pages and extract the app name and description. When it is done it stores the data on file.

In [None]:
import scrapy
import json
import random
import warnings
warnings.filterwarnings('ignore')

from time import sleep
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor

class GooglePlaySpider(scrapy.Spider):
    name = 'googleplay_bot'
    allowed_domains = ['play.google.com']

    def __init__(self):
        super().__init__()

        self.app_name_desc = {}

        self.base_url = "https://play.google.com"
        self.base_category_url = self.base_url + "/store/apps/category/%s/collection/topselling_free"
        self.english_language_specifier_site = "?hl=en"
        self.english_language_specifier_app = "&hl=en"

        self.categories = ["EDUCATION", "MEDICAL", "DATING", "COMMUNICATION",
                           "FINANCE", "HEALTH_AND_FITNESS",
                           "MUSIC_AND_AUDIO", "PERSONALIZATION", "ENTERTAINMENT",
                           "EVENTS", "COMICS", "BEAUTY", "LIFESTYLE", "MAPS_AND_NAVIGATION",
                           "HOUSE_AND_HOME", "BUSINESS", "BOOKS_AND_REFERENCES", "ART_AND_DESIGN",
                           "FOOD_AND_DRINK", "ANDROID_WEAR", "GAME_ACTION", "GAME_ADVENTURE"]

        self.apps_visited = set()
        self.max_apps = 1500
        self.app_xpath = "//a[@class='card-click-target']/@href"

        self.app_name_xpath = "//div[@class='id-app-title']/text()"
        self.app_desc_xpath = "//div[@itemprop='description']/div/text()"

        dispatcher.connect(self.spider_closed, signals.spider_closed)


    def start_requests(self):
        print("Start Requests")

        for category in self.categories:
            url = self.base_category_url % category + self.english_language_specifier_site
            yield scrapy.Request(url=url, callback=self.parse)


    def parse(self, response):
        print("Parsing", response.url)

        app_urls = set(response.xpath(self.app_xpath).extract())

        for app_url in app_urls:
            url = self.base_url + app_url + self.english_language_specifier_app

            if url not in self.apps_visited and len(self.apps_visited) < self.max_apps:
                self.apps_visited.add(url)
                yield scrapy.Request(url=url, callback=self.parse_app)

    def parse_app(self, response):
        print("Parsing app", response.url)

        name = " ".join(response.xpath(self.app_name_xpath).extract())
        desc = " ".join(response.xpath(self.app_desc_xpath).extract())

        self.app_name_desc[name] = desc

    def spider_closed(self, spider):
        with open("app_desc.json", "w", encoding = "utf-8") as file:
            json.dump(self.app_name_desc, file)


def main():
    runner = CrawlerRunner()
    d = runner.crawl(GooglePlaySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()

# Pre-processing

We built some helper functions to handle the pre-processing of the text for us. The pipeline follows the on proposed in the slides.

1. Tokenization
2. Removal of non-alphanumeric characters
3. Lowercase
4. Stopword removal
5. Stemming

Since the crawler saved the result on file, we just read the content and pre-process it, and again save it on file.

In [19]:
import nltk
# nltk.download("punkt")
# nltk.download("stopwords")

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

def tf_tokenize(text):
    return nltk.word_tokenize(text)

def rm_nonalphanum(words):
    return [''.join(ch for ch in word if ch.isalnum()) for word in words]

def tf_lowercase(words):
    return [word.lower() for word in words]

def rm_stopwords(words):
    stops = set(stopwords.words('english'))
    return [word for word in words if word not in stops]

def tf_stem(words):
    stemmer  = SnowballStemmer("english")
    return [stemmer.stem(word) for word in words]

def tf_join(words):
    return ' '.join(words)

def tf_process(text):
    function_list =[tf_tokenize, rm_nonalphanum, tf_lowercase,
                    rm_stopwords, tf_stem, tf_join]

    for function in function_list:
        text = function(text)

    return text

In [20]:
import json

with open('app_desc.json', 'r', encoding="utf-8") as infile:
    data = json.load(infile)

processed_text = {}

for name, desc in data.items():
    processed_text[name] = tf_process(desc)


with open("proc_app_desc.json", "w", encoding="utf-8") as outfile:
    json.dump(processed_text, outfile)

# Recommendation Engine

When we have the data processed we can create our recommendation engine. We use the term-document inverse-document-frequency (tfidf) representation of all our data which is provided by scikit-learn package. So we read the processed data and train our model (TfidfVectorizer) which gives us a matrix representation of our data. The engine is then mostly finished, all we need to do is give it a query that gets processed the same way the data was originally processed and then transform it to a tfidf-vector using our trained model. Then we use the cosine similarity metric to find the n-most similar applications, where n = 10 in this case. 

In [52]:
import json
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

with open('proc_app_desc.json', 'r', encoding="utf-8") as infile:
    data = json.load(infile)

descriptions = list(data.values())
app_names = list(data.keys())

vectorizer = TfidfVectorizer()
tfidf_desc_mat = vectorizer.fit_transform(descriptions)

def get_recommendations(query, n=10):
    query = [tf_process(query)]
    tfidf_query_vec = vectorizer.transform(query)

    search_result = cosine_similarity(tfidf_query_vec, tfidf_desc_mat).flatten().tolist()
    
    print(search_result)
    index = np.argsort(search_result, axis = 0)[::-1][:n]
    
    print(index.shape)
    print(index)
    recommendations = [app_names[i] for i in set(index)]

    return recommendations


print(get_recommendations("photo image social"))
print(get_recommendations("game video fun"))
print(get_recommendations("money debt loan"))


[0.0, 0.0, 0.02670046114362914, 0.01835433263871913, 0.0, 0.02048745236758785, 0.13027257439574969, 0.0, 0.013260309920862495, 0.12288817256375113, 0.021618653999475008, 0.0, 0.0, 0.013140821547827706, 0.0, 0.0, 0.028816674432152904, 0.0, 0.018709252670172163, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04466895678665255, 0.0, 0.06650040061152249, 0.0, 0.0, 0.0, 0.0, 0.028865072149724634, 0.0, 0.021209784112144255, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1270745888702049, 0.010653065059490823, 0.0, 0.0, 0.0, 0.015719096706941733, 0.0, 0.0, 0.031600100320292, 0.0, 0.0, 0.0, 0.03754460308466494, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018261342509611492, 0.0, 0.0, 0.0, 0.017169139596285153, 0.01878938552178481, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0405433909647822, 0.0, 0.05151081838614496, 0.0, 0.0, 0.0, 0.0, 0.03409435968201292, 0.0, 0.0, 0.012654850249840826, 0.0, 0.0, 0.08362648595449755, 0.0, 0.056883449064401466, 0.0634305779911741, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04532

In [None]:
We can see that the recommendations seem appropriate. 