<a href="https://colab.research.google.com/github/Branden-Kang/Web-Scraper/blob/master/How_I_build_my_first_python_web_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Reference](https://carloalbertocarrucciu.medium.com/how-i-build-my-first-python-web-scraper-for-infinite-scrolling-web-pages-52d744c7873c)

In [1]:
import urllib.request
import json 

# Category slug -> numeric category id (kept as a string).
# Insertion order is the order in which show_categories() lists the names.
categories = dict(
    [
        ("other", "32"),
        ("bd-illustration", "33"),
        ("movies", "34"),
        ("food", "35"),
        ("geek", "36"),
        ("video-game", "38"),
        ("humour", "37"),
        ("journalism", "39"),
        ("books", "40"),
        ("fashion", "41"),
        ("music", "42"),
        ("photography", "43"),
        ("science-technology", "44"),
        ("performing-arts", "45"),
        ("sports", "46"),
        ("vlog", "47"),
        ("streaming", "52"),
    ]
)

# Default HTTP request headers sent with every call to requesting().
superheaders = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

def show_categories(categories = categories):
    """
    Print every admissible category name, one per line.

    Parameters
    ----------
    categories : mapping whose keys are the category names
        (defaults to the module-level ``categories`` table).
    """
    names = categories.keys()
    for name in names:
        print(name)
        
# Request one single JSON web page.
def requesting(url, headers = superheaders):
    """
    Fetch ``url`` and return its body parsed as JSON.

    Parameters
    ----------
    url : str
        Address of the page to download.
    headers : dict
        HTTP headers sent with the request (defaults to ``superheaders``).

    Returns
    -------
    The object produced by ``json.loads`` on the decoded response body.

    Raises
    ------
    Whatever ``urllib.request.urlopen`` raises on network/HTTP failure, and
    ``json.JSONDecodeError`` (a ``ValueError``) if the body is not valid JSON.
    """
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the HTTP response even when reading fails,
    # so repeated calls while paging do not leak open connections.
    with urllib.request.urlopen(req) as response:
        data = response.read()
        # Use the charset declared by the server; fall back to UTF-8.
        encoding = response.info().get_content_charset('utf-8')
    return json.loads(data.decode(encoding))

In [2]:
class Creators:
    """
    Collection of creators scraped from the Tipeee projects API.

    ``scrape()`` downloads the raw items, ``get_creators()`` wraps them in
    ``Creator`` objects and ``to_dataframe()`` exports them to pandas.
    Iterating yields the ``'slug'`` entry of each wrapped creator, ``len()``
    is the number of wrapped creators and ``repr()`` shows the wrapped list.
    """

    # Languages accepted for the `lang` query parameter; anything else becomes 'en'.
    _SUPPORTED_LANGS = ('en', 'de', 'fr', 'es', 'it')

    def __init__(self, lang='en'):
        self.lang = lang
        self.scraped = list()    # raw items returned by the last scrape()
        self.creators = list()   # Creator objects built by get_creators()

    def __iter__(self):
        for elem in self.creators:
            yield elem['slug']

    def __len__(self):
        return len(self.creators)

    def __repr__(self):
        return str(self.creators)

    def _params_setting(self, category, limit, headers, lang):
        """Validate scrape() arguments and turn them into URL-ready pieces."""
        if limit is not None:
            assert type(limit) == int, 'limit must be an int or None'
        # An omitted language falls back to the one given to the constructor.
        if lang is None:
            lang = self.lang
        if lang not in self._SUPPORTED_LANGS:
            lang = 'en'
        mode = 'default'
        if category is None:
            category = ''
        elif category in categories:
            category = '&category=' + str(categories[category])
            mode = 'category'
        else:
            print('wrong value for category: will not be considered')
            # Drop the bad value so it is not appended raw to the URL.
            category = ''
        return limit, mode, category, headers, lang

    def scrape(self, limit=None, category=None, headers=superheaders, lang=None):
        '''
        Download creators page by page, store them and return the raw items.

        Parameters
        ----------
        limit : int, optional
            The number of creators to get; if not specified, all creators are
            fetched. Creators are crawled in the order given by the site.
            N.B. without a limit this can take some time.
        category : str, optional
            Restrict the result to creators of that category.
            N.B. run show_categories() to see admissible values; an unknown
            value is reported and then ignored.
        headers : dict
            HTTP headers forwarded to requesting().
        lang : str, optional
            One of 'en', 'de', 'fr', 'es', 'it'. When omitted, the language
            given to the constructor is used; unsupported values become 'en'.

        Returns
        -------
        list
            The raw items as returned by the API (also kept in ``self.scraped``).
            ``get_creators()`` is called before returning, so the wrapped
            objects are appended to ``self.creators``.
        '''
        # Parameters setting
        limit, mode, category, headers, lang = self._params_setting(category, limit, headers, lang)
        page = '1'
        base_url = 'https://api.tipeee.com/v2.0/projects?mode={}&page={}&perPage=150&lang={}{}'
        creators_list = list()

        # Collecting data: `limit is None` means "keep going until the pages run out".
        while limit is None or len(creators_list) < limit:
            data = requesting(base_url.format(mode, page, lang, category), headers)
            items = data['items']
            creators_list += items
            try:
                page = data['pager']['next']
            except (KeyError, TypeError):
                # No pager information: this was the last page.
                break
            # NOTE(review): presumably the API signals the end with an empty
            # page or a null `next` -- stopping here avoids looping forever
            # when no limit is given; confirm against the API.
            if not items or not page:
                break
        if limit is not None and len(creators_list) > limit:
            creators_list = creators_list[:limit]

        self.scraped = creators_list
        self.get_creators()
        return creators_list

    def get_creators(self):
        '''
        Transform each scraped item into an element of class Creator, so that
        methods can be called on it.

        The new objects are appended to ``self.creators``. Conversion stops at
        the first item that cannot be wrapped; that item and the reason are
        printed.
        '''
        for item in self.scraped:
            try:
                self.creators.append(Creator(item))
            except Exception as exc:
                print(item)
                # Surface the cause instead of silently swallowing it.
                print('could not build Creator: {!r}'.format(exc))
                break

    def to_dataframe(self, lang=None):
        """
        Return the wrapped creators as a pandas DataFrame indexed by 'id'.

        PARAMS:
            lang: intended to choose the language for categories; accepted
                  for compatibility but not used by this method.

        RETURNS:
            pandas.DataFrame with one row per creator (built from each
            creator's ``to_dict()``), or None when there are no creators.
        """
        if len(self.creators) == 0:
            return None
        import pandas
        columns = ['id', 'username', 'lang', 'categories', 'tipperAmount', 'tipperNumber', 'newsNumber']
        # Build all rows first and create the frame once: DataFrame.append was
        # removed in pandas 2.0 and appending in a loop is quadratic.
        records = [creator.to_dict() for creator in self.creators]
        df = pandas.DataFrame(records)
        # Expected columns first (created as NaN when absent), extras after.
        extra = [col for col in df.columns if col not in columns]
        df = df.reindex(columns=columns + extra)
        return df.set_index('id')
        

In [3]:
# Print the admissible category names, one per line.
show_categories()

other
bd-illustration
movies
food
geek
video-game
humour
journalism
books
fashion
music
photography
science-technology
performing-arts
sports
vlog
streaming


In [5]:
# Initialize the (still empty) collection.
creators = Creators()
# Scrape the site with a limit on the number of creators to collect and a
# category; scrape() also turns each scraped item into a Creator element.
creators.scrape(limit=100, category='vlog')
# Export the collected creators as a pandas dataframe and display it.
df = creators.to_dataframe()
df

{'translations': {'en': {'name': 'Octaviancarare'}}, 'id': 274497, 'slug': 'octaviancarare', 'lang': 'en', 'status': 'OPEN', 'created_at': '2021-04-29T10:25:18+02:00', 'tags': [], 'categories': [{'translations': {'de': {'name': 'Fotografie'}, 'en': {'name': 'Photo'}, 'es': {'name': 'Fotografía'}, 'fr': {'name': 'Photo'}, 'it': {'name': 'Fotografia'}}, 'id': 43, 'slug': 'photography', 'name': 'Photo'}, {'translations': {'de': {'name': 'Video Blog'}, 'en': {'name': 'Vlog'}, 'es': {'name': 'Videoblog'}, 'fr': {'name': 'Vlog'}, 'it': {'name': 'Vlog'}}, 'id': 47, 'slug': 'vlog', 'name': 'Vlog'}], 'avatar': {'id': 1996980, 'filename': '20210429608a6d957e646.jpeg', 'path': 'uploads/media/image/jpeg/20210429', 'updated_at': '2022-04-21T13:30:16+02:00'}, 'cover': {'id': 1996981, 'filename': '20210429608a6f262cbc6.jpeg', 'path': 'uploads/media/image/jpeg/20210429', 'updated_at': '2022-04-21T13:30:17+02:00'}, 'name': '', 'parameters': {'hidedAmount': False, 'tipperAmount': '7', 'tipperNumber': '2