# Refactored and Redesigned Scraper

## Scraper Superclass

In [1]:
import praw
import twitter
from os.path import isfile
import pandas as pd
from time import sleep


class Scraper:
    """Base class that searches a source for posts and stores them in a CSV.

    Constructing an instance immediately validates the query, collects posts
    through the subclass hooks and writes the result to ``csv``.

    Subclasses must override:

    * ``set_sort()`` -- return an iterable of posts for ``self.query``.
    * ``parse_post(post)`` -- append one post's fields to ``self.sub_dict``.

    Attributes set while running:
        sub_dict: dict of column name -> list of values gathered this run.
        df: previously saved posts as a DataFrame, or ``''`` when the CSV
            does not exist yet.
        csv_loaded: ``1`` when ``df`` was read from an existing CSV, else ``0``.
    """

    def __init__(self, query, csv, sort=None):
        """Validate ``query``, scrape posts and save them to ``csv``.

        Args:
            query: search term; must be a ``str``.
            csv: path of the CSV file to create or append to.
            sort: sort mode, interpreted by the subclass.

        Raises:
            AttributeError: if ``query`` is not a string-like object.
        """
        self.query = query
        self.csv = csv
        self.sort = sort
        self.check_query_valid()
        self.get_posts()
        self.save_posts()

    def check_query_valid(self):
        """Reject queries that are not strings.

        A ``str`` query is left exactly as given. Any other object is pushed
        through ``.replace``; objects lacking that method (ints, ``None``, ...)
        raise ``AttributeError``, which is how invalid input is reported.
        """
        if not isinstance(self.query, str):
            # Deliberately unguarded: the AttributeError raised here for
            # non-string input is the validation failure signal.
            self.query = self.query.replace(" ", "%20")

    def set_sort(self):
        """Return an iterable of posts for ``self.query`` (subclass hook).

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError

    def parse_post(self, post):
        """Record one post in ``self.sub_dict`` (subclass hook).

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError

    def get_posts(self):
        """Load any existing CSV, then feed every fetched post to ``parse_post``.

        Resets ``self.sub_dict`` on every call, so only posts from the most
        recent call are kept in memory.
        """
        self.sub_dict = {'game_title': [], 'post_title': [], 'id': [], 'score': []}

        # Existing rows are loaded so subclasses can skip posts already saved.
        if isfile(self.csv):
            self.df, self.csv_loaded = pd.read_csv(self.csv), 1
        else:
            self.df, self.csv_loaded = '', 0

        print(self.query)
        for post in self.set_sort():
            self.parse_post(post)

    def save_posts(self):
        """Write the collected posts to ``self.csv``, appending to prior rows."""
        new_df = pd.DataFrame(self.sub_dict)

        # Append to the previously saved rows when a CSV was loaded,
        # otherwise start a fresh file.
        if isinstance(self.df, pd.DataFrame):
            combined = pd.concat([self.df, new_df], axis=0, sort=False)
            combined.to_csv(self.csv, index=False)
            print(f'{len(new_df)} new posts collected and added to {self.csv}')
        else:
            new_df.to_csv(self.csv, index=False)
            print(f'{len(new_df)} posts collected and saved to {self.csv}')

## Subreddit Scraper Subclass

In [2]:
# Replace the Reddit API credentials below with your own; do not commit real secrets.

class SubredditScraper(Scraper):
    """Scraper that searches r/all on Reddit through PRAW."""

    def __init__(self, query, csv='game_titles.csv', sort='new'):
        """Create the Reddit client, then run the base-class scrape.

        Args:
            query: search term passed to the Reddit search.
            csv: CSV path to create or append to.
            sort: ``'new'``, ``'top'`` or ``'hot'``; anything else falls
                back to relevance.
        """
        # NOTE(review): credentials are hard-coded in source; consider
        # loading them from configuration instead.
        reddit_credentials = {
            'client_id': "tpkjzcHMAynTvw",
            'client_secret': "WdtCdrgnWMfZphmCgkl_fVFH-uA",
            'user_agent': "scraper",
            'username': "Whatchamacalit1",
            'password': "YEET",
        }
        self.scraper = praw.Reddit(**reddit_credentials)
        Scraper.__init__(self, query, csv, sort)

    def set_sort(self):
        """Return last month's r/all search results for ``self.query``.

        An unrecognized ``self.sort`` is replaced with ``'relevance'`` and the
        search is issued without an explicit sort argument.
        """
        recognized = self.sort in ('new', 'top', 'hot')
        if not recognized:
            self.sort = 'relevance'
            print('Sort method was not recognized, defaulting to relevance.')

        everything = self.scraper.subreddit('all')
        if recognized:
            return everything.search(self.query, sort=self.sort, time_filter='month')
        return everything.search(self.query, time_filter='month')

    def parse_post(self, post):
        """Store ``post`` unless its id is already present in the loaded CSV."""
        if self.csv_loaded:
            already_saved = post.id in tuple(self.df.id)
        else:
            already_saved = False

        # Only posts whose id has not been saved before are recorded.
        if not already_saved:
            collected = self.sub_dict
            collected['game_title'].append(self.query)
            collected['post_title'].append(post.title)
            collected['id'].append(post.id)
            collected['score'].append(post.score)

        # Pause after every post, whether or not it was stored.
        sleep(0.1)

## Twitter Scraper Subclass

In [3]:
# Replace the Twitter API credentials below with your own; do not commit real secrets.

class TwitterScraper(Scraper):
    """Scraper that collects retweeted tweets matching a query via python-twitter."""

    def __init__(self, query, csv='game_titles_twitter.csv', sort='mixed'):
        """Create the Twitter client, then run the base-class scrape.

        Args:
            query: search term appended to the raw search query string.
            csv: CSV path to create or append to.
            sort: stored on the instance; ``set_sort`` here does not read it.
        """
        # NOTE(review): credentials are hard-coded in source; consider
        # loading them from configuration instead.
        self.scraper = twitter.Api(consumer_key='foYe2yl6efwHz9kJQiyI7vuSp',
                                   consumer_secret="fITQUjsbsC938lb5VQpzFaCuW6d3LXhC5e34AkCtL5xkIvj1BD",
                                   access_token_key='1110034001052471298-Bo7P9PV9bTlzIfxDT00DvW2TB4PsB8',
                                   access_token_secret='JXh4LO37BLVqyiAhrXvwlh8riHe9EhTEnpqtAbXejz1WR'
                                  )
        Scraper.__init__(self, query, csv, sort)

    def set_sort(self):
        """Return the search results for ``self.query``.

        Raises:
            RuntimeError: if the rate-limit lookup reports no remaining calls.
        """
        # NOTE(review): this inspects a freshly constructed RateLimit object
        # rather than one owned by self.scraper -- confirm it reflects the
        # account's actual remaining quota.
        ratelim = twitter.ratelimit.RateLimit()
        remaining = ratelim.get_limit("https://api.twitter.com/1.1/application/rate_limit_status.json").remaining
        if remaining <= 0:
            print("Exceeded Rate Limit")
            # The original raised the undefined name ``Error`` (a NameError);
            # raise a real exception carrying the same message instead.
            raise RuntimeError("Exceeded Rate Limit")

        search = "q=" + self.query
        return self.scraper.GetSearch(raw_query=search)

    def parse_post(self, post):
        """Store the original tweet behind ``post`` when ``post`` is a retweet.

        Posts without a ``retweeted_status`` are ignored, as are tweets whose
        text already appears in the ``post_title`` column of the loaded CSV.
        """
        original = post.retweeted_status
        if original is None:
            return

        if self.csv_loaded:
            not_copied = original.text not in tuple(self.df.post_title)
        else:
            not_copied = True

        if not_copied:
            print(original.favorite_count)
            # Record the retweeted tweet's text and favorite count; no id is
            # stored for tweets, so the column gets an empty string.
            self.sub_dict['game_title'].append(self.query)
            self.sub_dict['post_title'].append(original.text)
            self.sub_dict['id'].append('')
            self.sub_dict['score'].append(original.favorite_count)

## Tests

In [4]:
import unittest
import os

class TestScrapingMethods(unittest.TestCase):
    """Tests for both scrapers; each case builds a real scraper instance."""

    def test_bad_input_reddit(self):
        """A non-string query makes SubredditScraper raise AttributeError."""
        with self.assertRaises(AttributeError):
            SubredditScraper(query=123, sort='new')

    def test_good_input_reddit(self):
        """A valid query leaves game_titles.csv on disk."""
        reddit_scraper = SubredditScraper(query='Over Watch', sort='new')
        reddit_scraper.get_posts()
        self.assertTrue(os.path.isfile('./game_titles.csv'))

    def test_bad_input_twitter(self):
        """A non-string query makes TwitterScraper raise AttributeError."""
        with self.assertRaises(AttributeError):
            TwitterScraper(query=123, sort='new')

    def test_good_input_twitter(self):
        """A valid query leaves game_titles_twitter.csv on disk."""
        twitter_scraper = TwitterScraper(query='Over Watch', sort='new')
        twitter_scraper.get_posts()
        self.assertTrue(os.path.isfile('./game_titles_twitter.csv'))
 

# Run the test suite when this code is executed directly.
# argv is overridden so unittest does not parse the host process's own
# command-line arguments, and exit=False keeps unittest from calling
# sys.exit() once the run finishes.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

..

Over Watch
0 new posts collected and added to game_titles.csv
Over Watch


.

Over Watch


  app.launch_new_instance()
.

11003
4
11003
76
97
0
6 posts collected and saved to game_titles_twitter.csv
Over Watch
97



----------------------------------------------------------------------
Ran 4 tests in 26.169s

OK
