# Data collection activities
All data collection activities are automated through user-defined functions available in the `scripts` folder.

In [9]:
from bs4 import BeautifulSoup
import requests as requests
from scripts import user, item, discussion, link, poll, bounty
import csv
from tqdm import tqdm 

# Item scraping
The following code saves item data into CSV files (one per item type), given a set of item codes chosen by the operator.

First of all, we need to initialize all the files used for data collection.

## Initialization of csv files

In [10]:
# Discussion items
# Destination file for discussion rows and the column names of its header.
file_path_discussion = "../data/discussion.csv"

# The header lists exactly the six values written for every discussion row
# (title, item code, banner, body links, comment sats, comment item codes).
# Every name is followed by a comma: adjacent string literals without one are
# silently concatenated by Python into a single column name.
row_head_discussion = ["Title",
                       "Item code",
                       "Banner data",
                       "Body links",
                       "Sats received by comments",
                       "Comments item code",
                       ]

# Mode 'w' creates the file or truncates an existing one, so only the header
# row is left in it; newline="" lets the csv module control line endings.
with open(file_path_discussion, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_discussion)

In [11]:
# Link items
# Destination file for link rows, followed by the names of its columns.
file_path_link = "../data/link.csv"
row_head_link = [
    "Title",
    "Item code",
    "Banner data",
    "Main link",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create (or truncate) the file and write the header as its only row.
with open(file_path_link, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([row_head_link])


In [12]:
# Poll items
# Destination file for poll rows, followed by the names of its columns.
file_path_poll = "../data/poll.csv"
row_head_poll = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create (or truncate) the file and write the header as its only row.
with open(file_path_poll, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([row_head_poll])


In [13]:
# Bounty items
# Destination file for bounty rows, followed by the names of its columns.
file_path_bounty = "../data/bounty.csv"
row_head_bounty = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create (or truncate) the file and write the header as its only row.
with open(file_path_bounty, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([row_head_bounty])

In [14]:
# Scrape a random sample of 200 distinct item codes rather than walking the
# codes in 'progressive item mode'.
from random import Random, sample

# Sampling parameters, named so they can be tuned in one place.
ITEM_CODE_START = 1        # lowest item code that can be drawn
ITEM_CODE_STOP = 200000    # exclusive upper bound of the item codes
N_SAMPLED_ITEMS = 200      # number of distinct item codes to draw
SAMPLE_SEED = 42           # fixed seed: a re-run draws the same item codes

# A dedicated Random instance leaves the module-level generator untouched.
# Sampling directly from the range object avoids building a list of every
# candidate code first.
sampled_items = Random(SAMPLE_SEED).sample(
    range(ITEM_CODE_START, ITEM_CODE_STOP), N_SAMPLED_ITEMS
)


In [15]:
# Display the full list of sampled item codes.
print(sampled_items)

[4122, 101221, 119598, 45764, 25264, 165557, 67921, 66819, 173882, 137321, 195291, 117750, 50240, 8846, 133159, 91006, 120190, 14819, 123144, 173231, 173563, 55963, 195912, 160645, 197813, 14092, 12903, 141529, 122120, 14060, 85310, 53511, 130617, 151593, 68220, 48864, 67804, 168603, 115590, 58944, 196704, 3349, 175045, 174626, 114051, 189467, 43844, 189433, 51442, 35397, 132747, 89567, 67741, 95651, 109942, 85799, 182477, 165126, 68575, 180039, 167063, 103776, 20753, 16515, 15962, 40173, 184196, 3601, 176378, 186725, 22257, 110766, 101046, 99426, 198485, 160150, 120309, 17986, 63270, 109122, 168077, 160886, 157929, 16089, 34721, 131683, 39687, 11549, 151099, 125437, 183017, 9432, 13034, 111302, 184927, 75044, 189457, 89402, 27708, 137628, 114098, 184876, 151194, 88696, 62230, 92218, 176718, 171402, 88079, 172689, 18559, 26050, 59447, 69385, 142337, 1384, 102788, 23991, 40962, 140192, 61835, 15391, 180066, 144121, 103063, 99926, 77336, 158473, 124152, 47432, 157672, 60727, 152227, 8090

In [16]:
def append_csv_row(file_path, row):
    """Append `row` as one record to the CSV file at `file_path`."""
    with open(file_path, 'a', encoding='utf_8_sig', newline="") as csvfile:
        csv.writer(csvfile).writerow(row)


def build_entry(scraper, item_code, soup, with_main_link=False):
    """Build the list of CSV fields for one item.

    Parameters
    ----------
    scraper : module
        One of the `scripts` modules (discussion, link, poll, bounty) whose
        `extract_*` functions are applied to `soup`.
    item_code : int
        Item code of the page; stored in the row as a string.
    soup : bs4.BeautifulSoup
        Parsed page of the item.
    with_main_link : bool, optional
        When True, the value of `scraper.extract_link(soup)` is inserted
        right after the banner field (used for link items only).

    Returns
    -------
    list
        Title, item code, banner, [main link,] body links, comment sats and
        comment item codes, in that order.
    """
    entry = [scraper.extract_title(soup),
             str(item_code),
             scraper.extract_banner(soup),
             ]
    if with_main_link:
        entry.append(scraper.extract_link(soup))
    entry.extend([scraper.extract_body_links(soup),
                  scraper.extract_comment_stacked(soup),
                  scraper.extract_comment_item_code(soup),
                  ])
    return entry


# Item type -> (scraper module, destination csv, row includes a main link?)
ITEM_TARGETS = {
    'discussion': (discussion, file_path_discussion, False),
    'link': (link, file_path_link, True),
    'poll': (poll, file_path_poll, False),
    'bounty': (bounty, file_path_bounty, False),
}

# Seconds to wait for the server before giving up on an item; without a
# timeout a single stalled connection would block the whole run.
REQUEST_TIMEOUT_S = 30

# Item code -> description of the error that made the item be skipped
skipped_items = {}

for i in tqdm(sampled_items):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Detect the type once and look up where and how to store the item
        target = ITEM_TARGETS.get(item.detect_item_type(i, soup))
        if target is None:
            # Items of any other type are not collected
            continue
        scraper, file_path, with_main_link = target
        entry = build_entry(scraper, i, soup, with_main_link)
    except Exception as err:
        # Best effort: a failed download or extraction skips this item only.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        skipped_items[i] = repr(err)
        continue

    # Appends every new item to the csv file of its type
    try:
        append_csv_row(file_path, entry)
    except Exception as err:
        print('Error while processing data')
        print(f'  item {i}: {err!r}')

# One summary line instead of silently dropping the failures
if skipped_items:
    print(f'Skipped {len(skipped_items)} of {len(sampled_items)} items '
          f'because of request or extraction errors (see skipped_items)')


100%|██████████| 200/200 [02:27<00:00,  1.36it/s]
