# Data collection activities
All data collection activities are automated using user-defined functions found in the `scripts` folder.

In [22]:
from bs4 import BeautifulSoup
import requests as requests
from scripts import user, item, discussion, link
import csv
from tqdm import tqdm 

# Item scraping
The following code saves item data into CSV files (one for discussion items and one for link items), given a set of item codes chosen by the operator.

First of all, we need to initialize all the files for data collection.

## Initialization of csv files

In [23]:
# Discussion items
# Creates (or truncates) the discussion CSV and writes its header row.
# The header lists exactly the six fields that the scraping loop writes for a
# discussion item, in the same order. Discussions are written without a
# "Main link" field, so that column is not part of this file.
file_path_discussion = "../data/discussion.csv"
row_head_discussion = ["Title",
                       "Item code",
                       "Banner data",
                       "Body links",
                       "Sats received by comments",
                       "Comments item code",
                       ]

# 'utf_8_sig' writes a BOM at the start of the file; newline="" lets the csv
# module control line endings itself.
with open(file_path_discussion, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_discussion)

In [24]:
# Link items
# Creates (or truncates) the link CSV and writes its header row: one column
# per field that is saved for every scraped link item.
file_path_link = "../data/link.csv"
row_head_link = [
    "Title",
    "Item code",
    "Banner data",
    "Main link",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# 'utf_8_sig' writes a BOM at the start of the file; newline="" lets the csv
# module control line endings itself.
with open(file_path_link, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_link)


In [25]:
# Scrape a random sample of item codes rather than a contiguous run of codes
# ('progressive item mode').
from random import seed, sample

ITEM_CODE_START = 1        # lowest item code eligible for sampling (inclusive)
ITEM_CODE_STOP = 200_000   # upper bound of the eligible item codes (exclusive)
N_SAMPLED_ITEMS = 150      # number of distinct item codes to draw
SAMPLE_SEED = None         # set to an int to draw the same sample on every run

# Seeding is opt-in: with SAMPLE_SEED left as None the generator is untouched
# and every run draws a fresh sample.
if SAMPLE_SEED is not None:
    seed(SAMPLE_SEED)

# `sample` accepts any sequence, so the range is sampled directly instead of
# first being expanded into a 200k-element list. Codes are drawn without
# replacement, so they are all distinct.
sampled_items = sample(range(ITEM_CODE_START, ITEM_CODE_STOP), N_SAMPLED_ITEMS)


In [26]:
# Print the sampled item codes so the set used for this run is visible in the cell output
print(sampled_items)

[117051, 172917, 23628, 122322, 148790, 118583, 126295, 170115, 44573, 79661, 61754, 198407, 38996, 42937, 22607, 101291, 53826, 181380, 23293, 139928, 106326, 3493, 159436, 151087, 128425, 89287, 78252, 37951, 10085, 111728, 132099, 104125, 134642, 136172, 42156, 80426, 160539, 58713, 54915, 67151, 104360, 111163, 129251, 100956, 123294, 185415, 151718, 169572, 184756, 31511, 157624, 18738, 44112, 83132, 149515, 16285, 146816, 153359, 157612, 49047, 132319, 61280, 34743, 146305, 89583, 127563, 97234, 54849, 63692, 139817, 2250, 161890, 151022, 91955, 64231, 151172, 97236, 164425, 170670, 184669, 189817, 91620, 174945, 30635, 167266, 166996, 135345, 15968, 75857, 62972, 32651, 297, 55264, 47321, 171565, 5680, 168979, 61816, 51375, 99554, 5406, 92492, 109507, 49725, 180073, 197577, 117829, 184161, 58322, 96259, 47234, 170970, 154075, 55812, 150012, 141341, 127630, 165882, 43734, 48442, 115406, 45174, 70018, 121171, 122501, 7186, 38308, 183350, 168510, 171573, 114810, 143559, 172233, 183

In [27]:
# Scrape every sampled item and append it to the CSV file matching its type.
#
# Scraping is best-effort: an item that cannot be fetched, parsed or written is
# skipped and recorded in `failed_items`, so one bad page does not stop the
# run. Only `Exception` is caught, so Ctrl-C / kernel interrupt still stops
# the loop.

# Requests applies no timeout by default; without one a stalled connection
# would hang the whole loop.
REQUEST_TIMEOUT = 30  # seconds
failed_items = []     # (item code, repr of the error) for every skipped item

for i in tqdm(sampled_items):
    url_posts = f'https://stacker.news/items/{i}'
    try:
        # Fetch the item page and parse it into a bs4.BeautifulSoup object
        response = requests.get(url_posts, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Classify the page once and reuse the result for both branches
        item_type = item.detect_item_type(i, soup)

        if item_type == 'discussion':
            entry = [discussion.extract_title(soup),
                     str(i),
                     discussion.extract_banner(soup),
                     discussion.extract_body_links(soup),
                     discussion.extract_comment_stacked(soup),
                     discussion.extract_comment_item_code(soup)
                     ]
            target_path = file_path_discussion
        elif item_type == 'link':
            entry = [link.extract_title(soup),
                     str(i),
                     link.extract_banner(soup),
                     link.extract_link(soup),
                     link.extract_body_links(soup),
                     link.extract_comment_stacked(soup),
                     link.extract_comment_item_code(soup)
                     ]
            target_path = file_path_link
        else:
            # Any other item type is not collected
            continue
    except Exception as exc:
        # Network error, HTTP error status or extraction failure: skip the item
        failed_items.append((i, repr(exc)))
        continue

    # Append the new item to the csv file for its type
    try:
        with open(target_path, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(entry)
    except Exception as exc:
        print('Error while processing data')
        failed_items.append((i, repr(exc)))

# One summary line instead of a message per item keeps the cell output short
if failed_items:
    print(f'{len(failed_items)} of {len(sampled_items)} items were skipped; see `failed_items` for details')


100%|██████████| 150/150 [03:10<00:00,  1.27s/it]
