# Data collection activities
All data collection activities are automated with user-defined functions, which are available in the `scripts` folder.

In [1]:
from bs4 import BeautifulSoup
import requests as requests
from scripts import user, item, discussion
import csv
from tqdm import tqdm 

## Discussion items
The following code saves the data of discussion items to a CSV file, given a set of item codes chosen by the operator.

First of all, we need to initialize the files used for data collection.

In [2]:
# Destination CSV for the scraped discussion items, plus its header row.
file_path_discussion = "../data/discussion.csv"
row_head_discussion = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# 'w' mode (re)creates the file, so after this cell it contains the header only.
with open(file_path_discussion, mode='w', encoding='utf_8_sig', newline="") as header_file:
    csv.writer(header_file).writerow(row_head_discussion)

In [3]:
# Scrape a random sample of 150 distinct item codes rather than walking the
# codes one after another ('progressive item mode').
from random import Random, sample

SAMPLE_SEED = 42         # fixed seed: a fresh kernel draws the same item codes again
SAMPLE_SIZE = 150        # number of distinct item codes to scrape
ITEM_CODE_STOP = 200000  # exclusive upper bound, i.e. codes 1..199999 are eligible

# Sampling directly from the range avoids building a 200k-element list first,
# and a dedicated Random instance leaves the global random state untouched.
sampled_items = Random(SAMPLE_SEED).sample(range(1, ITEM_CODE_STOP), SAMPLE_SIZE)


In [28]:
# Echo the sampled item codes so the scraped set is recorded in the notebook output.
print(sampled_items)

[60852, 30727, 141119, 135201, 74587, 43310, 37643, 139840, 85041, 102936, 145130, 64314, 10312, 154808, 77468, 96489, 163958, 189717, 37057, 16543, 67977, 158433, 71592, 6767, 178705, 15959, 62838, 131430, 148079, 20347, 113037, 183793, 193808, 104980, 2806, 183829, 117334, 87730, 78903, 181007, 112217, 167858, 86570, 28219, 93936, 129856, 186637, 61919, 34284, 32047, 46496, 20365, 144555, 119204, 43061, 149761, 28849, 157570, 16936, 98405, 45101, 127382, 191179, 31390, 37027, 121484, 17256, 141055, 138534, 86451, 179431, 137431, 185651, 94509, 106753, 15123, 14819, 21238, 186663, 95359, 30246, 12096, 71173, 125275, 85478, 74808, 62620, 82445, 51172, 115889, 34083, 157563, 141479, 107110, 75639, 62313, 128159, 188151, 180339, 42751, 6249, 86561, 105884, 39909, 42235, 199542, 139991, 44857, 36249, 158357, 79797, 177707, 89989, 186253, 112155, 14642, 160415, 181476, 180751, 108913, 111744, 75344, 22632, 3323, 84090, 190821, 109451, 176994, 116618, 48292, 138286, 2012, 64946, 166739, 710

In [4]:
# Scrape every sampled item page and append the items classified as
# 'discussion' to the CSV file at `file_path_discussion`.
REQUEST_TIMEOUT_S = 30   # without a timeout a stalled connection blocks the loop forever
skipped_discussion = []  # (item code, reason) pairs collected for the summary below

for i in tqdm(sampled_items):
    url_posts = f'https://stacker.news/items/{i}'

    # Fetch the page; a network or HTTP failure skips this item only.
    try:
        response = requests.get(url_posts, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
    except requests.RequestException as err:
        skipped_discussion.append((i, f'request failed: {err}'))
        continue

    # Parse and extract. Catching `Exception` instead of using a bare `except`
    # keeps the best-effort behaviour but lets Ctrl-C / kernel interrupt stop the run.
    try:
        # Provided a string returns a bs4.BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        if item.detect_item_type(i, soup) != 'discussion':
            continue

        entry = [discussion.extract_title(soup),
                 str(i),
                 discussion.extract_banner(soup),
                 discussion.extract_body_links(soup),
                 discussion.extract_comment_stacked(soup),
                 discussion.extract_comment_item_code(soup)
                 ]
    except Exception as err:
        skipped_discussion.append((i, f'extraction failed: {err!r}'))
        continue

    # Append each discussion item right away so rows already scraped are kept
    # even if a later iteration fails.
    try:
        with open(file_path_discussion, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(entry)
    except Exception as err:
        print(f'Error while processing data (item {i}): {err!r}')

# One-line summary instead of silently dropping failures; the details stay
# available in `skipped_discussion` for inspection.
if skipped_discussion:
    print(f'Skipped {len(skipped_discussion)} of {len(sampled_items)} items because of errors')


100%|██████████| 150/150 [02:40<00:00,  1.07s/it]


In [29]:
# Tryout without item classification: every sampled item that can be fetched
# and parsed is written, whatever its type.
# NOTE(review): this appends to the same CSV as the classified loop, so if both
# cells are run the discussion items end up in the file twice — confirm intent.
REQUEST_TIMEOUT_S = 30  # without a timeout a stalled connection blocks the loop forever
skipped_tryout = []     # (item code, reason) pairs collected for the summary below

for i in tqdm(sampled_items):
    url_posts = f'https://stacker.news/items/{i}'

    # Fetch the page; a network or HTTP failure skips this item only.
    try:
        response = requests.get(url_posts, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
    except requests.RequestException as err:
        skipped_tryout.append((i, f'request failed: {err}'))
        continue

    # Parse and extract. Catching `Exception` instead of using a bare `except`
    # keeps the best-effort behaviour but lets Ctrl-C / kernel interrupt stop the run.
    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        entry = [discussion.extract_title(soup),
                 str(i),
                 discussion.extract_banner(soup),
                 discussion.extract_body_links(soup),
                 discussion.extract_comment_stacked(soup),
                 discussion.extract_comment_item_code(soup)
                 ]
    except Exception as err:
        skipped_tryout.append((i, f'extraction failed: {err!r}'))
        continue

    # Append each item right away so rows already scraped are kept even if a
    # later iteration fails.
    try:
        with open(file_path_discussion, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(entry)
    except Exception as err:
        print(f'Error while processing data (item {i}): {err!r}')

# One-line summary instead of silently dropping failures; the details stay
# available in `skipped_tryout` for inspection.
if skipped_tryout:
    print(f'Skipped {len(skipped_tryout)} of {len(sampled_items)} items because of errors')

100%|██████████| 150/150 [01:34<00:00,  1.60it/s]
