In [3]:
from bs4 import BeautifulSoup
import requests as requests
import time
import numpy as np
from scripts import user, item, discussion
from tqdm import tqdm 
import sqlite3

# Item extraction
Main steps in the code
1. Retrieve item webpage provided the item code
2. Detect item type:
    - Comment or post ?
    - If post, which kind of post:
        1. Discussion
        2. Link
        3. Poll
        4. Bounty
        5. Job
3. Retrieve title
4. Retrieve banner
    - Extract number of comments, **compulsory**
    - Extract stacked amount by the item, **if present**
    - Extract Boost value, **if present**
    - Extract username, **compulsory**
    - Extract timestamp, **compulsory**
    - Extract badge, **compulsory**
5. Extract amount stacked by comments, **compulsory**
6. Extract item code of comments **OR** extract user that commented

**Note that**:
- Some items have neither a stacked amount nor the possibility to receive sats. For example, all the posts of this kind were created by the user @saloon. Is he/she a bot? Is it an 'official bot' of the forum, and is that why it's not possible to give sats to it?


## Scraping comments

[Useful link to duplicate rows according to the values in a row. In our situation, we need to duplicate rows according to the usernames or according to the comment item number](https://saturncloud.io/blog/splitting-and-expanding-pandas-dataframes-based-on-column-values/)

## Testing the extraction of item type

**Since job offers do not store a lot of data about interactions with users (very few comments, only a handful of job offers among 300k items, etc.), we could drop them and not scrape them.**
In fact, you can see them by simply looking at the home page of stacker.news and filtering for 'job'.

In [7]:
# Smoke-test `item.detect_item_type` on a handful of known item ids: download
# each item page, parse it, and print the id next to the detected type.
# `requests` applies no timeout by default, so without one a single stalled
# connection would block this cell forever.
REQUEST_TIMEOUT_S = 30  # seconds, applied to the connect and to each read

for n in [277155, 127070, 277394, 278874, 277840, 239180, 235708, 260050]:
    url_posts = f'https://stacker.news/items/{n}'
    response = requests.get(url_posts, timeout=REQUEST_TIMEOUT_S)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(n, item.detect_item_type(n, soup))


1 link
2 discussion
3 comment
4 comment
5 comment
6 comment
7 comment
8 link
9 comment
10 comment
11 comment
12 comment
13 comment
14 comment
15 comment
16 comment
17 comment
18 comment
19 comment
20 comment
21 link
22 comment
23 comment
24 comment
25 comment
26 link
27 comment
28 comment
29 comment
30 comment
31 link
32 comment
33 comment
34 link
35 link
36 link
37 comment
38 link
39 link
40 link
41 link
42 link
43 comment
44 comment
45 link
46 comment
47 link
48 comment
49 link
50 comment
51 link
52 link
53 link
54 link
55 comment
56 link
57 link
58 link
59 link
60 link
61 link
62 link
63 link
64 comment
65 comment
66 link
67 comment
68 comment
69 comment
70 link
71 comment
72 link
73 comment
74 comment
75 link
76 link
77 link
78 link
79 comment
80 link
81 link
82 link
83 link
84 link
85 link
86 link
87 comment
88 link
89 link
90 link
91 link
92 link
93 comment
94 link
95 comment
96 comment
97 comment
98 comment
99 link
100 discussion
101 comment
102 comment
103 comment
104 comment
1

# Saving data in SQLite

In [None]:
# Connection creation and database creation.
# `conn` and `cur` are intentionally left open here: the following cells keep
# using them, and closing the connection in this cell makes every later
# `cur.execute(...)` fail with "Cannot operate on a closed database".
# Call `conn.close()` once the SQLite experiments are finished.
conn = sqlite3.connect('../data/stacker_news.sqlite')

cur = conn.cursor()
# IF NOT EXISTS keeps the cell re-runnable: the database file persists on disk,
# so a plain CREATE TABLE raises "table experiments already exists" the second time.
cur.execute('CREATE TABLE IF NOT EXISTS experiments (name VARCHAR, description VARCHAR)')
conn.commit()

In [8]:
# Add one row to the table. The values are bound through the "?" placeholders
# instead of being formatted into the SQL text, so the embedded double quote
# needs no escaping.
insert_sql = 'INSERT INTO experiments (name, description) VALUES (?, ?)'
new_row = ('Another User', 'Another Experiment, even using " other characters"')
cur.execute(insert_sql, new_row)
conn.commit()

In [10]:
# Read back every row of the table; the bare `data` on the last line makes the
# notebook display the result.
data = cur.execute('SELECT * FROM experiments').fetchall()
data

[('Another User', 'Another Experiment, even using " other characters"')]

In [12]:
# Add constraints to datatypes: rebuild the table with an integer primary key
# and seed it with two rows. `id` is left out of the INSERTs and is assigned by
# SQLite (the first row comes back with id 1).
# String literals are single-quoted, as standard SQL requires. Double quotes
# denote identifiers; SQLite only accepts them as strings through a legacy
# fallback that can be disabled at build time.
sql_command = """
DROP TABLE IF EXISTS experiments;
CREATE TABLE experiments (
    id INTEGER,
    name VARCHAR,
    description VARCHAR,
    PRIMARY KEY (id));
INSERT INTO experiments (name, description) VALUES ('Aquiles', 'My experiment description');
INSERT INTO experiments (name, description) VALUES ('Aquiles 2', 'My experiment description 2');
"""
cur.executescript(sql_command)
conn.commit()

In [14]:
# Fetch only the row whose id is 1 and display it.
first_row_query = 'SELECT * FROM experiments WHERE id=1'
cur.execute(first_row_query)
data = cur.fetchone()
data

(1, 'Aquiles', 'My experiment description')

# Scraping user profiles
User profiles are scraped starting from the list of users extracted by scraping all the items (posts + comments).
The link to get the user profile is `https://stacker.news/$username$` 

In [None]:
# Usernames fed to the profile scraper. Besides real-looking accounts the list
# holds a number, None and names containing spaces -- presumably on purpose, to
# check how the scraper copes with invalid entries (TODO confirm).
user_list = [
    'Monotone', 'TNStacker', 'kale', 'DiracDelta', 'kr',
    'moscowTimeBot', 'mpuels', 'blockstream_official', 'nym',
    0,
    'random_', 'saloon',
    "k00b",
    "utente che non esiste per niente",
    "DarthCoin", "Wumbo", "mf", "NoStranger", "anipy",
    "OneOneSeven", "Bitman", "nemo", "sahil",
    "prova_di_nullo",
    "babababa nullo",
    None,
]

user_list2 = ["k00b", "DarthCoin", "saloon"]

# Time the whole run and report both the total and the per-entry average.
start = time.time()
user.save_profile_csv(user_list)
end = time.time()

elapsed = end - start
n_entries = len(user_list)
print(
    "The provided entries are ", n_entries,
    "\nThe average time of execution of above program for every entry is :", elapsed / n_entries,
    "\nThe total time of execution is ", elapsed,
)

## Modularize the code - user profile scraping
Define functions to modularize and simplify the user profile scraping.
The functions are defined in `user_modules/scraping_user.py` and have been tested with `tests/test_scraping_user.py`. To run the tests, open a terminal/CMD and run the script `tests/test_scraping_user.py` from there; running it from a Jupyter notebook could raise errors.
The testing script tested every function in some corner cases (missing data/request error). 

**From the following two approaches we can create the final script for scraping the user profiles**.
The next step would be the creation of a function that loops through the user list and assigns to the rows in the dataframe the values returned from the functions defined in `user_modules/scraping_user.py`.

In [None]:
# TODO: integrate this method into the general scraping script for profile scraping

# Approach 1: keep the usernames in a NumPy array and walk it with np.nditer,
# printing whatever `user.get_profile` returns for each entry.
user_list = np.array([
    'Monotone', 'TNStacker', 'kale', 'DiracDelta', 'kr',
    'moscowTimeBot', 'mpuels', 'blockstream_official', 'nym', 'random_',
])

start = time.time()
for username in np.nditer(user_list):
    print(user.get_profile(username))
end = time.time()

# Average wall-clock time per scraped user.
seconds_per_user = (end - start) / len(user_list)
print("The average time of execution of the above program for one user is :",
      seconds_per_user, "s")

In [None]:
# Approach 2: the same usernames in a plain Python list, iterated directly.
user_list = [
    'Monotone', 'TNStacker', 'kale', 'DiracDelta', 'kr',
    'moscowTimeBot', 'mpuels', 'blockstream_official', 'nym', 'random_',
]

start = time.time()
for username in user_list:
    profile = user.get_profile(username)
    print(profile)
end = time.time()

# Average wall-clock time per scraped user.
seconds_per_user = (end - start) / len(user_list)
print("The average time of execution of the above program for one user is :",
      seconds_per_user, "s")