In [1]:
from bs4 import BeautifulSoup
import requests as requests
import time
import numpy as np
from scripts import user, item, discussion, link
from tqdm import tqdm 
import sqlite3

# Item extraction
Main steps in the code
1. Retrieve item webpage provided the item code
2. Detect item type:
    - Comment or post ?
    - If post, which kind of post:
        1. Discussion
        2. Link
        3. Poll
        4. Bounty
        5. Job
3. Retrieve title
4. Retrieve banner
    - Extract the number of comments, **compulsory**
    - Extract stacked amount by the item, **if present**
    - Extract Boost value, **if present**
    - Extract username, **compulsory**
    - Extract timestamp, **compulsory**
    - Extract badge, **compulsory**
5. Extract amount stacked by comments, **compulsory**
6. Extract item code of comments **OR** extract user that commented

**Note that**:
- Some items have neither a stacked amount nor the possibility to receive sats. For example, the user @saloon created all the posts of this kind. Is he/she a bot? Is it an 'official bot' of the forum, so that it is not possible to give sats to it?


## Scraping comments

[Useful link to duplicate rows according to the values in a row. In our situation, we need to duplicate rows according to the usernames or according to the comment item number](https://saturncloud.io/blog/splitting-and-expanding-pandas-dataframes-based-on-column-values/)

## Testing the extraction of item type

**Since job offers do not store much data about interactions with users (very few comments, only a handful of job offers among 300k items, etc.), we could drop them and not scrape them.**
In fact, you can see them by simply looking at the home page of stacker.news and filtering for 'job'.

In [None]:
# Smoke-test item-type detection on a handful of known item ids.
# NOTE(review): presumably these ids were picked to cover the different item kinds — confirm.
for n in [277155, 127070, 277394, 278874, 277840, 239180, 235708, 260050]:
    url_posts = f'https://stacker.news/items/{n}'
    # Explicit timeout: requests waits indefinitely by default if the server stalls.
    response = requests.get(url_posts, timeout=10)
    # Fail loudly on HTTP errors instead of classifying an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    print(n, item.detect_item_type(n, soup))


# Saving data in SQLite

In [None]:
# Open (or create) the SQLite database file and make sure the demo table exists.
conn = sqlite3.connect('../data/stacker_news.sqlite')

cur = conn.cursor()
# IF NOT EXISTS keeps this cell re-runnable: the database file persists between
# kernel restarts, and a plain CREATE TABLE raises OperationalError the second time.
cur.execute('CREATE TABLE IF NOT EXISTS experiments (name VARCHAR, description VARCHAR)')
conn.commit()

# The connection is deliberately left open: the following cells reuse `conn` and
# `cur`, and would fail on a closed database. Call conn.close() once the last
# database cell has run.

In [None]:
# Insert one row; the values are bound through "?" placeholders, so the
# embedded double quotes in the description need no manual escaping.
insert_sql = 'INSERT INTO experiments (name, description) VALUES (?, ?)'
new_row = ('Another User', 'Another Experiment, even using " other characters"')
cur.execute(insert_sql, new_row)
conn.commit()

In [None]:
# Read back every row currently stored in the experiments table.
# Cursor.execute returns the cursor itself, so the fetch can be chained.
data = cur.execute('SELECT * FROM experiments').fetchall()
data

In [None]:
# Recreate the table with an explicit integer primary key and seed two rows.
# String literals use single quotes: in SQL, double quotes denote identifiers,
# and SQLite only accepts them as strings through a legacy compatibility quirk
# that can be disabled at build/run time.
sql_command = """
DROP TABLE IF EXISTS experiments;
CREATE TABLE experiments (
    id INTEGER,
    name VARCHAR,
    description VARCHAR,
    PRIMARY KEY (id));
INSERT INTO experiments (name, description) values ('Aquiles', 'My experiment description');
INSERT INTO experiments (name, description) values ('Aquiles 2', 'My experiment description 2');
"""
# executescript runs all the statements of the script in one call.
cur.executescript(sql_command)
conn.commit()

In [None]:
# Fetch the single row whose id is 1 (fetchone gives one row, or None if absent).
data = cur.execute('SELECT * FROM experiments WHERE id=1').fetchone()
data

# Testing functions for script development

In [17]:
# Check link detection on a single item page.
i = 280061
url_posts = f'https://stacker.news/items/{i}'
# Explicit timeout: requests waits indefinitely by default if the server stalls.
response = requests.get(url_posts, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

item.detect_item_link(soup)

True

In [18]:
# Fetch one item page and collect everything the `link` extractors return
# into a single dict for inspection.
i = 78569
url_posts = f'https://stacker.news/items/{i}'
# Explicit timeout: requests waits indefinitely by default if the server stalls.
response = requests.get(url_posts, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

res = {
    'title': link.extract_title(soup),
    'item': str(i),
    'banner': link.extract_banner(soup),
    'main link': link.extract_link(soup),
    'body links': link.extract_body_links(soup),
    'sats stacked by comments': link.extract_comment_stacked(soup),
    'comments item codes': link.extract_comment_item_code(soup),
}

res

{'title': 'BNB Chain governance to decide what will happen to hacked funds - 600M BNB',
 'item': '78569',
 'banner': {'sats': '0 sats',
  'boost': None,
  'comments': '3 comments',
  'author': 'takaponka',
  'tag': 'bitcoin',
  'timestamp': '2022-10-07 12:17:28'},
 'main link': None,
 'body links': ['https://www.theblock.co/post/175503/bnb-chain-governance-to-decide-what-will-happen-to-hacked-funds'],
 'sats stacked by comments': '0 sats',
 'comments item codes': [78571, 78773, 78593]}

# Scraping user profiles
User profiles are scraped starting from the list of users extracted by scraping all the items (posts + comments).
The link to get a user profile is `https://stacker.news/$username$`.

In [None]:
# Usernames fed to the CSV profile scraper.
# NOTE(review): the non-string entries (0, None) and the made-up usernames look
# like deliberate corner cases for user.save_profile_csv — confirm.
user_list = [
    "Monotone",
    "TNStacker",
    "kale",
    "DiracDelta",
    "kr",
    "moscowTimeBot",
    "mpuels",
    "blockstream_official",
    "nym",
    0,
    "random_",
    "saloon",
    "k00b",
    "utente che non esiste per niente",
    "DarthCoin",
    "Wumbo",
    "mf",
    "NoStranger",
    "anipy",
    "OneOneSeven",
    "Bitman",
    "nemo",
    "sahil",
    "prova_di_nullo",
    "babababa nullo",
    None,
]

# Shorter list; not used in this cell.
user_list2 = ["k00b", "DarthCoin", "saloon"]

# Time the whole scraping run with wall-clock timestamps.
t_begin = time.time()
user.save_profile_csv(user_list)
elapsed = time.time() - t_begin

n_entries = len(user_list)
print("The provided entries are ", n_entries,
      "\nThe average time of execution of above program for every entry is :", elapsed / n_entries,
      "\nThe total time of execution is ", elapsed)

## Modularize the code - user profile scraping
Define functions to modularize and simplify the user profile scraping.
The functions are defined in `user_modules/scraping_user.py` and have been tested with `tests/test_scraping_user.py`. To run the tests, open a terminal/CMD and run the script `tests/test_scraping_user.py` from there; running it from a Jupyter notebook could raise errors.
The testing script exercises every function on some corner cases (missing data/request error).

**From the following two approaches we can create the final script for scraping the user profiles**.
The next step would be the creation of a function that loops through the user list and assigns to the rows in the dataframe the values returned from the functions defined in `user_modules/scraping_user.py`.

In [None]:
# TODO: integrate this method into the general scraping script for profile scraping

# Usernames held in a NumPy array and walked with np.nditer.
user_list = np.array([
    "Monotone",
    "TNStacker",
    "kale",
    "DiracDelta",
    "kr",
    "moscowTimeBot",
    "mpuels",
    "blockstream_official",
    "nym",
    "random_",
])

t_begin = time.time()

# np.nditer yields zero-dimensional array elements rather than plain str objects.
for username in np.nditer(user_list):
    print(user.get_profile(username))

elapsed = time.time() - t_begin
print("The average time of execution of the above program for one user is :",
      elapsed / len(user_list), "s")

In [None]:
# Same usernames as a plain Python list, iterated directly.
user_list = [
    "Monotone",
    "TNStacker",
    "kale",
    "DiracDelta",
    "kr",
    "moscowTimeBot",
    "mpuels",
    "blockstream_official",
    "nym",
    "random_",
]

t_begin = time.time()

# Scrape and print each profile in turn.
for username in user_list:
    print(user.get_profile(username))

elapsed = time.time() - t_begin
print("The average time of execution of the above program for one user is :",
      elapsed / len(user_list), "s")