In [181]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import json
from pymongo import MongoClient

In [182]:
# Pool of browser User-Agent strings; one is picked with random.choice() for
# each request. Entries must be distinct: a repeated string does not add
# variety, it only makes random.choice() favour that string.
# NOTE(review): every entry ends in 'Safari/537.3' while the AppleWebKit token
# is '537.36' - the trailing token looks truncated; confirm before changing.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
]

In [None]:
# Download the substance index page and build `psych_links`, the list of
# absolute URLs of every wiki article linked from the index.

# Select a random user agent
random_user_agent = random.choice(user_agents)

# Assign user agent to header
header = {'User-Agent': random_user_agent}

url = 'https://psychonautwiki.org/wiki/Psychoactive_substance_index'

# The timeout (seconds) stops the cell from hanging forever on a stalled
# connection; raise_for_status() fails loudly on an HTTP error instead of
# silently parsing an error page into an empty link list.
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

# Extract substance links: every <li> inside a 'panel radius' div that
# contains a nested <ul> contributes that <ul>.
item_divs = soup.find_all('div', {'class': 'panel radius'})
item_list = [item_div.find_all('li') for item_div in item_divs]
all_ul_elements = [
    [ul for ul in (li.find('ul') for li in item) if ul]
    for item in item_list
]

# Flatten the list of 'ul' elements
flat_ul_elements = [ul for sublist in all_ul_elements for ul in sublist if ul]

# For each 'ul', extract all 'href' attributes from 'a' tags.
# href=True skips anchors without an href, which would otherwise raise KeyError.
href_list = [a['href'] for ul in flat_ul_elements for a in ul.find_all('a', href=True)]
filtered_href = [href for href in href_list if href.startswith("/wiki")]

# Nested lists are visited through both the outer and the inner <ul>, so the
# same href can appear several times. dict.fromkeys() removes the repeats while
# keeping first-seen order, so no page is downloaded twice later on.
unique_href = list(dict.fromkeys(filtered_href))

# Construct full links
psych_links = [f"https://psychonautwiki.org{href}" for href in unique_href]

# Show how many substance pages were found
len(psych_links)

In [None]:
# Visit every substance page in `psych_links`, follow its summary-sheet link,
# and build one record per substance. Records with any missing field are
# dropped and the remainder is sorted by substance name.

BASE_URL = "https://psychonautwiki.org"
REQUEST_TIMEOUT = 30  # seconds; without it one stalled connection blocks the whole crawl


def _fetch_soup(page_url, headers):
    """Download `page_url` and return it parsed with the lxml parser.

    Raises requests.RequestException on connection errors, timeouts and HTTP
    error statuses, so the caller can skip a single bad page.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _text_after_colon(text):
    """Return the stripped text after the first ':' in `text`, or None if there is no ':'."""
    _, colon, value = text.partition(':')
    return value.strip() if colon else None


def _parse_duration(feature_item):
    """Convert the nested <li> entries of `feature_item` into a list of {label: value} dicts.

    The label is the text before the first ':' and the value is everything
    after it (not stripped). An entry without ':' maps its whole text to None.
    Returns None when there are no non-empty entries.
    """
    entries = [li.text for li in feature_item.find_all('li') if li.text]
    if not entries:
        return None
    durations = []
    for entry in entries:
        label, colon, value = entry.partition(':')
        durations.append({label: value if colon else None})
    return durations


def _comma_joined_text(feature_item):
    """Return the item's text with newlines turned into commas and outer commas removed.

    Blank lines inside the text become empty segments (consecutive commas).
    """
    return ','.join(feature_item.text.split('\n')).strip(',')


def _parse_summary(summary_soup):
    """Build the record dictionary for one substance from its parsed summary sheet.

    Any field that cannot be found on the page is set to None (the substance
    name falls back to "Unknown" when the page has no <title>).
    """
    # Substance name: the part of the page title before the first '/'
    title_tag = summary_soup.find('title')
    substance = title_tag.text.split('/')[0] if title_tag else "Unknown"

    # Classification panels; NOTE(review): assumes the first panel is the
    # psychoactive class and the second the chemical class - confirm against the page.
    divs = summary_soup.find_all('div', {'class': 'flex-column header-only summary-panel'})
    psych_class = _text_after_colon(divs[0].text) if len(divs) > 0 else None
    chemical_class = _text_after_colon(divs[1].text) if len(divs) > 1 else None

    # Featured list items are addressed by position: 1 = duration, 2 = safety,
    # 3 = tolerance. Missing positions leave the corresponding field as None.
    feature_lists = summary_soup.find_all('li', {'class': 'featured list-item'})
    duration_values = _parse_duration(feature_lists[1]) if len(feature_lists) > 1 else None
    safety = _comma_joined_text(feature_lists[2]) if len(feature_lists) > 2 else None
    tolerance = _comma_joined_text(feature_lists[3]) if len(feature_lists) > 3 else None

    # Subjective effects. A page without featured lists has no duration and is
    # removed by the completeness filter below, so reading effects here without
    # a guard does not change the final result.
    effect_items = summary_soup.find_all('li', {'class': 'smw-row'})
    effects = [item.text for item in effect_items] if effect_items else None

    return {
        "Substance": substance,
        "Psychoactive Class": psych_class,
        "Chemical class": chemical_class,
        "Duration": duration_values,
        "Safety Profile": safety,
        "Tolerance": tolerance,
        "Subjective Effects": effects
    }


# Initialize empty list to store all substance dictionaries
all_substances = []
# Pages that could not be downloaded, kept so they can be inspected or retried
failed_urls = []

# Iterate through each substance link
for substance_url in psych_links:

    # Select a random user agent and use it for both requests of this substance
    random_user_agent = random.choice(user_agents)
    header = {'User-Agent': random_user_agent}

    try:
        page_soup = _fetch_soup(substance_url, header)

        # The summary sheet is taken to be the first link inside the first
        # <tbody> of the page; pages without such a link are skipped.
        tbody = page_soup.find('tbody')
        first_anchor = tbody.find('a') if tbody else None
        summary_link = first_anchor.get('href') if first_anchor else None
        if not summary_link:
            continue

        # presumably a site-relative path; verify if absolute links can occur
        summary_soup = _fetch_soup(f"{BASE_URL}{summary_link}", header)
    except requests.RequestException as error:
        # One unreachable page must not abort the crawl and discard every
        # record collected so far: report it and move on.
        failed_urls.append(substance_url)
        print(f"Skipping {substance_url}: {error}")
        continue

    # Append each dictionary to all substances list
    all_substances.append(_parse_summary(summary_soup))

# Filter out dictionaries with None values for any key
all_substances = [
    record for record in all_substances
    if all(value is not None for value in record.values())
]

# Sort substances by the 'Substance' key
sorted_substances = sorted(all_substances, key=lambda record: record['Substance'])
len(sorted_substances)

In [None]:
# Optional export step (disabled): uncomment the code below to store
# `sorted_substances` in a MongoDB collection. The connection string, database
# name and collection name are placeholders and must be replaced first.
# NOTE(review): avoid committing a real connection string in the notebook;
# consider reading it from an environment variable instead.

# import json
# from pymongo import MongoClient


# # Convert the sorted list to JSON format
# json_data = json.dumps(sorted_substances)

# # Connect to MongoDB (Update 'your_connection_string' with your actual connection string)
# client = MongoClient('your_connection_string')

# # Select the database and collection
# db = client['your_database_name']
# collection = db['your_collection_name']

# # Parse the JSON back into a list of dictionaries and insert them one by one
# for substance in json.loads(json_data):
#     collection.insert_one(substance)

# # Close the connection
# client.close()