The purpose of this script is to compare darkweb market prices to street market prices, and to use this to estimate whether a vendor is selling wholesale or retail.

First, get the purity and quantity for each product review. Then standardize so that each listing is the cost per unit.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

from datetime import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

## Import data

In [2]:
file = open('drug_df', 'rb')
drug_df = pickle.load(file)

# Purity / Quantity extraction

In [3]:
import re

In [4]:
# Metrics to check for 
metrics = ['µg','¬µg','kg','kilo','pound','ounce','ug','¬ug','oz','lb','mcg','mg','g']
str_pattern = '' # initialize regex pattern
for m in metrics: # iteratively build regex pattern
    str_pattern += '[\d/.]+\s?' + m + '|'
#     str_pattern += '\d+\.?/?\d+\s?' + m + '|'
#     str_pattern += '\d*[\.?][\d*][\s?]' + m + '|' '\d+[/?][\d+][\s?]' + m + '|'
str_pattern = str_pattern[:-1] # trim last character
new_pat = re.compile(str_pattern, re.IGNORECASE) #compile the pattern

# Special characters to be removed
special_char =['`','\'','=','-','~','!','@','#','$','^','&','*','_','+','[',']','{','}',';','\\',':','|','<','<','>','?','//']

# Function to remove special characters
def remove_special_char(string):
    for char in special_char:
        string = string.replace(char,' ')
    string = string.replace(',','.')
    return(string)

# Define function to apply the pattern
def getMass(my_string):
    s = remove_special_char(my_string)
    match = new_pat.search(s)
    if match is not None:
        return(match.group().lower())
    else:
        return(match)
    
    
# Now try to find the count of each product description
# string = 'there are 30 TaBlets in the package'
countMetrics = ['pills','tabs','tablets','trips','amps','x','st','pc']
count_pattern = '' # initialize regex pattern
for m in countMetrics: # iteratively build regex pattern
    count_pattern += '\d+\s?' + m + '|'
count_pattern = count_pattern + 'x\s?\d+|\(\d+\)' 
count_pattern = re.compile(count_pattern, re.IGNORECASE)
number_pattern = re.compile('\d+',re.IGNORECASE) # pattern to extract the number from count
# Define function to apply the pattern
def getCount(my_string):
    s = remove_special_char(my_string)
    match = count_pattern.search(s)
    if match is not None:
        count = match.group().lower()
        return(float(number_pattern.search(count).group()))

Create `conversion` dictionary, containing conversion from every metric to grams

In [5]:
metrics = ['µg','¬µg','kg','kilo','pound','ounce','ug','¬ug','oz','lb','mcg','mg','g']
ug_metrics = ['µg','¬µg','ug','¬ug','mcg']
ug_rate = 1/1000000
mg_metrics = ['mg']
mg_rate = 1/1000
kg_metrics = ['kg','kilo']
kg_rate = 1000
oz_metrics = ['ounce','oz']
oz_rate = 28.35
lb_metrics = ['lb','pound']
lb_rate = 453.592

# Create conversion dictionary for each metric
ug_conversion_dict = {ug_metric:ug_rate for ug_metric in ug_metrics}
mg_conversion_dict = {mg_metric:mg_rate for mg_metric in mg_metrics}
kg_conversion_dict = {kg_metric:kg_rate for kg_metric in kg_metrics}
oz_conversion_dict = {oz_metric:oz_rate for oz_metric in oz_metrics}
lb_conversion_dict = {lb_metric:lb_rate for lb_metric in lb_metrics}

# Create 'master' conversion dictionary
conversion_dict = dict()
conversion_dict.update(ug_conversion_dict)
conversion_dict.update(mg_conversion_dict)
conversion_dict.update(kg_conversion_dict)
conversion_dict.update(oz_conversion_dict)
conversion_dict.update(lb_conversion_dict)
conversion_dict.update({'g':1})

In [6]:
# Custom function to convert string numbers to floats
# Deals with fractions, decimals, and whole numbers
def to_float(string):
    match = re.search(re.compile('/'), string)
    if match is not None:
        frac_split = string.split('/')
        dec = float(frac_split[0])/float(frac_split[1])
        return(dec) 
    else:
        return(float(string))

# Function to normalize the mass of drugs
metrics = ['µg','¬µg','kg','kilo','pound','ounce','ug','¬ug','oz','lb','mcg','mg','g']
quant = '2.5mg'
letter_pattern = re.compile('[^0-9./\s]+')
quant_pattern = re.compile('[.]?[0-9/]+[.]?[0-9]*')

# Function to convert product's mass to mass in grams
def normalize_mass(mass_string):
    # Args: string respresenting a product's mass (string contains unit)
#     Returns: float representing the product's mass in grams
    if mass_string is not None: # Check that the string exists
        if mass_string[0] != '/': # Valid mass measurements do not have '/' as first character
            metric_string = letter_pattern.search(mass_string) # search for the metric in mass_string
            if metric_string is not None: # check that mass_string contains a metric
                metric_string = metric_string.group() # get the metric
                rate = conversion_dict[metric_string] # get the rate to convert from given metric to grams
                quant_string = quant_pattern.search(mass_string) # search for the number from the mass string
                if quant_string is not None: # check that the number exists
                    quant_string = quant_string.group() # get the number
                    first_last = [quant_string[0], quant_string[-1]] # Get the first and last character
                    # Ensure that '/' is not the first or last character
                    if ('/' not in first_last) and (re.compile('\d').search(quant_string) is not None):
                        quant = to_float(quant_string) # convert quantity to float
                        return(quant * rate) # convert quantity to mass in grams

In [7]:
l = pd.DataFrame({'title':pd.unique(drug_df['product_description'])})
l['unit_mass'] = l['title'].apply(getMass)
l['unit_mass_grams'] = l['unit_mass'].apply(normalize_mass)
l['count'] = l['title'].apply(getCount)
l['total_mass'] = l['unit_mass_grams'] * l['count']
for idx, row in l.iterrows():
    if math.isnan(row['total_mass']) and not(math.isnan(row['unit_mass_grams'])):
        l.loc[idx,'total_mass'] = row['unit_mass_grams']
l.to_csv('l.csv')

##  Extract price information

In [8]:
from tqdm import tqdm

drug_df['unit_mass'] = drug_df['product_description'].apply(getMass)
drug_df['unit_mass_grams'] = drug_df['unit_mass'].apply(normalize_mass)
drug_df['count'] = drug_df['product_description'].apply(getCount)
drug_df['total_mass'] = drug_df['unit_mass_grams'] * drug_df['count']

for idx, row in tqdm(drug_df.iterrows()):
    if math.isnan(row['total_mass']) and not(math.isnan(row['unit_mass_grams'])):
        drug_df.loc[idx,'total_mass'] = row['unit_mass_grams']
        
# Now calculate $ per gram for each listing
drug_df['price_per_gram'] = drug_df['USD'] / drug_df['total_mass']

68378it [00:46, 1473.27it/s]


In [9]:
import pickle
file = open('drug_df_prices','wb')
pickle.dump(drug_df, file)
file.close()

In [11]:
drug_df.head()

Unnamed: 0,transaction_id,vendor_id,vendor_name,bitcoin_amt,USD,product_description,date,drug_prediction,unit_mass,unit_mass_grams,count,total_mass,price_per_gram
9430,'10528,590,superclr,-1.0,416.0,5 GRAMS GRADE AAA+ CARTEL TAN POWDER HEROIN,2018-11-02 06:43:01,heroin,5 g,5.0,,5.0,83.2
9431,'10529,590,superclr,-1.0,416.0,5 GRAMS GRADE AAA+ CARTEL TAN POWDER HEROIN,2018-11-10 06:43:01,heroin,5 g,5.0,,5.0,83.2
9434,'10532,92,OnionKings,-1.0,109.119,Rivotril 2mg från apotek (Roche) [100st],1969-12-31 19:00:00,benzodiazepines,2mg,0.002,100.0,0.2,545.595
9435,'10533,92,OnionKings,-1.0,109.119,Rivotril 2mg från apotek (Roche) [100st],2018-12-21 06:43:11,benzodiazepines,2mg,0.002,100.0,0.2,545.595
9436,'10534,92,OnionKings,-1.0,109.119,Rivotril 2mg från apotek (Roche) [100st],1969-12-31 19:00:00,benzodiazepines,2mg,0.002,100.0,0.2,545.595
