The purpose of this script is to compare darkweb market prices to street market prices, and to use that comparison to estimate whether a vendor is selling wholesale or retail.

First, get the purity and quantity for each product review. Then standardize each listing so that its price is expressed as a cost per unit.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

from datetime import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

## Import data

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'drug_df' when it comes from a trusted source.
# The context manager closes the handle as soon as the frame is loaded
# (the original left the file open for the lifetime of the session).
with open('drug_df', 'rb') as file:
    drug_df = pickle.load(file)

## Extract amount information
First, extract the quantity of each listing.

In [3]:
# Preview the first rows of the loaded frame (notebook display only; the
# result is not stored).
drug_df.head()

Unnamed: 0,transaction_id,vendor_id,vendor_name,product_description,date,drug_prediction
9430,'10528,590,superclr,5 GRAMS GRADE AAA+ CARTEL TAN POWDER HEROIN,2018-11-02 06:43:01,heroin
9431,'10529,590,superclr,5 GRAMS GRADE AAA+ CARTEL TAN POWDER HEROIN,2018-11-10 06:43:01,heroin
9435,'10533,92,OnionKings,Rivotril 2mg från apotek (Roche) [100st],2018-12-21 06:43:11,benzodiazepines
9437,'10535,92,OnionKings,Rivotril 2mg från apotek (Roche) [100st],2018-12-22 06:43:11,benzodiazepines
9439,'10537,92,OnionKings,Rivotril 2mg från apotek (Roche) [100st],2018-12-01 06:43:11,benzodiazepines


# Purity / Quantity extraction

In [4]:
import re

In [5]:
# def tokenize(string):
#     return remove_special_char(string).split()

In [6]:
import re

# Character sequences that are blanked out of a title before it is split
special_char =['`','\'','=','-','~','!','@','#','$','^','&','*','(',')','_','+','[',']','{','}',';','\\',':','|','<','<','>','?','//']

# Single-character entries are handled in one pass through a translation table
_single_char_table = str.maketrans({entry: ' ' for entry in special_char if len(entry) == 1})
# Longer entries (currently only '//') still need a substring replacement
_multi_char_entries = [entry for entry in special_char if len(entry) > 1]


def remove_special_char(string):
    """Return `string` with every entry of `special_char` replaced by a space.

    A single '/' is left untouched; only the two-character sequence '//'
    is blanked out.
    """
    cleaned = string.translate(_single_char_table)
    for entry in _multi_char_entries:
        cleaned = cleaned.replace(entry, ' ')
    return cleaned

# Tokenize the title (including measurements)
def tokenize(string):
    """Split a title into lowercase tokens, separating numbers from units.

    The title is lowercased, passed through `remove_special_char` and split
    on whitespace. A word that starts with a run of digits, commas or periods
    immediately followed by a run of letters, commas or '%' (e.g. '2mg',
    '99%') is replaced by those two pieces; any text after that matched
    prefix is dropped. Every other word is kept unchanged.

    Returns the list of resulting tokens.
    """
    measurement = r"([0-9,.]+)([a-z,%]+)"
    tokens = []
    for word in remove_special_char(string.lower()).split():
        found = re.match(measurement, word, re.I)
        if found is None:
            tokens.append(word)  # not a measurement: keep the word as-is
        else:
            tokens.extend(found.groups())  # number part, then unit part
    return tokens

# Get purity
purity_metrics = ['%','percent']
def getPurity(tokenized_title):
    """Return the purity expression of a tokenized title, or '' if none.

    Whenever a token is one of `purity_metrics`, the token just before it is
    assumed to be the number and the two are concatenated (e.g. '99' + '%').
    If several purity markers occur, the last one wins. A marker in first
    position has nothing before it, so it is returned on its own.
    """
    tokens = list(tokenized_title)
    purity = ''
    for position, token in enumerate(tokens):
        if token not in purity_metrics:
            continue
        preceding = tokens[position - 1] if position > 0 else ''
        purity = preceding + token
    return purity

# Get quantity
quantity_metrics = ['kg','g','g.','mg','mcg','ug','oz','lb', 'gram', 'gr','gr.', 'grams','pound','kilogram','ml']
def getQuantity(tokenized_title):
    """Return the quantity expression of a tokenized title, or '' if none.

    The title is scanned from the end for the last token that is one of
    `quantity_metrics`; the token before it is assumed to be the number and
    is prepended to the unit (e.g. '5' + 'grams'). A unit in first position
    is returned on its own.
    """
    tokens = list(tokenized_title)
    for position in range(len(tokens) - 1, -1, -1):
        unit = tokens[position]
        if unit in quantity_metrics:
            number = tokens[position - 1] if position > 0 else ''
            return number + unit
    return ''

In [7]:
# Tokenize every product description, then derive one column per extractor
# from the stored token lists.
drug_df['tokenized_titles'] = drug_df['product_description'].apply(tokenize)
for column_name, extractor in (('quantity', getQuantity), ('purity', getPurity)):
    drug_df[column_name] = drug_df['tokenized_titles'].apply(extractor)

In [8]:
# Write the current drug_df to 'test_file.csv' (presumably a scratch file for
# checking the extraction results by eye — confirm before relying on it).
drug_df.to_csv('test_file.csv')

In [9]:
def parse_titles(title):
    """Return the non-whitespace token texts of the lowercased title.

    NOTE(review): `parser` is not defined in the visible part of this file,
    so calling this raises NameError unless it is created elsewhere. The
    `orth_` attribute suggests a spaCy pipeline — confirm.
    """
    return [
        piece.orth_
        for piece in parser(title.lower())
        if not piece.orth_.isspace()
    ]

import re

metrics = ['kg ','g ','mg ','ug ','oz ','lb ', 'gram ', 'gr ', 'grams ','mcg ','tabs']

def parse_titles2(title):
    """Return the text around the first occurrence of each metric in `title`.

    For every entry of `metrics` found in `title`, the returned list holds
    the metric itself preceded by up to six characters of context (where the
    number is expected to be). Metrics that do not occur contribute nothing.

    The start of the slice is clamped at 0: without the clamp, a metric found
    within the first six characters produced a negative start index, which
    wraps around to the end of the string and yields an empty or wrong slice.
    The leftover debugging print of the metric length has been removed.
    """
    context_chars = 6
    quantity = []
    for metric in metrics:
        match_index = title.find(metric)
        if match_index == -1:
            continue
        start = max(match_index - context_chars, 0)
        quantity.append(title[start:match_index + len(metric)])
    return quantity


def parse_titles3(title):
    """Return the first '%' in `title` with up to two characters before it.

    Returns '' when the title contains no '%'. The start of the slice is
    clamped at 0: without the clamp, a '%' at index 0 or 1 produced a
    negative start index, which wraps around and yields an empty string
    instead of e.g. '5%'.
    """
    percent_index = title.find('%')
    if percent_index == -1:
        return ''
    return title[max(percent_index - 2, 0):percent_index + 1]

In [10]:
# Metrics to check for
metrics = ['µg','¬µg','kg','kilo','pound','ounce','ug','¬ug','oz','lb','mcg','mg','g']
# One alternative per metric: a run of digits, '/' or '.', an optional
# whitespace character, then the metric itself. The prefix is a raw string so
# that \d and \s reach the regex engine without relying on Python tolerating
# invalid string escapes (which emits warnings on current versions).
str_pattern = '|'.join(r'[\d/.]+\s?' + m for m in metrics)
new_pat = re.compile(str_pattern, re.IGNORECASE) # compile the pattern

# Special characters to be removed
special_char =['`','\'','=','-','~','!','@','#','$','^','&','*','(',')','_','+','[',']','{','}',';','\\',':','|','<','<','>','?','//']

# Function to remove special characters
def remove_special_char(string):
    """Blank out special characters and turn decimal commas into periods.

    Every entry of `special_char` is replaced by a space, in list order, and
    then each ',' becomes '.' so that '2,5mg' reads as '2.5mg'. This replaces
    the earlier `remove_special_char` definition in this file, which did not
    convert commas.
    """
    substitutions = [(entry, ' ') for entry in special_char]
    substitutions.append((',', '.'))
    for old, new in substitutions:
        string = string.replace(old, new)
    return string

# Define function to apply the pattern
def getMass(my_string):
    """Return the first mass expression in `my_string`, lowercased.

    The string is cleaned with `remove_special_char` and searched with
    `new_pat`; None is returned when nothing matches.
    """
    found = new_pat.search(remove_special_char(my_string))
    return None if found is None else found.group().lower()
    
    
# Now try to find the count of each product description
# string = 'there are 30 TaBlets in the package'
countMetrics = ['pills','tabs','tablets','x','st','pc']
# "<number><optional whitespace><unit>" for each count unit, plus the reversed
# "x<optional whitespace><number>" form. Raw strings keep \d and \s intact
# without relying on Python tolerating invalid string escapes.
count_alternatives = [r'\d+\s?' + m for m in countMetrics]
count_alternatives.append(r'x\s?\d+')
count_pattern = re.compile('|'.join(count_alternatives), re.IGNORECASE)
number_pattern = re.compile(r'\d+', re.IGNORECASE) # pattern to extract the number from count
# Define function to apply the pattern
def getCount(my_string):
    """Return the item count found in `my_string` as a float.

    The string is cleaned with `remove_special_char` and searched with
    `count_pattern`; the first run of digits inside the matched text is
    converted to float. Returns None when no count expression is found.
    """
    found = count_pattern.search(remove_special_char(my_string))
    if found is None:
        return None
    digits = number_pattern.search(found.group().lower())
    return float(digits.group())

Create the `conversion_dict` dictionary, which maps every unit of mass to its conversion factor into grams.

In [11]:
metrics = ['µg','¬µg','kg','kilo','pound','ounce','ug','¬ug','oz','lb','mcg','mg','g']

# Unit spellings grouped by magnitude, each with its grams-per-unit factor
ug_metrics = ['µg','¬µg','ug','¬ug','mcg']
ug_rate = 1/1000000
mg_metrics = ['mg']
mg_rate = 1/1000
kg_metrics = ['kg','kilo']
kg_rate = 1000
oz_metrics = ['ounce','oz']
oz_rate = 28.35
lb_metrics = ['lb','pound']
lb_rate = 453.592

# Per-group lookup tables: every spelling in a group shares the group's rate
ug_conversion_dict = dict.fromkeys(ug_metrics, ug_rate)
mg_conversion_dict = dict.fromkeys(mg_metrics, mg_rate)
kg_conversion_dict = dict.fromkeys(kg_metrics, kg_rate)
oz_conversion_dict = dict.fromkeys(oz_metrics, oz_rate)
lb_conversion_dict = dict.fromkeys(lb_metrics, lb_rate)

# 'Master' table mapping each unit spelling to grams per unit; grams map to 1
conversion_dict = {
    **ug_conversion_dict,
    **mg_conversion_dict,
    **kg_conversion_dict,
    **oz_conversion_dict,
    **lb_conversion_dict,
    'g': 1,
}

In [12]:
def to_float(string):
    """Convert a decimal ('2.5') or fraction ('1/2') string to a float.

    For a string containing '/', the result is the first '/'-separated part
    divided by the second; any further parts are ignored.
    """
    if '/' not in string:
        return float(string)
    numerator, denominator = string.split('/')[:2]
    return float(numerator) / float(denominator)

# Function to normalize the mass of drugs
metrics = ['µg','¬µg','kg','kilo','pound','ounce','ug','¬ug','oz','lb','mcg','mg','g']
quant = '2.5mg'  # sample input kept from the original cell; not used below
# Raw strings keep \s and \d intact without relying on Python tolerating
# invalid string escapes.
letter_pattern = re.compile(r'[^0-9./\s]+')  # first run that is not a digit, '.', '/' or whitespace
quant_pattern = re.compile(r'[.]?[0-9/]+[.]?[0-9]*')  # decimal or fraction
digit_pattern = re.compile(r'\d')  # compiled once instead of on every call

def normalize_mass(mass_string):
    """Convert a mass expression such as '2.5mg' or '1/2 oz' to grams.

    Parameters:
        mass_string: text holding a quantity and a unit; None is accepted.

    Returns:
        The mass in grams as a float (quantity times the unit's entry in
        `conversion_dict`), or None when the input cannot be normalized:
        it is None, not a string, empty, starts with '/', has no unit, has a
        unit missing from `conversion_dict`, has no usable quantity, or has a
        quantity that cannot be evaluated (e.g. './5' or a division by zero).

    The original raised IndexError on '', TypeError on non-string input such
    as NaN, KeyError on unknown units, and ValueError/ZeroDivisionError on
    malformed quantities; all of these now return None like the other
    "cannot normalize" cases.
    """
    if not isinstance(mass_string, str) or not mass_string:
        return None
    if mass_string[0] == '/':
        return None

    unit_match = letter_pattern.search(mass_string)
    if unit_match is None:
        return None
    rate = conversion_dict.get(unit_match.group())
    if rate is None:
        return None

    quant_match = quant_pattern.search(mass_string)
    if quant_match is None:
        return None
    quant_string = quant_match.group()
    # A fraction needs a number on both sides of the slash
    if quant_string[0] == '/' or quant_string[-1] == '/':
        return None
    if digit_pattern.search(quant_string) is None:
        return None

    try:
        return to_float(quant_string) * rate
    except (ValueError, ZeroDivisionError):
        return None

In [13]:
# Parse each distinct product description once
l = pd.DataFrame({'title':pd.unique(drug_df['product_description'])})
l['unit_mass'] = l['title'].apply(getMass)
l['unit_mass_grams'] = l['unit_mass'].apply(normalize_mass)
l['count'] = l['title'].apply(getCount)
# Total mass is unit mass times item count. Where that product is missing
# (no count was found) fall back to the unit mass; rows without a unit mass
# stay NaN. fillna does this in one vectorized step instead of assigning
# row by row inside an iterrows loop.
l['total_mass'] = (l['unit_mass_grams'] * l['count']).fillna(l['unit_mass_grams'])
l.to_csv('l.csv')