In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
import calendar
from dateutil.parser import parse
from matplotlib import rc

import math
import os
import copy
import pickle

import itertools
import operator

In [2]:
# Read in data (from pickle file)
file = open('drug_data','rb')
drug_data = pickle.load(file)

In [3]:
drug_data.tail()

Unnamed: 0,transaction_id,vendor_id,vendor_name,bitcoin,USD,product_description,date,drug_prediction,rating,feedback,market_id,buyer_name,drug_cats
119927,'121025,2031,SweetTreats,0,61.0,3 Piece - 61 USD - HOLIDAY SALE 17/EA | Distil...,2019-01-01 09:40:00,[marijuana],5.0,great carts will buy again!,2,S***0,marijuana
119928,'121026,2031,SweetTreats,0,78.0,4 Piece - 78 USD - HOLIDAY SALE 17/EA | Distil...,2018-12-31 12:22:00,[marijuana],5.0,great carts will buy again!,2,c***9,marijuana
119929,'121027,2031,SweetTreats,0,61.0,3 Piece - 61 USD - HOLIDAY SALE 17/EA | Distil...,2018-12-30 11:09:00,[marijuana],4.0,great carts will buy again!,2,c***9,marijuana
119930,'121028,2031,SweetTreats,0,112.0,6 Piece - 112 USD - HOLIDAY SALE 17/EA | Disti...,2018-12-29 08:53:00,[marijuana],5.0,"Great value, good quality, quick shipping -- t...",2,s***r,marijuana
119932,'121030,2306,majordan,0,-1.0,1 Piece - 240 EUR - 1 bar of hublot pollen hash,2018-12-28 04:38:00,[hashish],5.0,#####,2,b***w,hashish


## Get all the unique drugs for each vendor

In [4]:
#Get all the unique buyer names 
_ , unique_buyers = pd.factorize(drug_data['buyer_name'])

In [5]:
unique_buyers

Index(['P*****d', 'B*****u', 'z*****d', 'b*****4', 'K*****z', 'M*****h',
       'G*****4', 't*****1', 'w*****y', 'D*****5',
       ...
       'zrc5004', 'level', 'terrybones', 'hoho76', 'P***y', 'B***3', 'z***c',
       'j***g', 'T***o', 'g***3'],
      dtype='object', length=4400)

In [6]:
#Get dataframe with buyer names, market id, drug prediction
buyer_and_drug = copy.deepcopy(drug_data.iloc[:,[10,11,5,7]])

buyer_and_drug.columns = ['market_id','buyer_name','product_description','drug_prediction']

In [7]:
buyer_and_drug.tail()

Unnamed: 0,market_id,buyer_name,product_description,drug_prediction
119927,2,S***0,3 Piece - 61 USD - HOLIDAY SALE 17/EA | Distil...,[marijuana]
119928,2,c***9,4 Piece - 78 USD - HOLIDAY SALE 17/EA | Distil...,[marijuana]
119929,2,c***9,3 Piece - 61 USD - HOLIDAY SALE 17/EA | Distil...,[marijuana]
119930,2,s***r,6 Piece - 112 USD - HOLIDAY SALE 17/EA | Disti...,[marijuana]
119932,2,b***w,1 Piece - 240 EUR - 1 bar of hublot pollen hash,[hashish]


In [8]:
#function to convert list to string
def flatten(drug_list):
    str1 = ''.join(drug_list)
    return str1

#convert predictions from list to strings
buyer_and_drug['drug_cats'] = buyer_and_drug.iloc[:,3].apply(flatten)

In [9]:
buyer_and_drug.head()

Unnamed: 0,market_id,buyer_name,product_description,drug_prediction,drug_cats
0,3,P*****d,? MDMA very high QUALITY 87%?The Best MDMA for...,[ecstasy],ecstasy
1,3,B*****u,? MDMA very high QUALITY 87%?The Best MDMA for...,[ecstasy],ecstasy
2,3,z*****d,? MDMA very high QUALITY 87%?The Best MDMA for...,[ecstasy],ecstasy
3,3,b*****4,? MDMA very high QUALITY 87%?The Best MDMA for...,[ecstasy],ecstasy
4,3,K*****z,? MDMA very high QUALITY 87%?The Best MDMA for...,[ecstasy],ecstasy


In [10]:
#Define a function to get all drug types for each buyer_name
def getDrugForBuyer(buyer):
    
    buyer_subset = buyer_and_drug[buyer_and_drug['buyer_name'] == buyer]

    drug_pred_list = list(buyer_subset['drug_cats'])
    
    return drug_pred_list

In [11]:
#Create a dataframe with all the unique buyer names
buyer_df = pd.DataFrame(unique_buyers)
buyer_df.columns = ['buyer_name']

In [12]:
#Get all the list of drugs predictions for each vendor
drug_lists = []

for buyer in unique_buyers:
    buy = getDrugForBuyer(buyer)
    drug_lists.append(buy)

In [13]:
#fill dataframe with drug_prediction for each buyer
buyer_df['list_of_drugs'] = drug_lists

In [14]:
buyer_df.head()

Unnamed: 0,buyer_name,list_of_drugs
0,P*****d,"[ecstasy, ecstasy, lsd, ecstasy, ecstasy, hero..."
1,B*****u,"[ecstasy, methamphetamine, methamphetamine, me..."
2,z*****d,"[ecstasy, ecstasy, ecstasy, ecstasy, ecstasy, ..."
3,b*****4,"[ecstasy, ecstasy, not_drugs, not_drugs, not_d..."
4,K*****z,"[ecstasy, ecstasy]"


In [15]:
def getUniqueDrug(drug_list):
    unique_drugs = list(set(drug_list))
    
    return unique_drugs

In [16]:
#Get all the unique drugs for each buyer
buyer_df['unique_drug_predictions'] = buyer_df.iloc[:,1].apply(getUniqueDrug)

In [17]:
buyer_df.head()

Unnamed: 0,buyer_name,list_of_drugs,unique_drug_predictions
0,P*****d,"[ecstasy, ecstasy, lsd, ecstasy, ecstasy, hero...","[lsd, ecstasy, not_drugs, heroin]"
1,B*****u,"[ecstasy, methamphetamine, methamphetamine, me...","[ecstasy, marijuana, psychedelic, lsd, methamp..."
2,z*****d,"[ecstasy, ecstasy, ecstasy, ecstasy, ecstasy, ...","[ecstasy, hashish]"
3,b*****4,"[ecstasy, ecstasy, not_drugs, not_drugs, not_d...","[ecstasy, not_drugs]"
4,K*****z,"[ecstasy, ecstasy]",[ecstasy]


In [18]:
#Define a function to get all the market ids for each buyer
def getMarketOfBuyer(buyer):
    
    buyer_subset = buyer_and_drug[buyer_and_drug['buyer_name'] == buyer]

    market_id_list = list(buyer_subset['market_id'])
    
    return market_id_list

In [19]:
#Get all the list of market ids for each buyer
market_list = []

for buyer in unique_buyers:
    market_number = getMarketOfBuyer(buyer)
    market_list.append(market_number)

In [20]:
#fill dataframe with market ids for each buyer
buyer_df['market_number'] = market_list

In [21]:
def getUniqueMarket(m_list):
    unique_markets = list(set(m_list))
    
    return unique_markets

In [22]:
buyer_df.head()

Unnamed: 0,buyer_name,list_of_drugs,unique_drug_predictions,market_number
0,P*****d,"[ecstasy, ecstasy, lsd, ecstasy, ecstasy, hero...","[lsd, ecstasy, not_drugs, heroin]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,B*****u,"[ecstasy, methamphetamine, methamphetamine, me...","[ecstasy, marijuana, psychedelic, lsd, methamp...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,z*****d,"[ecstasy, ecstasy, ecstasy, ecstasy, ecstasy, ...","[ecstasy, hashish]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
3,b*****4,"[ecstasy, ecstasy, not_drugs, not_drugs, not_d...","[ecstasy, not_drugs]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,K*****z,"[ecstasy, ecstasy]",[ecstasy],"[3, 3]"


In [23]:
#Get all the unique drugs for each buyer
buyer_df['unique_market'] = buyer_df.iloc[:,3].apply(getUniqueMarket)

In [24]:
buyer_df.head()

Unnamed: 0,buyer_name,list_of_drugs,unique_drug_predictions,market_number,unique_market
0,P*****d,"[ecstasy, ecstasy, lsd, ecstasy, ecstasy, hero...","[lsd, ecstasy, not_drugs, heroin]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",[3]
1,B*****u,"[ecstasy, methamphetamine, methamphetamine, me...","[ecstasy, marijuana, psychedelic, lsd, methamp...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",[3]
2,z*****d,"[ecstasy, ecstasy, ecstasy, ecstasy, ecstasy, ...","[ecstasy, hashish]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",[3]
3,b*****4,"[ecstasy, ecstasy, not_drugs, not_drugs, not_d...","[ecstasy, not_drugs]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",[3]
4,K*****z,"[ecstasy, ecstasy]",[ecstasy],"[3, 3]",[3]


In [25]:
# buyer_df.to_csv(r'buyer_df.csv', index = False)

## Check for relation in buyer name and buying habits


In [26]:
#Get the first and last letter for each buyer
def getCharacters(vendor_name):
    first_letter = vendor_name[0]
    last_letter = vendor_name[-1]
    
    return first_letter + last_letter

In [27]:
#Get all the first and last letters of each buyer
buyer_df['buyer_ID'] = buyer_df.iloc[:,0].apply(getCharacters)

In [28]:
len(buyer_df)

4400

In [29]:
#Make copy of dataframe with just buyer id and unique purchases
id_drug_df = copy.deepcopy(buyer_df.iloc[:,[5,2]])

id_drug_df.columns = ['Buyer_ID', 'Unique_Buys']

In [30]:
id_drug_df = id_drug_df.sort_values(by=['Buyer_ID'])

In [31]:
id_drug_df.to_csv(r'id_drug.csv', index = False)

### Note: did not see many similarities

## Investigate Drug Co-Occurence


In [32]:
#Get the list unique drugs for each buyer
drug_series = buyer_df['unique_drug_predictions'].tolist()

In [33]:
from collections import Counter
from itertools import combinations

counter  = Counter()

In [34]:
#find the most common pairings of each list
for sub in drug_series:
    if len(drug_series) < 2:
        continue
    sub.sort()
    for comb in combinations(sub,2):
        counter[comb] += 1

In [35]:
#Print the most common pairings
most_common = counter.most_common()

most_common

[(('cocaine', 'marijuana'), 256),
 (('marijuana', 'not_drugs'), 246),
 (('', 'not_drugs'), 222),
 (('ecstasy', 'marijuana'), 221),
 (('', 'marijuana'), 199),
 (('cocaine', 'ecstasy'), 197),
 (('marijuana', 'methamphetamine'), 166),
 (('ecstasy', 'not_drugs'), 163),
 (('hashish', 'marijuana'), 140),
 (('cocaine', 'methamphetamine'), 138),
 (('', 'ecstasy'), 135),
 (('lsd', 'not_drugs'), 127),
 (('cocaine', 'not_drugs'), 124),
 (('', 'cocaine'), 122),
 (('ecstasy', 'methamphetamine'), 117),
 (('lsd', 'marijuana'), 117),
 (('ecstasy', 'lsd'), 107),
 (('benzodiazepines', 'marijuana'), 102),
 (('ecstasy', 'ketamine'), 91),
 (('cocaine', 'hashish'), 83),
 (('', 'methamphetamine'), 81),
 (('marijuana', 'psilocybin'), 81),
 (('benzodiazepines', 'cocaine'), 80),
 (('methamphetamine', 'not_drugs'), 79),
 (('cocaine', 'lsd'), 79),
 (('ketamine', 'marijuana'), 75),
 (('marijuana', 'stimulant'), 74),
 (('', 'lsd'), 73),
 (('cocaine', 'ketamine'), 73),
 (('benzodiazepines', 'ecstasy'), 72),
 (('ecst

In [36]:
market_co = pd.DataFrame(most_common)

In [37]:
#Export as csv
#market_co.to_csv('market_co.csv')

In [38]:
#saving in a file
import pickle
file=open('market_co' , 'wb')
pickle.dump(market_co, file)
file.close()