In [1]:
import pandas as pd
import re

In [2]:
# Path to the semicolon-separated drug seizure export.
drug_s = '../data/drug_seizures.csv'

# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# NOTE(review): the printed preview shows kg values such as "459,91", i.e. a
# comma is used as the decimal separator, so kg is loaded as text. Passing
# decimal=',' to read_csv would parse it as a number, but the later cleaning
# cells operate on kg as a string -- confirm before changing.
try:
    d_s = pd.read_csv(drug_s, sep=';')
except pd.errors.ParserError as e:
    print(f"Error parsing CSV file: {e}")
    # Every following cell needs d_s; stop here instead of continuing and
    # failing later with an unrelated NameError.
    raise
else:
    print(d_s.head())

        country                                     drug_group  year      kg
0  South Africa                        Ecstasy type substances  2021    1,00
1  South Africa                        Ecstasy type substances  2021    0,00
2  South Africa                        Ecstasy type substances  2021    0,10
3  South Africa  Amphetamine-type stimulants excluding ecstasy  2021   20,00
4  South Africa  Amphetamine-type stimulants excluding ecstasy  2021  459,91


In [3]:
# Rich preview of the first five loaded rows.
d_s.head(n=5)

Unnamed: 0,country,drug_group,year,kg
0,South Africa,Ecstasy type substances,2021,100
1,South Africa,Ecstasy type substances,2021,0
2,South Africa,Ecstasy type substances,2021,10
3,South Africa,Amphetamine-type stimulants excluding ecstasy,2021,2000
4,South Africa,Amphetamine-type stimulants excluding ecstasy,2021,45991


In [4]:
# (rows, columns) of the frame as loaded.
d_s.shape

(3660, 4)

In [5]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to
# measure the actual size of object (string) columns instead of estimating.
d_s.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3660 entries, 0 to 3659
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   country     3660 non-null   object
 1   drug_group  3660 non-null   object
 2   year        3660 non-null   int64 
 3   kg          3635 non-null   object
dtypes: int64(1), object(3)
memory usage: 774.8 KB


In [6]:
 # Distinct country names, in order of first appearance.
 unique_values = pd.unique(d_s['country'])
 print(unique_values)

['South Africa' 'Antigua and Barbuda' 'Jamaica' 'Trinidad and Tobago'
 'Belize' 'Costa Rica' 'Canada' 'Mexico' 'Argentina' 'Chile' 'Colombia'
 'Ecuador' 'Uruguay' 'Thailand' 'Israel' 'Croatia' 'Austria' 'Belgium'
 'Czechia' 'Italy' 'Luxembourg' 'Malta' 'Portugal' 'Slovenia' 'Spain'
 'Switzerland' 'Australia']


In [7]:
list = ['Antigua and Barbuda', 'Argentina', 'Australia', 'Austria', 'Belgium', 'Belize', 'Bermuda', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Dominica', 'Ecuador', 'Israel', 'Italy', 'Jamaica', 'Luxembourg', 'Malta', 'Mexico', 'Portugal', 'Slovenia', 'South Africa', 'Spain', 'Switzerland', 'Thailand', 'Trinidad and Tobago', 'Uruguay']

In [8]:
non_duplicate_values = d_s.loc[~d_s['country'].isin(list), 'country'].unique()
print(non_duplicate_values)

['Czechia']


In [9]:
 # Distinct drug group labels, in order of first appearance.
 unique_values = pd.unique(d_s['drug_group'])
 print(unique_values)

['Ecstasy type substances' 'Amphetamine-type stimulants excluding ecstasy'
 'Cannabis-type drugs (excluding synthetic cannabinoids)' 'Cocaine-type'
 'Hallucinogens' 'NPS' 'Opioids' 'Sedatives and tranquillizers'
 'Precursors' 'Any other drugs/substances' 'Solvents and Inhalants']


In [10]:
# Rows that have no seized quantity recorded.
null_rows = d_s.loc[d_s['kg'].isna()]

print(null_rows)

                  country                                         drug_group  \
68    Trinidad and Tobago  Cannabis-type drugs (excluding synthetic canna...   
69    Trinidad and Tobago  Cannabis-type drugs (excluding synthetic canna...   
1388              Uruguay      Amphetamine-type stimulants excluding ecstasy   
1397              Uruguay                                      Hallucinogens   
2351           Luxembourg                            Ecstasy type substances   
2353           Luxembourg                         Any other drugs/substances   
2354           Luxembourg                         Any other drugs/substances   
2355           Luxembourg                         Any other drugs/substances   
2357           Luxembourg                         Any other drugs/substances   
2358           Luxembourg                         Any other drugs/substances   
2360           Luxembourg                         Any other drugs/substances   
2362           Luxembourg               

In [11]:
# Remove, in place, every row whose kg value is missing.
d_s.dropna(axis=0, how='any', subset=['kg'], inplace=True)

In [12]:
# Normalise drug group labels to lower case.
d_s = d_s.assign(drug_group=d_s['drug_group'].str.lower())

In [13]:
# Check the lower-cased drug group labels.
d_s.head(n=5)

Unnamed: 0,country,drug_group,year,kg
0,South Africa,ecstasy type substances,2021,100
1,South Africa,ecstasy type substances,2021,0
2,South Africa,ecstasy type substances,2021,10
3,South Africa,amphetamine-type stimulants excluding ecstasy,2021,2000
4,South Africa,amphetamine-type stimulants excluding ecstasy,2021,45991


In [14]:
# Convert every cell to a string and delete each character that is neither a
# word character nor whitespace. Done column by column with Series.map because
# DataFrame.applymap is deprecated (pandas 2.1) and removed in pandas 3.0.
#
# NOTE: this runs on every column, so:
#   * year becomes a string column;
#   * the decimal comma in kg is removed too ('0,10' becomes '010'), leaving
#     only the digits. The later zero filter and integer conversion work on
#     those digit strings.
d_s = d_s.apply(lambda col: col.map(lambda x: re.sub(r'[^\w\s]', '', str(x))))
d_s.head()

Unnamed: 0,country,drug_group,year,kg
0,South Africa,ecstasy type substances,2021,100
1,South Africa,ecstasy type substances,2021,0
2,South Africa,ecstasy type substances,2021,10
3,South Africa,amphetaminetype stimulants excluding ecstasy,2021,2000
4,South Africa,amphetaminetype stimulants excluding ecstasy,2021,45991


In [15]:
# Keep only the first whitespace-separated word of each drug group label
# (e.g. 'ecstasy type substances' becomes 'ecstasy').
d_s['drug_group'] = d_s['drug_group'].str.split(n=1).str.get(0)
d_s.head(n=5)

Unnamed: 0,country,drug_group,year,kg
0,South Africa,ecstasy,2021,100
1,South Africa,ecstasy,2021,0
2,South Africa,ecstasy,2021,10
3,South Africa,amphetaminetype,2021,2000
4,South Africa,amphetaminetype,2021,45991


In [16]:
# (rows, columns) before the zero-quantity rows are filtered out.
d_s.shape

(3635, 4)

In [17]:
# Discard rows whose kg string consists solely of one to three zeros,
# i.e. seizures with a zero quantity once punctuation has been stripped.
d_s = d_s.loc[~d_s['kg'].str.contains('^0{1,3}$', regex=True)]
d_s.head(n=20)

Unnamed: 0,country,drug_group,year,kg
0,South Africa,ecstasy,2021,100
2,South Africa,ecstasy,2021,10
3,South Africa,amphetaminetype,2021,2000
4,South Africa,amphetaminetype,2021,45991
5,South Africa,amphetaminetype,2021,600
6,South Africa,cannabistype,2021,6978
7,South Africa,cannabistype,2021,1000
8,South Africa,cannabistype,2021,34460
9,South Africa,cocainetype,2021,34
10,South Africa,cocainetype,2021,528679


In [18]:
# First identifier to hand out; ids then increase by one per row.
start = 1

# Put a sequential drug_id in the first column position. The row index is left
# untouched, so it keeps the gaps produced by earlier filtering.
d_s.insert(loc=0, column='drug_id', value=range(start, len(d_s) + start))
d_s.head(n=5)

Unnamed: 0,drug_id,country,drug_group,year,kg
0,1,South Africa,ecstasy,2021,100
2,2,South Africa,ecstasy,2021,10
3,3,South Africa,amphetaminetype,2021,2000
4,4,South Africa,amphetaminetype,2021,45991
5,5,South Africa,amphetaminetype,2021,600


In [19]:
# Write the current frame to drug_seizure.csv without the row index.
# NOTE(review): the last cell of this notebook writes to the same file name
# again after duplicates are removed, overwriting this output.
d_s.to_csv(path_or_buf='drug_seizure.csv', index=False)

In [20]:
# Remove the id column again so it does not take part in the duplicate check.
d_s = d_s.drop(columns=['drug_id'])
d_s.shape

(2463, 4)

In [21]:
# Keep the first occurrence of every fully identical row.
d_s = d_s.drop_duplicates(subset=None, keep='first')

d_s.shape

(2314, 4)

In [22]:
# First identifier to hand out; ids then increase by one per row.
start = 1

# Re-number the de-duplicated rows with a sequential drug_id in the first
# column position.
d_s.insert(loc=0, column='drug_id', value=range(start, len(d_s) + start))
d_s.head(n=5)

Unnamed: 0,drug_id,country,drug_group,year,kg
0,1,South Africa,ecstasy,2021,100
2,2,South Africa,ecstasy,2021,10
3,3,South Africa,amphetaminetype,2021,2000
4,4,South Africa,amphetaminetype,2021,45991
5,5,South Africa,amphetaminetype,2021,600


In [23]:
 # Distinct kg strings that remain, in order of first appearance.
 unique_values = pd.unique(d_s['kg'])
 print(unique_values)

['100' '010' '2000' ... '26400' '533300' '126000']


In [24]:
# Convert the kg digit strings to integers.
# NOTE(review): the decimal comma was stripped from these strings earlier
# ('0,10' became '010'), so the integers are presumably hundredths of a
# kilogram rather than kilograms -- confirm against the raw file.
d_s = d_s.astype({'kg': int})
d_s.head(n=5)

Unnamed: 0,drug_id,country,drug_group,year,kg
0,1,South Africa,ecstasy,2021,100
2,2,South Africa,ecstasy,2021,10
3,3,South Africa,amphetaminetype,2021,2000
4,4,South Africa,amphetaminetype,2021,45991
5,5,South Africa,amphetaminetype,2021,600


In [25]:
# Write the cleaned, de-duplicated frame to drug_seizure.csv without the row
# index.
d_s.to_csv(path_or_buf='drug_seizure.csv', index=False)