In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Load the survey CSV and discard the 'ID' column, which the analysis does not use.
# NOTE(review): the path points into a Colab session ('/content/...'); the recorded output of
# this cell is a FileNotFoundError, so the CSV has to be uploaded there before running.
data = pd.read_csv('/content/Drug_Consumption_Quantified.csv').drop(columns='ID')
data.head()

FileNotFoundError: ignored

In [None]:
# Total number of missing (NaN) cells in the whole DataFrame: build the boolean mask, then sum it.
data.isna().to_numpy().sum()

In [None]:
# DataFrame.shape is a (rows, columns) tuple; unpack it and report both numbers.
n_rows, n_cols = data.shape
print(f'Original shape of data with {n_rows} rows and {n_cols} columns')

In [None]:
# 'Choc' (chocolate) and 'Semer' are treated as fake drugs in this notebook, so both columns
# are removed; the row index is then rebuilt as 0..n-1.
# Re-running this cell raises a KeyError because the columns are already gone.
data = data.drop(columns=['Choc', 'Semer']).reset_index(drop=True)

In [None]:
# Preview the first five rows after dropping the fake-drug columns.
data.head(n=5)

In [None]:
# Names of the drug-usage columns that are converted from 'CLx' labels to integers below.
drugs = [
    'Alcohol', 'Amyl', 'Amphet', 'Benzos', 'Caff', 'Cannabis',
    'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh',
    'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'VSA',
]


*   CL0 = No drug use
*   CL1 = Experimental or occasional use
*   CL2 = Frequent use
*   CL3 or above (CL4, CL5, CL6) = Problematic usage

In [None]:
#Ordinal Data, so we are assigning a integer to each value, for the machine to process it well
def drug_condition(x):
    """Convert a usage label ('CL0' .. 'CL6') into its integer level (0 .. 6).

    Any value that is not one of the seven known labels is encoded as 7.
    """
    usage_levels = ('CL0', 'CL1', 'CL2', 'CL3', 'CL4', 'CL5', 'CL6')
    # The position of a label in the tuple is its ordinal code.
    for code, level in enumerate(usage_levels):
        if x == level:
            return code
    return 7

In [None]:
# Replace the 'CLx' labels in every drug column with their integer codes.
# Run this cell only once per load: drug_condition maps anything that is not a 'CLx'
# string to 7, so re-running it on already-encoded columns turns every value into 7.
for column in drugs:
    data[column] = [drug_condition(level) for level in data[column]]

In [None]:
# Preview the first five rows with the drug columns now integer-encoded.
data.head(n=5)


In [None]:
# Distinct values present in the 'Education' column.
data['Education'].unique()

In [None]:
# One bar chart per drug; seaborn's barplot draws the mean encoded usage level per Age group.
# Labels, title and legend are set inside the loop because plt.show() finishes each figure,
# so anything set after the last barplot call would only affect the last chart.
for drug in ('Coke', 'Heroin', 'Meth'):
    ax = sns.barplot(data=data, x='Age', y=drug, label=drug)
    ax.set(xlabel='Age', ylabel=f'Mean {drug} usage level', title=f'{drug} usage by Age')
    ax.legend()
    plt.show()

In [None]:
# One bar chart per drug; seaborn's barplot draws the mean encoded usage level per Education group.
# Labels, title and legend are set inside the loop because plt.show() finishes each figure,
# so anything set after the last barplot call would only affect the last chart.
for drug in ('Coke', 'Meth', 'Heroin'):
    ax = sns.barplot(data=data, x='Education', y=drug, label=drug)
    ax.set(xlabel='Education', ylabel=f'Mean {drug} usage level', title=f'{drug} usage by Education')
    ax.legend()
    plt.show()

In [None]:
# One bar chart per drug; seaborn's barplot draws the mean encoded usage level per Gender group.
# Labels, title and legend are set inside the loop because plt.show() finishes each figure,
# so anything set after the last barplot call would only affect the last chart.
for drug in ('Coke', 'Heroin', 'Meth'):
    ax = sns.barplot(data=data, x='Gender', y=drug, label=drug)
    ax.set(xlabel='Gender', ylabel=f'Mean {drug} usage level', title=f'{drug} usage by Gender')
    ax.legend()
    plt.show()

In [None]:
# One bar chart per drug; seaborn's barplot draws the mean encoded usage level per Country group.
# Labels, title and legend are set inside the loop because plt.show() finishes each figure,
# so anything set after the last barplot call would only affect the last chart.
for drug in ('Coke', 'Heroin', 'Meth'):
    ax = sns.barplot(data=data, x='Country', y=drug, label=drug)
    ax.set(xlabel='Country', ylabel=f'Mean {drug} usage level', title=f'{drug} usage by Country')
    ax.legend()
    plt.show()

In [None]:
# One bar chart per drug; seaborn's barplot draws the mean encoded usage level per Ethnicity group.
# Labels, title and legend are set inside the loop because plt.show() finishes each figure,
# so anything set after the last barplot call would only affect the last chart.
for drug in ('Coke', 'Heroin', 'Meth'):
    ax = sns.barplot(data=data, x='Ethnicity', y=drug, label=drug)
    ax.set(xlabel='Ethnicity', ylabel=f'Mean {drug} usage level', title=f'{drug} usage by Ethnicity')
    ax.legend()
    plt.show()

According to the graphs, Age, Gender, Education, Country and Ethnicity all vary with drug usage, so none of them can be ignored or dropped.

In [None]:
# Binary target for cocaine: Cocaine_User is 1 when the encoded level of 'Coke' or 'Crack'
# is anything other than 0 or 1, and 0 otherwise.
cocaine_df = data.copy()

# Cocaine = Crack + Coke: a respondent counts as a user if either column flags them.
is_user = ~cocaine_df[['Coke', 'Crack']].isin([0, 1])
cocaine_df['Cocaine_User'] = is_user.any(axis=1).astype(int)

# Drop the source columns too. The target is computed directly from them, so leaving them
# among the features would let a model read the answer off its inputs (target leakage).
# The meth, heroin and nicotine frames drop their source column in the same way.
cocaine_df = cocaine_df.drop(columns=['Coke', 'Crack'])

In [None]:
# Meth target frame: Meth_User is 1 when the encoded 'Meth' level is not 0 or 1, else 0.
meth_df = data.copy()
meth_df['Meth_User'] = [0 if level in (0, 1) else 1 for level in meth_df['Meth']]
# The source column is removed so that only the binary label remains.
meth_df = meth_df.drop(columns=['Meth'])

In [None]:
# Heroin target frame: Heroin_User is 1 when the encoded 'Heroin' level is not 0 or 1, else 0.
heroin_df = data.copy()
heroin_df['Heroin_User'] = [0 if level in (0, 1) else 1 for level in heroin_df['Heroin']]
# The source column is removed so that only the binary label remains.
heroin_df = heroin_df.drop(columns=['Heroin'])

In [None]:
# Nicotine target frame: Nicotine_User is 1 when the encoded 'Nicotine' level is not 0 or 1, else 0.
nic_df = data.copy()
nic_df['Nicotine_User'] = [0 if level in (0, 1) else 1 for level in nic_df['Nicotine']]
# The source column is removed so that only the binary label remains.
nic_df = nic_df.drop(columns=['Nicotine'])

In [None]:
# Show the first row to check the columns of the cocaine frame.
cocaine_df.iloc[:1]

In [None]:
# Show the first row to check the columns of the meth frame.
meth_df.iloc[:1]

In [None]:
# Show the first row to check the columns of the heroin frame.
heroin_df.iloc[:1]

In [None]:
# Show the first row to check the columns of the nicotine frame.
nic_df.iloc[:1]

In [None]:
def preprocessing_inputs(df, column, test_size=0.2, random_state=1):
    """Split a frame into scaled train/test features and their targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column. It is not modified.
    column : str
        Name of the target column; every other column is used as a feature.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the size of the test split.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature frames are standardised
        and keep their original row index and column names.
    """
    y = df[column]
    X = df.drop(columns=column)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, so no test-set statistics
    # influence the transformation applied to either split.
    scaler = StandardScaler().fit(X_train)

    def _scaled(frame):
        # Wrap the scaled array back into a DataFrame to preserve row and column labels.
        return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)

    return _scaled(X_train), _scaled(X_test), y_train, y_test

# Model Training



# Cocaine


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Split and scale the cocaine frame; 'Cocaine_User' is the target.
X_train, X_test, y_train, y_test = preprocessing_inputs(cocaine_df, 'Cocaine_User')

print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)
print()

# The classifiers used below are imported once in the notebook's first cell,
# so they are not re-imported here.

# K-Nearest Neighbours (k = 3)
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
knn_report = classification_report(y_test, y_pred)

print("---------------------Prediction with K-Nearest Neighbour---------------")
print(knn_report)

# Gaussian Naive Bayes
nb_model = GaussianNB().fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
nb_report = classification_report(y_test, y_pred)

print('')
print('---------------------Prediction with Naive Bayes---------------------')
print(nb_report)  # print the stored report instead of computing it a second time

# Logistic Regression
lg_model = LogisticRegression().fit(X_train, y_train)
prediction = lg_model.predict(X_test)
lg_report = classification_report(y_test, prediction)

print('')
print('-----------------Prediction with Logistic Regression-----------------')
print(lg_report)

# Support Vector Machine (default SVC settings)
svm_model = SVC().fit(X_train, y_train)
prediction = svm_model.predict(X_test)
svm_report = classification_report(y_test, prediction)

print('')
print('---------------Prediction with Support Vector Machine----------------')
print(svm_report)


def extract_accuracy(report):
    """Return the overall accuracy printed in a ``classification_report`` string.

    The accuracy row of the text report has the form ``accuracy <value> <support>``;
    the value is the second-to-last field of that row.

    Raises
    ------
    ValueError
        If the report contains no accuracy row.
    """
    for line in report.splitlines():
        fields = line.split()
        # Match on the first field rather than a substring, so a class row whose
        # label merely contains the word "accuracy" is not mistaken for the summary row.
        if len(fields) >= 3 and fields[0] == 'accuracy':
            return float(fields[-2])
    raise ValueError('no accuracy row found in classification report')

# Read the overall accuracy of each classifier back from its text report.
# The report rounds to two decimals, so these values carry that precision only.
knn_accuracy = extract_accuracy(knn_report)
nb_accuracy = extract_accuracy(nb_report)
lg_accuracy = extract_accuracy(lg_report)
svm_accuracy = extract_accuracy(svm_report)

accuracies = {
    'K-Nearest Neighbour': knn_accuracy,
    'Naive Bayes': nb_accuracy,
    'Logistic Regression': lg_accuracy,
    'Support Vector Machine': svm_accuracy,
}

print('')
label_width = max(len(model_name) for model_name in accuracies)
for model_name, accuracy in accuracies.items():
    print(f'Accuracy for {model_name:<{label_width}}: {accuracy:.2f}')

# On a tie, the classifier listed first wins.
best_model = max(accuracies, key=accuracies.get)
highest_int = accuracies[best_model]  # original variable name kept; the value is an accuracy

print(f"The highest accuracy is: {highest_int:.2f} ({best_model})")


# Meth

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Split and scale the meth frame; 'Meth_User' is the target.
X_train, X_test, y_train, y_test = preprocessing_inputs(meth_df, 'Meth_User')

print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)
print()

# The classifiers used below are imported once in the notebook's first cell,
# so they are not re-imported here.

# K-Nearest Neighbours (k = 3)
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
knn_report = classification_report(y_test, y_pred)

print("---------------------Prediction with K-Nearest Neighbour---------------")
print(knn_report)

# Gaussian Naive Bayes
nb_model = GaussianNB().fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
nb_report = classification_report(y_test, y_pred)

print('')
print('---------------------Prediction with Naive Bayes---------------------')
print(nb_report)  # print the stored report instead of computing it a second time

# Logistic Regression
lg_model = LogisticRegression().fit(X_train, y_train)
prediction = lg_model.predict(X_test)
lg_report = classification_report(y_test, prediction)

print('')
print('-----------------Prediction with Logistic Regression-----------------')
print(lg_report)

# Support Vector Machine (default SVC settings)
svm_model = SVC().fit(X_train, y_train)
prediction = svm_model.predict(X_test)
svm_report = classification_report(y_test, prediction)

print('')
print('---------------Prediction with Support Vector Machine----------------')
print(svm_report)


def extract_accuracy(report):
    """Return the overall accuracy printed in a ``classification_report`` string.

    The accuracy row of the text report has the form ``accuracy <value> <support>``;
    the value is the second-to-last field of that row.

    Raises
    ------
    ValueError
        If the report contains no accuracy row.
    """
    for line in report.splitlines():
        fields = line.split()
        # Match on the first field rather than a substring, so a class row whose
        # label merely contains the word "accuracy" is not mistaken for the summary row.
        if len(fields) >= 3 and fields[0] == 'accuracy':
            return float(fields[-2])
    raise ValueError('no accuracy row found in classification report')

# Read the overall accuracy of each classifier back from its text report.
# The report rounds to two decimals, so these values carry that precision only.
knn_accuracy = extract_accuracy(knn_report)
nb_accuracy = extract_accuracy(nb_report)
lg_accuracy = extract_accuracy(lg_report)
svm_accuracy = extract_accuracy(svm_report)

accuracies = {
    'K-Nearest Neighbour': knn_accuracy,
    'Naive Bayes': nb_accuracy,
    'Logistic Regression': lg_accuracy,
    'Support Vector Machine': svm_accuracy,
}

print('')
label_width = max(len(model_name) for model_name in accuracies)
for model_name, accuracy in accuracies.items():
    print(f'Accuracy for {model_name:<{label_width}}: {accuracy:.2f}')

# On a tie, the classifier listed first wins.
best_model = max(accuracies, key=accuracies.get)
highest_int = accuracies[best_model]  # original variable name kept; the value is an accuracy

print(f"The highest accuracy is: {highest_int:.2f} ({best_model})")


# Heroin

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Split and scale the heroin frame; 'Heroin_User' is the target.
X_train, X_test, y_train, y_test = preprocessing_inputs(heroin_df, 'Heroin_User')

print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)
print()

# The classifiers used below are imported once in the notebook's first cell,
# so they are not re-imported here.

# K-Nearest Neighbours (k = 3)
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
knn_report = classification_report(y_test, y_pred)

print("---------------------Prediction with K-Nearest Neighbour---------------")
print(knn_report)

# Gaussian Naive Bayes
nb_model = GaussianNB().fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
nb_report = classification_report(y_test, y_pred)

print('')
print('---------------------Prediction with Naive Bayes---------------------')
print(nb_report)  # print the stored report instead of computing it a second time

# Logistic Regression
lg_model = LogisticRegression().fit(X_train, y_train)
prediction = lg_model.predict(X_test)
lg_report = classification_report(y_test, prediction)

print('')
print('-----------------Prediction with Logistic Regression-----------------')
print(lg_report)

# Support Vector Machine (default SVC settings)
svm_model = SVC().fit(X_train, y_train)
prediction = svm_model.predict(X_test)
svm_report = classification_report(y_test, prediction)

print('')
print('---------------Prediction with Support Vector Machine----------------')
print(svm_report)


def extract_accuracy(report):
    """Return the overall accuracy printed in a ``classification_report`` string.

    The accuracy row of the text report has the form ``accuracy <value> <support>``;
    the value is the second-to-last field of that row.

    Raises
    ------
    ValueError
        If the report contains no accuracy row.
    """
    for line in report.splitlines():
        fields = line.split()
        # Match on the first field rather than a substring, so a class row whose
        # label merely contains the word "accuracy" is not mistaken for the summary row.
        if len(fields) >= 3 and fields[0] == 'accuracy':
            return float(fields[-2])
    raise ValueError('no accuracy row found in classification report')

# Read the overall accuracy of each classifier back from its text report.
# The report rounds to two decimals, so these values carry that precision only.
knn_accuracy = extract_accuracy(knn_report)
nb_accuracy = extract_accuracy(nb_report)
lg_accuracy = extract_accuracy(lg_report)
svm_accuracy = extract_accuracy(svm_report)

accuracies = {
    'K-Nearest Neighbour': knn_accuracy,
    'Naive Bayes': nb_accuracy,
    'Logistic Regression': lg_accuracy,
    'Support Vector Machine': svm_accuracy,
}

print('')
label_width = max(len(model_name) for model_name in accuracies)
for model_name, accuracy in accuracies.items():
    print(f'Accuracy for {model_name:<{label_width}}: {accuracy:.2f}')

# On a tie, the classifier listed first wins.
best_model = max(accuracies, key=accuracies.get)
highest_int = accuracies[best_model]  # original variable name kept; the value is an accuracy

print(f"The highest accuracy is: {highest_int:.2f} ({best_model})")


# Nicotine

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Split and scale the nicotine frame; 'Nicotine_User' is the target.
X_train, X_test, y_train, y_test = preprocessing_inputs(nic_df, 'Nicotine_User')

print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)
print()

# The classifiers used below are imported once in the notebook's first cell,
# so they are not re-imported here.

# K-Nearest Neighbours (k = 3)
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
knn_report = classification_report(y_test, y_pred)

print("---------------------Prediction with K-Nearest Neighbour---------------")
print(knn_report)

# Gaussian Naive Bayes
nb_model = GaussianNB().fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
nb_report = classification_report(y_test, y_pred)

print('')
print('---------------------Prediction with Naive Bayes---------------------')
print(nb_report)  # print the stored report instead of computing it a second time

# Logistic Regression
lg_model = LogisticRegression().fit(X_train, y_train)
prediction = lg_model.predict(X_test)
lg_report = classification_report(y_test, prediction)

print('')
print('-----------------Prediction with Logistic Regression-----------------')
print(lg_report)

# Support Vector Machine (default SVC settings)
svm_model = SVC().fit(X_train, y_train)
prediction = svm_model.predict(X_test)
svm_report = classification_report(y_test, prediction)

print('')
print('---------------Prediction with Support Vector Machine----------------')
print(svm_report)


def extract_accuracy(report):
    """Return the overall accuracy printed in a ``classification_report`` string.

    The accuracy row of the text report has the form ``accuracy <value> <support>``;
    the value is the second-to-last field of that row.

    Raises
    ------
    ValueError
        If the report contains no accuracy row.
    """
    for line in report.splitlines():
        fields = line.split()
        # Match on the first field rather than a substring, so a class row whose
        # label merely contains the word "accuracy" is not mistaken for the summary row.
        if len(fields) >= 3 and fields[0] == 'accuracy':
            return float(fields[-2])
    raise ValueError('no accuracy row found in classification report')

# Read the overall accuracy of each classifier back from its text report.
# The report rounds to two decimals, so these values carry that precision only.
knn_accuracy = extract_accuracy(knn_report)
nb_accuracy = extract_accuracy(nb_report)
lg_accuracy = extract_accuracy(lg_report)
svm_accuracy = extract_accuracy(svm_report)

accuracies = {
    'K-Nearest Neighbour': knn_accuracy,
    'Naive Bayes': nb_accuracy,
    'Logistic Regression': lg_accuracy,
    'Support Vector Machine': svm_accuracy,
}

print('')
label_width = max(len(model_name) for model_name in accuracies)
for model_name, accuracy in accuracies.items():
    print(f'Accuracy for {model_name:<{label_width}}: {accuracy:.2f}')

# On a tie, the classifier listed first wins.
best_model = max(accuracies, key=accuracies.get)
highest_int = accuracies[best_model]  # original variable name kept; the value is an accuracy

print(f"The highest accuracy is: {highest_int:.2f} ({best_model})")
