# PROJECT PYTHON FOR DATA ANALYSIS
## Drug Consumption

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

### Dataset preparation

In [None]:
# Predictor columns: demographic attributes followed by personality measures.
demographic_columns = ["Age", "Gender", "Education", "Country", "Ethnicity"]
personality_columns = [
    "Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS",
]
feature_columns = [*demographic_columns, *personality_columns]

# One column per substance, in the order used to label the data file.
drugs_columns = [
    "Alcohol",
    "Amphetamines",
    "Amyl nitrite",
    "Benzodiazepine",
    "Caffeine",
    "Cannabis",
    "Chocolate",
    "Cocaine",
    "Crack",
    "Ecstasy",
    "Heroin",
    "Ketamine",
    "Legal highs",
    "LSD",
    "Methadone",
    "Mushrooms",
    "Nicotine",
    "Fictitious drug Semeron",
    "Volatile substance abuse",
]
drugs_legal = ['Alcohol', 'Caffeine', 'Chocolate', 'Nicotine']
# Every substance that is not listed as legal, keeping the drugs_columns order.
drugs_illegal = [name for name in drugs_columns if name not in drugs_legal]

# Full column layout: predictors first, then the substances.
all_columns = [*feature_columns, *drugs_columns]

### Loading the dataset

In [None]:
# Read the header-less data file and label its columns with all_columns.
# NOTE(review): presumably the file carries one extra leading field (a record
# id) that pandas then uses as the index; reset_index(drop=True) replaces
# whatever index was read with a plain 0..n-1 range - confirm against the file.
df = pd.read_csv(
    "drug_consumption.data",
    header=None,
    names=all_columns,
).reset_index(drop=True)
df

### Converting nominal drug-usage classes to ordinal values

In [None]:
# Usage classes are stored as the strings 'CL0'..'CL6'; replace each one with
# its integer rank so the drug columns become ordinal. Values that are not in
# the table become NaN (behaviour of Series.map).
usage_rank = {
    'CL0': 0,
    'CL1': 1,
    'CL2': 2,
    'CL3': 3,
    'CL4': 4,
    'CL5': 5,
    'CL6': 6,
}
for drug_col in drugs_columns:
    df[drug_col] = df[drug_col].map(usage_rank)
df.head()

### Removing Semeron users

In [None]:
# Semeron is the fictitious drug: any row with a non-zero level for it is set
# aside in `semerons`, and the remaining rows form `df_noFake`.
semeron_col = 'Fictitious drug Semeron'
claims_semeron = df[semeron_col] != 0
semerons = df[claims_semeron]

# The fictitious drug is no longer a target, so take it out of the drug lists.
# These removals mutate the lists in place; running the cell a second time
# raises ValueError because the entry is already gone.
for drug_list in (drugs_columns, drugs_illegal):
    drug_list.remove(semeron_col)

# Keep only the other rows, drop the now-constant column and renumber the rows.
df_noFake = (
    df[~claims_semeron]
    .drop(columns=semeron_col)
    .reset_index(drop=True)
)
df_noFake.shape
df_noFake

## Machine Learning Models

### Binarization of the outputs

In [None]:
# Binarization for user/non-user
# Binarization for user/non-user
# Targets are the per-drug usage levels; predictors are every remaining column.
outputs = df_noFake[drugs_columns]
X = df_noFake.drop(columns=drugs_columns)

# A level above 2 counts as "user" (1); anything else as "non-user" (0).
# One vectorized comparison replaces the former per-cell .loc loop, which made
# rows x drugs scalar assignments and only worked while the index was exactly
# 0..n-1.
bin_outputs = (outputs > 2).astype(int)
bin_outputs

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn import metrics

def IsUserPrediction_LogReg(X, drug_name):
    """Fit a logistic regression that predicts user/non-user for one drug.

    The rows of ``X`` are split 80/20 with a fixed ``random_state=9``; the
    model is fitted on the training part and scored on the held-out part. The
    confusion matrix is drawn as a heatmap on a new 9x9 matplotlib figure.

    Args:
        X: Feature table whose rows line up with the module-level
            ``bin_outputs`` frame.
        drug_name: Name of the ``bin_outputs`` column to predict.

    Returns:
        Accuracy of the predictions on the test split.
    """
    # Train on the binarized labels. The raw ``outputs`` column holds the
    # ordinal usage levels, which would turn this into a multi-class problem
    # instead of the yes/no question asked in the plot title.
    y = bin_outputs[drug_name].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=9
    )

    reg_log = LogisticRegression()
    reg_log.fit(X_train, y_train)
    y_pred = reg_log.predict(X_test)

    # Pin the label order so the matrix is always 2x2 (non-user, user), even
    # when one class is absent from the test split.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    score_cm = accuracy_score(y_test, y_pred)

    plt.figure(figsize=(9,9))
    sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title(f"Is a(n) {drug_name} user? \nAccuracy score: {score_cm*100:.2f}%")
    return score_cm
    
    
# Evaluate the logistic regression on LSD; the returned test accuracy is the
# value of the cell's last expression.
IsUserPrediction_LogReg(X, drug_name="LSD")
   

### LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

def IsUserPrediction_LDA(X, drug_name):
    """Fit a linear discriminant analysis that predicts user/non-user for one drug.

    The rows of ``X`` are split 80/20 with a fixed ``random_state=9``; the
    model is fitted on the training part and scored on the held-out part. The
    confusion matrix is drawn as a heatmap on a new 9x9 matplotlib figure.

    Args:
        X: Feature table whose rows line up with the module-level
            ``bin_outputs`` frame.
        drug_name: Name of the ``bin_outputs`` column to predict.

    Returns:
        Accuracy of the predictions on the test split.
    """
    # Train on the binarized labels. The raw ``outputs`` column holds the
    # ordinal usage levels, which would turn this into a multi-class problem
    # instead of the yes/no question asked in the plot title.
    y = bin_outputs[drug_name].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=9
    )

    clf = LDA()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pin the label order so the matrix is always 2x2 (non-user, user), even
    # when one class is absent from the test split.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    score_cm = accuracy_score(y_test, y_pred)

    plt.figure(figsize=(9,9))
    sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title(f"Is a(n) {drug_name} user? \nAccuracy score: {score_cm*100:.2f}%")
    return score_cm

# Evaluate the LDA model on LSD; the returned test accuracy is the value of
# the cell's last expression.
IsUserPrediction_LDA(X, drug_name="LSD")

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

def IsUserPrediction_RFC(X, drug_name):
    """Fit a random forest that predicts user/non-user for one drug.

    The rows of ``X`` are split 80/20 with a fixed ``random_state=9``; a
    200-tree forest (``random_state=1``) is fitted on the training part and
    scored on the held-out part. The confusion matrix is drawn as a heatmap on
    a new 9x9 matplotlib figure.

    Args:
        X: Feature table whose rows line up with the module-level
            ``bin_outputs`` frame.
        drug_name: Name of the ``bin_outputs`` column to predict.

    Returns:
        Accuracy of the predictions on the test split.
    """
    # Train on the binarized labels. The raw ``outputs`` column holds the
    # ordinal usage levels, which would turn this into a multi-class problem
    # instead of the yes/no question asked in the plot title.
    y = bin_outputs[drug_name].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=9
    )

    clf = RandomForestClassifier(n_estimators=200, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pin the label order so the matrix is always 2x2 (non-user, user), even
    # when one class is absent from the test split.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    score_cm = accuracy_score(y_test, y_pred)

    plt.figure(figsize=(9,9))
    sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title(f"Is a(n) {drug_name} user? \nAccuracy score: {score_cm*100:.2f}%")
    return score_cm

# Evaluate the random forest on Heroin; the returned test accuracy is the
# value of the cell's last expression.
IsUserPrediction_RFC(X, drug_name="Heroin")



### SVC

In [None]:
from sklearn import svm

def IsUserPrediction_SVC(X, drug_name):
    """Fit a support vector classifier that predicts user/non-user for one drug.

    The rows of ``X`` are split 80/20 with a fixed ``random_state=9``; an
    ``svm.SVC`` with default settings is fitted on the training part and
    scored on the held-out part. The confusion matrix is drawn as a heatmap on
    a new 9x9 matplotlib figure.

    Args:
        X: Feature table whose rows line up with the module-level
            ``bin_outputs`` frame.
        drug_name: Name of the ``bin_outputs`` column to predict.

    Returns:
        Accuracy of the predictions on the test split.
    """
    # Train on the binarized labels. The raw ``outputs`` column holds the
    # ordinal usage levels, which would turn this into a multi-class problem
    # instead of the yes/no question asked in the plot title.
    y = bin_outputs[drug_name].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=9
    )

    clf = svm.SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pin the label order so the matrix is always 2x2 (non-user, user), even
    # when one class is absent from the test split.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    score_cm = accuracy_score(y_test, y_pred)

    plt.figure(figsize=(9,9))
    sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title(f"Is a(n) {drug_name} user? \nAccuracy score: {score_cm*100:.2f}%")
    return score_cm

# Evaluate the support vector classifier on Heroin; the returned test accuracy
# is the value of the cell's last expression.
IsUserPrediction_SVC(X, drug_name="Heroin")

### KNN
#### Adding a new variable: is the individual an illegal drug user or not?

In [None]:
# Flag a row as an illegal-drug user (1) when more than 3 of the illegal drugs
# have a non-zero usage level, otherwise 0. The count is computed for all rows
# at once instead of re-selecting the columns and calling .iloc per row.
# NOTE(review): this works on `df`, which still contains the rows that were
# filtered out when building `df_noFake` - confirm that is intended.
illegal_drug_count = (df[drugs_illegal] != 0).sum(axis=1)
illegal_drug_var = (illegal_drug_count > 3).astype(int).tolist()
df['Illegal_user'] = illegal_drug_var

In [None]:
import plotly.express as px
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load and split data
# Load and split data
# Per-drug usage levels (rebinds the module-level `outputs` to the full `df`).
outputs = df[drugs_columns]
# Predictors are the demographic and personality columns only. Dropping
# drugs_columns from df is not enough here: 'Fictitious drug Semeron' was
# removed from that list earlier but is still a column of df, so it would slip
# into the feature matrix alongside the real predictors.
X = df[feature_columns].copy()
# Hold out 20% of the rows; the label is the derived Illegal_user flag.
X_train, X_test, y_train, y_test = train_test_split(
    X, df.Illegal_user, test_size=0.2, random_state=9
)

# Fit the model on training data, predict on test data
clf = KNeighborsClassifier(10)
clf.fit(X_train, y_train)
# Predicted probability of the second class listed in clf.classes_.
y_score = clf.predict_proba(X_test)[:, 1]

fig = px.scatter(
    X_test.Age,
    color=y_score, color_continuous_scale='RdBu',
    # symbol_map is keyed by the strings '0'/'1', so the integer labels are
    # passed as strings; with integer labels the map is never matched.
    symbol=y_test.astype(str), symbol_map={'0': 'square-dot', '1': 'circle-dot'},
    labels={'symbol': 'label', 'color': 'score of <br>first class'}
)
fig.update_traces(marker_size=12, marker_line_width=1.5)
fig.update_layout(legend_orientation='h')
fig.show()

#### PCA

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA

# Per-drug usage levels (rebinds the module-level `outputs` to the full `df`).
outputs = df[drugs_columns]
# Project only the demographic and personality columns. Dropping drugs_columns
# from df would leave 'Fictitious drug Semeron' in the matrix, because it was
# removed from that list earlier but is still a column of df.
X = df[feature_columns].copy()

# Change the number of principal components
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Share of the total variance captured by the retained components, in percent.
total_var = pca.explained_variance_ratio_.sum() * 100

# 2 components
fig = px.scatter(
    components, x=0, y=1, color=df.Illegal_user,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2'},
)

# 3 components (alternative plot; needs at least 3 components above)
# fig = px.scatter_3d(
#     components, x=0, y=1, z=2, color=df.Illegal_user,
#     title=f'Total Explained Variance: {total_var:.2f}%',
#     labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
# )

# Cumulative explained variance (alternative plot)
# pca.fit(df)
# exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
#
# px.area(
#     x=range(1, exp_var_cumul.shape[0] + 1),
#     y=exp_var_cumul,
#     labels={"x": "# Components", "y": "Explained Variance"}
# )

fig.show()


In [None]:
# Scale each principal axis by the square root of the variance it explains,
# giving one column per component and one row per input feature.
contribs = np.multiply(
    pca.components_.T,
    np.sqrt(pca.explained_variance_),
)
contribs

In [None]:
# Display the principal axes of the fitted PCA (one row per component).
pca.components_

### Multi-output Classification

In [None]:
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np
# Synthetic demonstration data: 10 samples, 100 features, 3 classes.
# NOTE(review): this rebinds the module-level `X` to synthetic data, so the
# drug feature matrix is no longer available under that name afterwards.
X, y1 = make_classification(
    n_samples=10,
    n_features=100,
    n_informative=30,
    n_classes=3,
    random_state=1,
)

# Two further targets obtained by shuffling the first one with fixed seeds.
y2, y3 = (shuffle(y1, random_state=seed) for seed in (1, 2))

# One column per target, one row per sample.
Y = np.array([y1, y2, y3]).T

n_samples = X.shape[0]   # 10
n_features = X.shape[1]  # 100
n_outputs = Y.shape[1]   # 3
n_classes = 3

# One random forest is fitted per target column, using all available cores.
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(X, Y)
multi_target_forest.predict(X)