In [105]:
import pandas as pd
import plotly.express as px

In [106]:
data = pd.read_csv('drug200.csv')
data

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [None]:
data = data.replace('DrugY', 'drugY')
data

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [108]:
data['Drug'].value_counts()

Drug
drugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64

In [144]:
px.scatter(data_frame=data.groupby('Sex').get_group('F'),
           color='Drug',
           x='Age',
           y='Na_to_K',
           title='Age vs Na_to_K by Drug',
           labels={'Age': 'Age (years)', 'Na_to_K': 'Sodium to Potassium Ratio'},
           opacity=0.86)

### On the violin plot we got more information about the behavior of each drug effect at Na/K level

Obs: As we'll see, there're no correlation with any drugs but 'drugY' with high concentration of Na per K

In [111]:
px.violin(data_frame=data,
          x='Drug',
          y='Na_to_K',
          color='Sex',
          title='Sodium to Potassium Ratio by Drug',
          labels={'Na_to_K': 'Sodium to Potassium Ratio'})

In [112]:
px.pie(data_frame=data,
         names='Drug',
         title='Distribution of Drugs',
         labels={'Drug': 'Type of Drug'},
         hole=0.3)

### Ratio of High Cholesterol

In [113]:
high_cholesterol = data[data['Cholesterol'] == 'HIGH']

ratio_drug_cholesterol = high_cholesterol['Drug'].value_counts()/data['Drug'].value_counts()
ratio_drug_cholesterol     


Drug
drugA    0.521739
drugB    0.500000
drugC    1.000000
drugX    0.370370
drugY    0.516484
Name: count, dtype: float64

In [114]:
px.bar(data_frame=ratio_drug_cholesterol,
       x=ratio_drug_cholesterol.index,
       y=ratio_drug_cholesterol.values,
       title='Ratio of High Cholesterol by Drug',
       labels={'x': 'Drug', 'y': 'Ratio of High Cholesterol'},
       color=ratio_drug_cholesterol.index)

In [115]:
high_bp = data[data['BP'] == 'HIGH']
ratio_drug_highbp = high_bp['Drug'].value_counts()/ data['Drug'].value_counts()
ratio_drug_highbp

Drug
drugA    1.000000
drugB    1.000000
drugC         NaN
drugX         NaN
drugY    0.417582
Name: count, dtype: float64

### Ratio of High Blood Pressure

In [116]:
px.bar(data_frame=ratio_drug_highbp,
         x=ratio_drug_highbp.index,
         y=ratio_drug_highbp.values,
         title='Ratio of High Blood Pressure by Drug',
         labels={'x': 'Drug', 'y': 'Ratio of High Blood Pressure'},
         color=ratio_drug_highbp.index)

### Ratio of Low Blood Pressure

In [117]:
low_bp = data[data['BP'] == 'LOW']

ratio_drug_lowbp = low_bp['Drug'].value_counts()/ data['Drug'].value_counts()
ratio_drug_lowbp

Drug
drugA         NaN
drugB         NaN
drugC    1.000000
drugX    0.333333
drugY    0.329670
Name: count, dtype: float64

In [118]:
px.bar(data_frame=ratio_drug_lowbp,
         x=ratio_drug_lowbp.index,
         y=ratio_drug_lowbp.values,
         title='Ratio of Low Blood Pressure by Drug',
         labels={'x': 'Drug', 'y': 'Ratio of Low Blood Pressure'},
         color=ratio_drug_lowbp.index)

# Linear Regresion 

### Now we're gonna study if there's a correlation between DrugY and Na_to_K ratio

In [127]:
px.scatter(data_frame=data.groupby('Drug').get_group('drugY'),
           x='Age',
           y='Na_to_K',
           color='Drug',
           title='Age vs Sodium to Potassium Ratio for High Cholesterol Patients')

#### OBS: Only seeing this scatter we can supose a bad/weak correlation, but we've to proove it!

In [130]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go

In [129]:
data_dy = data[data['Drug'] == 'drugY']

In [134]:
X = data_dy[['Age']]
y = data_dy['Na_to_K']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)

print(f'Coefficients: {model.coef_[0]:.4f}')
print(f'Intercept: {model.intercept_[0]:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'RMSE: {mse**0.5:.4f}')

Coefficients: -0.2200
Intercept: 21.9150
Mean Squared Error: 35.8358
RMSE: 5.9863


### As we can see, the low value of Coefficients and high MSE shows us that the data are literaly scattered, so there's not a valuable correlation between theses two points. As we expect, the linear regression won't fit on the graph.

In [138]:
fig = px.scatter(data_dy, x='Age', y='Na_to_K',
                 title="Na_to_K vs Age",
                 labels={"Age": "Age", "Na_to_K": "Na_to_K"},
                 opacity=0.6)
x_vals = data_dy['Age']
x_scaled = scaler.transform(data_dy[['Age']]).flatten()
y_pred = model.intercept_[0] + x_scaled * model.coef_[0]
fig.add_trace(go.Scatter(x=x_vals, y=y_pred, mode='lines',
                         name='SGD Regression Line',
                         line=dict(color='black', dash='dash')))

### Now, the same with drugX

In [141]:
data_dx = data[data['Drug'] == 'drugX']

X_dx = data_dx[['Age']]
y_dx = data_dx['Na_to_K']

X_train_dx, X_test_dx, y_train_dx, y_test_dx = train_test_split(X_dx, y_dx, test_size=0.2, random_state=42)

scaler_dx = StandardScaler()
X_train_dx_scaled = scaler_dx.fit_transform(X_train_dx)
X_test_dx_scaled = scaler_dx.transform(X_test_dx)

model_dx = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
model_dx.fit(X_train_dx_scaled, y_train_dx)

y_pred_dx = model_dx.predict(X_test_dx_scaled)
mse_dx = mean_squared_error(y_test_dx, y_pred_dx)

print(f'Coefficients for drugX: {model_dx.coef_[0]:.4f}')
print(f'Intercept for drugX: {model_dx.intercept_[0]:.4f}')
print(f'Mean Squared Error for drugX: {mse_dx:.4f}')
print(f'RMSE for drugX: {mse_dx**0.5:.4f}')

Coefficients for drugX: 0.1906
Intercept for drugX: 10.4480
Mean Squared Error for drugX: 3.6496
RMSE for drugX: 1.9104


In [143]:
fig = px.scatter(data_dx, x='Age', y='Na_to_K',
                 title="Na_to_K vs Age",
                 labels={"Age": "Age", "Na_to_K": "Na_to_K"},
                 opacity=0.6)
x_vals = data_dx['Age']
x_scaled = scaler_dx.transform(data_dx[['Age']]).flatten()
y_pred = model_dx.intercept_[0] + x_scaled * model_dx.coef_[0]

fig.add_trace(go.Scatter(x=x_vals, y=y_pred, mode='lines',
                         name='SGD Regression Line',
                         line=dict(color='black', dash='dash')))