# Medicine Details Analysis

## Your task is to analyze a dataset containing detailed information about over 11,000 medicines, including their salt compositions, uses, side effects, manufacturers, and user reviews. The goal is to uncover patterns and insights that can help improve decision-making in the healthcare industry and enhance patient outcomes.

In [103]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [105]:
#Load the dataset
data = pd.read_csv('Medicine_Details.csv')
data.head(10)

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Image URL,Manufacturer,Excellent Review %,Average Review %,Poor Review %
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Roche Products India Pvt Ltd,22,56,22
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glaxo SmithKline Pharmaceuticals Ltd,47,35,18
2,Azithral 500 Tablet,Azithromycin (500mg),Treatment of Bacterial infections,Nausea Abdominal pain Diarrhea,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Alembic Pharmaceuticals Ltd,39,40,21
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...,Treatment of Cough with mucus,Nausea Vomiting Diarrhea Upset stomach Stomach...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glenmark Pharmaceuticals Ltd,24,41,35
4,Aciloc 150 Tablet,Ranitidine (150mg),Treatment of Gastroesophageal reflux disease (...,Headache Diarrhea Gastrointestinal disturbance,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Cadila Pharmaceuticals Ltd,34,37,29
5,Allegra 120mg Tablet,Fexofenadine (120mg),Treatment of Sneezing and runny nose due to al...,Headache Drowsiness Dizziness Nausea,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Sanofi India Ltd,35,42,23
6,Avil 25 Tablet,Pheniramine (25mg),Treatment of Allergic conditionsTreatment of R...,Sedation,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Sanofi India Ltd,40,34,26
7,Aricep 5 Tablet,Donepezil (5mg),Alzheimer's disease,Common cold Urinary incontinence Rash Nausea D...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Eisai Pharmaceuticals India Pvt Ltd,43,28,29
8,Amoxyclav 625 Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Abbott,36,43,21
9,Atarax 25mg Tablet,Hydroxyzine (25mg),Treatment of AnxietyTreatment of Skin conditio...,Sedation Nausea Vomiting Upset stomach Constip...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Dr Reddy's Laboratories Ltd,35,41,24


## Data Preprocessing

In [108]:
#Check for Missing Values
print(data.isnull().sum())

Medicine Name         0
Composition           0
Uses                  0
Side_effects          0
Image URL             0
Manufacturer          0
Excellent Review %    0
Average Review %      0
Poor Review %         0
dtype: int64


In [110]:
#Drop Irrelevant Columns
data = data.drop(columns = ['Image URL'])
data.head()

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Manufacturer,Excellent Review %,Average Review %,Poor Review %
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,Roche Products India Pvt Ltd,22,56,22
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,Glaxo SmithKline Pharmaceuticals Ltd,47,35,18
2,Azithral 500 Tablet,Azithromycin (500mg),Treatment of Bacterial infections,Nausea Abdominal pain Diarrhea,Alembic Pharmaceuticals Ltd,39,40,21
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...,Treatment of Cough with mucus,Nausea Vomiting Diarrhea Upset stomach Stomach...,Glenmark Pharmaceuticals Ltd,24,41,35
4,Aciloc 150 Tablet,Ranitidine (150mg),Treatment of Gastroesophageal reflux disease (...,Headache Diarrhea Gastrointestinal disturbance,Cadila Pharmaceuticals Ltd,34,37,29


In [112]:
#Separate Features and Target Variable
x = data.drop('Excellent Review %', axis = 1)
y = data['Excellent Review %']

#Identify Categorical and Numerical Columns
categorical_cols = x.select_dtypes(include = ['object']).columns
numerical_cols = x.select_dtypes(include = ['float64', 'int64']).columns

#Data Preprocessing for Numerical Data
numerical_transformer = Pipeline(steps = [('scaler', StandardScaler())])

#Data Preprocessing for Categorical Data
categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

#Combine Numerical and Categorical Data
preprocessor = ColumnTransformer(
    transformers = [('num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols)])

#Apply Transformations
x = preprocessor.fit_transform(x)

In [114]:
#Splitting the Dataset Into Training and Test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

## EDA

In [117]:
#General Statistics of Data
data.describe()

Unnamed: 0,Excellent Review %,Average Review %,Poor Review %
count,11825.0,11825.0,11825.0
mean,38.516025,35.756364,25.727611
std,25.225343,18.268134,23.991985
min,0.0,0.0,0.0
25%,22.0,27.0,0.0
50%,34.0,35.0,22.0
75%,51.0,47.0,35.0
max,100.0,88.0,100.0


## Machine Learning Models

In [120]:
#Linear Regression
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

#Make Predictions on the Test Set
lr_predict = lr_model.predict(x_test)

#Evaluate
print('Linear Regression Model: ')
print(f'Mean Absolute Error : {mean_absolute_error(y_test, lr_predict)}')
print(f'Mean Squared Error : {mean_squared_error(y_test, lr_predict)}')
print(f'R-Squared : {r2_score(y_test, lr_predict)}')

Linear Regression Model: 
Mean Absolute Error : 0.0005110958701100271
Mean Squared Error : 5.401887283915776e-07
R-Squared : 0.9999999991433933


In [122]:
#Decision Tree
dt_model = DecisionTreeRegressor()
dt_model.fit(x_train, y_train)

#Make Predictions
dt_predict = dt_model.predict(x_test)

#Evaluate
print('Decision Tree Model: ')
print(f'Mean Absolute Error : {mean_absolute_error(y_test, dt_predict)}')
print(f'Mean Squared Error : {mean_squared_error(y_test, dt_predict)}')
print(f'R-Squared : {r2_score(y_test, dt_predict)}')

Decision Tree Model: 
Mean Absolute Error : 0.15687103594080337
Mean Squared Error : 0.4427061310782241
R-Squared : 0.9992979767658952


In [124]:
#Random Forest
rfr_model = RandomForestRegressor()
rfr_model.fit(x_train, y_train)

#Make Predictions
rfr_predict = rfr_model.predict(x_test)

#Evaluate
print('Random Forest Regressor Model: ')
print(f'Mean Absolute Error : {mean_absolute_error(y_test, rfr_predict)}')
print(f'Mean Squared Error : {mean_squared_error(y_test, rfr_predict)}')
print(f'R-Squared : {r2_score(y_test, rfr_predict)}')

Random Forest Regressor Model: 
Mean Absolute Error : 0.16651585623678655
Mean Squared Error : 0.3226049471458774
R-Squared : 0.9994884277572981


## Deep Learning Models

In [127]:
#General info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11825 entries, 0 to 11824
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Medicine Name       11825 non-null  object
 1   Composition         11825 non-null  object
 2   Uses                11825 non-null  object
 3   Side_effects        11825 non-null  object
 4   Manufacturer        11825 non-null  object
 5   Excellent Review %  11825 non-null  int64 
 6   Average Review %    11825 non-null  int64 
 7   Poor Review %       11825 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 739.2+ KB


In [129]:
#Data Preprocessing
features = data[['Excellent Review %', 'Average Review %', 'Poor Review %']]

#Scale data
scaler = MinMaxScaler(feature_range = (0, 1))
scaled_data = scaler.fit_transform(features)

In [131]:
#Prepare Training Data
def create_sequences(data, seq_length):
    xs, ys = [], []
    
    for i in range(len(data) - seq_length):
        x = data[i:i + seq_length]
        y = data[i + seq_length][2] 
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

seq_length = 60
x, y = create_sequences(scaled_data, seq_length)

In [133]:
#Split Data Into Training and Test Sets
split = int(0.8 * len(x))
x_train, x_test = x[:split], x[split:]
y_train, y_test = y[:split], y[split:]

In [135]:
#Building the Model
model = Sequential()
model.add(LSTM(units = 50, return_sequences = True, input_shape = (x_train.shape[1], x_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units = 50, return_sequences = False))
model.add(Dropout(0.2))
model.add(Dense(units = 1))

model.compile(optimizer = 'adam', loss = 'mean_squared_error')

  super().__init__(**kwargs)


In [137]:
#Training the Model
history = model.fit(x_train, y_train, epochs = 100, batch_size = 32, validation_split = 0.1)

Epoch 1/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - loss: 0.0598 - val_loss: 0.0621
Epoch 2/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - loss: 0.0574 - val_loss: 0.0620
Epoch 3/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - loss: 0.0579 - val_loss: 0.0630
Epoch 4/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - loss: 0.0572 - val_loss: 0.0620
Epoch 5/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - loss: 0.0568 - val_loss: 0.0620
Epoch 6/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - loss: 0.0574 - val_loss: 0.0620
Epoch 7/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - loss: 0.0563 - val_loss: 0.0620
Epoch 8/100
[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - loss: 0.0584 - val_loss: 0.0619
Epoch 9/100
[1m265/265

In [139]:
#Evaluating the Model
predicted_review = model.predict(x_test)
predicted_review = scaler.inverse_transform(np.concatenate((np.zeros((predicted_review.shape[0], 2)), predicted_review), axis = 1))[:, 2]

#Inverse transform the actual medals
actual_review = scaler.inverse_transform(np.concatenate((np.zeros((y_test.shape[0], 2)), y_test.reshape(-1, 1)), axis = 1))[:, 2]

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [141]:
#Calculate Performance Metrics
mae = mean_absolute_error(actual_review, predicted_review)
mse = mean_squared_error(actual_review, predicted_review)
r2 = r2_score(actual_review, predicted_review)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

MSE: 595.3192512063712
MAE: 18.575428377097584
R2: -0.002433855535449947
