In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
df = pd.read_csv('Pakistan Medicines Dataset.csv')
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')
df

Unnamed: 0,Drug Name,Manufacturer,Strength,Form,Indication,Side Effects,Available In,Age Restriction,Prescription Required,Price
0,Panadol,GSK,500mg,Tablet,"Fever, mild pain",Liver damage (high dose),All cities,,No,35.0
1,Arinac Forte,Abott,400mg + 60mg,Tablet,"Cold, flu, fever, nasal congestion","Nausea, dizziness, dry mouth etc",All cities,12+,Yes,
2,Arinac,Abbott,200mg + 30mg,,"Cold, flu, nasal congestion","Drowsiness, high BP",All cities,12+,Yes,65.0
3,OMC-D,Sami Pharma,20mg,Tablet,"Acidity, gastric issues","Headache, diarrhea",All cities,12+,Yes,100.0
4,Paracetamol,Various,500mg,Tablet,"Fever, pain",Liver issues (overdose),All cities,,No,30.0
...,...,...,...,...,...,...,...,...,...,...
92,Voveran Emulgel,Novartis,1.16% Diclofenac Diethylamine,Gel,Joint/muscle inflammation,Skin irritation,All cities,12+,No,120.0
93,Terbinafine Gel,Sami Pharma,1% Terbinafine,Gel,Skin fungal infection,"Rash, dryness",All cities,12+,Yes,110.0
94,Aczone Gel,Ali Gohar Pharma,5% Dapsone,Gel,Acne vulgaris,"Itching, burning",All cities,12+,Yes,190.0
95,Aloe Vera Gel,Pure Leaf,,Gel,"Skin moisturizing, sunburn",No major side effects,All cities,All ages,No,60.0


In [2]:
df.isnull().sum().sort_values(ascending=False)

Form                     21
Price                    21
Age Restriction           3
Indication                2
Side Effects              1
Drug Name                 0
Manufacturer              0
Strength                  0
Available In              0
Prescription Required     0
dtype: int64

In [3]:
knn=KNNImputer()
df['Price']=knn.fit_transform(df[['Price']])
df

Unnamed: 0,Drug Name,Manufacturer,Strength,Form,Indication,Side Effects,Available In,Age Restriction,Prescription Required,Price
0,Panadol,GSK,500mg,Tablet,"Fever, mild pain",Liver damage (high dose),All cities,,No,35.000000
1,Arinac Forte,Abott,400mg + 60mg,Tablet,"Cold, flu, fever, nasal congestion","Nausea, dizziness, dry mouth etc",All cities,12+,Yes,102.736842
2,Arinac,Abbott,200mg + 30mg,,"Cold, flu, nasal congestion","Drowsiness, high BP",All cities,12+,Yes,65.000000
3,OMC-D,Sami Pharma,20mg,Tablet,"Acidity, gastric issues","Headache, diarrhea",All cities,12+,Yes,100.000000
4,Paracetamol,Various,500mg,Tablet,"Fever, pain",Liver issues (overdose),All cities,,No,30.000000
...,...,...,...,...,...,...,...,...,...,...
92,Voveran Emulgel,Novartis,1.16% Diclofenac Diethylamine,Gel,Joint/muscle inflammation,Skin irritation,All cities,12+,No,120.000000
93,Terbinafine Gel,Sami Pharma,1% Terbinafine,Gel,Skin fungal infection,"Rash, dryness",All cities,12+,Yes,110.000000
94,Aczone Gel,Ali Gohar Pharma,5% Dapsone,Gel,Acne vulgaris,"Itching, burning",All cities,12+,Yes,190.000000
95,Aloe Vera Gel,Pure Leaf,,Gel,"Skin moisturizing, sunburn",No major side effects,All cities,All ages,No,60.000000


In [4]:
df.select_dtypes(include=['float','int']).columns

Index(['Price'], dtype='object')

In [5]:
df.select_dtypes(include=['object','category']).columns

Index(['Drug Name', 'Manufacturer', 'Strength', 'Form', 'Indication',
       'Side Effects', 'Available In', 'Age Restriction',
       'Prescription Required'],
      dtype='object')

In [6]:
fig=px.box(df,x='Price')
fig.show()

In [7]:
fig2=px.violin(df,x='Price')
fig2.show()

In [8]:
cat_cols=['Drug Name', 'Manufacturer', 'Strength', 'Form', 'Indication','Side Effects', 'Available In', 'Age Restriction','Prescription Required']

In [9]:
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [10]:
# Combine into a column transformer
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_cols)
])

In [11]:
X=df.drop('Price',axis=1)
y=df['Price']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


In [12]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [13]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train_processed, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score,mean_absolute_percentage_error,mean_squared_error

y_pred = model.predict(X_test_processed)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", root_mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))




MAE: 84.62557894736847
RMSE: 111.58680625215864
R2 Score: 0.15712064853288188
MSE: 12451.61532955679
MAPE: 1.4108917239111194
