In [None]:
# Mount Google Drive so the dataset and output paths under /content/drive resolve.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pickle
import re

import joblib
import numpy as np
import pandas as pd
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Embedding, GlobalMaxPooling1D, Input
from keras.models import Model, Sequential
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Load the two raw CSV files from Drive and stack them into one frame.
train_file_path = '/content/drive/My Drive/archive drug/drugsComTrain_raw.csv'
test_file_path = '/content/drive/My Drive/archive drug/drugsComTest_raw.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both files are kept and the index contains duplicates,
# which makes any later label-based lookup (.loc, index alignment) ambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalize a review string for tokenization.

    Lower-cases the text, removes every character that is not an ASCII letter
    or whitespace, collapses whitespace runs to a single space and strips the
    ends.

    Parameters
    ----------
    text : str or any
        The raw review. Non-string values (e.g. NaN/None coming from missing
        CSV cells) are treated as an empty review instead of raising.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    # Missing cells are read by pandas as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
# Fill missing values first: a missing review is a float NaN, and
# preprocess_text calls string methods on its argument, so cleaning must run
# on already-filled data.
data = data.fillna('')

# Apply preprocessing to every review
data['review'] = data['review'].apply(preprocess_text)

In [None]:
# Replace the drug and condition names with integer class ids.
# The fitted encoders are kept so ids can be mapped back to names later.
le_drug = LabelEncoder().fit(data['drugName'])
data['drugName'] = le_drug.transform(data['drugName'])

le_condition = LabelEncoder().fit(data['condition'])
data['condition'] = le_condition.transform(data['condition'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X, y = data['review'], data[['drugName', 'condition']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from scipy.sparse import hstack

# Bag-of-words features: TF-IDF weights and raw counts, each capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
count_vectorizer = CountVectorizer(max_features=5000)

# Vocabularies are learned from the training reviews only ...
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
cv_train = count_vectorizer.fit_transform(X_train)

# ... and then applied unchanged to the held-out reviews.
tfidf_test = tfidf_vectorizer.transform(X_test)
cv_test = count_vectorizer.transform(X_test)

# Place the two sparse matrices side by side (columns: TF-IDF first, then counts).
X_train_combined = hstack((tfidf_train, cv_train))
X_test_combined = hstack((tfidf_test, cv_test))

# Integer-encode the reviews with a tokenizer fitted on the training text only,
# keeping the 10000 most frequent words.
max_length = 500
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad/truncate every sequence to max_length so they stack into one array.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_length) for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Create, compile and train the CNN model.
#
# The targets are two independent integer class ids per review (drugName and
# condition, both produced by LabelEncoder). A single softmax layer with one
# unit per *column* cannot model that, and categorical_crossentropy expects
# one-hot targets, not class ids. The network therefore shares one
# convolutional trunk and ends in two softmax heads, one per label, each
# trained with sparse_categorical_crossentropy (which takes integer ids).
n_drug_classes = len(le_drug.classes_)
n_condition_classes = len(le_condition.classes_)

# Shared trunk: embedding -> two conv/pool/dropout stages -> dense.
# input_dim matches the tokenizer's num_words=10000 vocabulary cap.
inputs = Input(shape=(max_length,))
x = Embedding(input_dim=10000, output_dim=128)(inputs)
x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.5)(x)
x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.5)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)

# One classification head per label column.
drug_output = Dense(n_drug_classes, activation='softmax', name='drug_output')(x)
condition_output = Dense(n_condition_classes, activation='softmax', name='condition_output')(x)

model = Model(inputs=inputs, outputs=[drug_output, condition_output])

# Compile the model: one loss and one accuracy metric per head, listed in the
# same order as the model outputs.
model.compile(
    optimizer='adam',
    loss=['sparse_categorical_crossentropy', 'sparse_categorical_crossentropy'],
    metrics=[['accuracy'], ['accuracy']],
)

# Split the two-column label frames into one integer array per head,
# again in output order (drug first, condition second).
y_train_targets = [y_train['drugName'].to_numpy(), y_train['condition'].to_numpy()]
y_test_targets = [y_test['drugName'].to_numpy(), y_test['condition'].to_numpy()]

# Train the model
model.fit(
    X_train_padded,
    y_train_targets,
    epochs=1,
    batch_size=256,
    validation_data=(X_test_padded, y_test_targets),
)

[1m673/673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m693s[0m 1s/step - accuracy: 0.8998 - loss: 58858928078848.0000 - val_accuracy: 0.9055 - val_loss: 1698954173480960.0000


<keras.src.callbacks.history.History at 0x7f8d998dae30>

In [None]:
# Function to get reviews and conditions by drug name
def get_reviews_and_conditions(drug_name):
    """Look up a drug by name and return sample conditions and reviews.

    Returns a pair ``(conditions, reviews)``: the names of up to five distinct
    conditions found on that drug's rows, and up to five review texts from
    those rows. If a label encoder raises ``ValueError`` (e.g. the name was
    never seen), the pair is an error message string and an empty list.
    """
    try:
        drug_code = le_drug.transform([drug_name])[0]
        matching_rows = data.loc[data['drugName'] == drug_code]
        sample_reviews = matching_rows['review'].values[:5]
        condition_codes = matching_rows['condition'].unique()[:5]
        # Map the integer ids back to the original condition names.
        return le_condition.inverse_transform(condition_codes), sample_reviews
    except ValueError:
        return "Drug name not found in the dataset", []

# Function to get drugs and reviews by condition
def get_drugs_and_reviews(condition):
    """Look up a condition by name and return sample drugs and reviews.

    Returns a pair ``(drugs, reviews)``: the names of up to five distinct
    drugs found on that condition's rows, and up to five review texts from
    those rows. If a label encoder raises ``ValueError`` (e.g. the name was
    never seen), the pair is an error message string and an empty list.
    """
    try:
        condition_code = le_condition.transform([condition])[0]
        matching_rows = data.loc[data['condition'] == condition_code]
        sample_reviews = matching_rows['review'].values[:5]
        drug_codes = matching_rows['drugName'].unique()[:5]
        # Map the integer ids back to the original drug names.
        return le_drug.inverse_transform(drug_codes), sample_reviews
    except ValueError:
        return "Condition not found in the dataset", []


In [None]:


# Ask for a condition name and show the drugs and sample reviews recorded for it.
user_input_condition = input("Enter condition: ")
drugs, reviews = get_drugs_and_reviews(user_input_condition)
print(
    f"Drugs associated with {user_input_condition}: {drugs}",
    f"Reviews for {user_input_condition}: {reviews}",
    sep="\n",
)

Enter condition: Depression
Drugs associated with Depression: ['L-methylfolate' 'Sertraline' 'Venlafaxine' 'Effexor XR' 'Wellbutrin']
Reviews for Depression: ['i have taken antidepressants for years with some improvement but mostly moderate to severe side affects which makes me go off them i only take cymbalta now mostly for pain when i began deplin i noticed a major improvement overnight more energy better disposition and no sinking to the low lows of major depression i have been taking it for about months now and feel like a normal person for the first time ever best thing no side effects'
 'week on zoloft for anxiety and mood swings i take mg in the mornings with my breakfast nausea on day one but that subsided as the week went on i get the jitters about hrs after taking it followed by yawning i feel much better though and less angrystressed'
 'my gp started me on venlafaxine yesterday to help with depression and the changea hour after taking them i was feeling very sick couldnt sto

In [None]:
# Persist the trained model together with the tokenizer and label encoders.
output_dir = '/content/drive/My Drive/drug'
# The folder is not created anywhere else in the notebook; make sure it exists
# so the saves below do not fail on a fresh Drive.
os.makedirs(output_dir, exist_ok=True)

# Save the model
model.save(os.path.join(output_dir, 'condition_exploration.h5'))

# Save the tokenizer and the label encoders (written with joblib, so they
# should be read back with joblib.load).
for filename, artifact in (
    ('tokenizer_condition.pkl', tokenizer),
    ('le_dr.pkl', le_drug),
    ('le_co.pkl', le_condition),
):
    with open(os.path.join(output_dir, filename), 'wb') as file:
        joblib.dump(artifact, file)


