In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# Step 1: Data Preprocessing
data = pd.read_csv("data.csv")  # Replace "your_data.csv" with the actual filename
target_columns = ["Disease", "Prescription"]

data

Unnamed: 0,Location,Gender,Age,BMI,Allergy,Symptoms,Duration of Symptoms,Current Disease Info,Occupation,Smoker,BP Range,Temp Range,Disease,Prescription
0,Thrissur,Male,35,26,,"Fever, Cough, Fatigue",5,,Engineer,No,120/80,37.5°C,Common Cold,"Rest, Plenty of Fluids"
1,Kozhikode,Female,42,30,Pollen,"Sneezing, Itchy Eyes",3,,Teacher,No,130/90,36.9°C,Allergic Rhinitis,Antihistamines
2,Alappuzha,Male,28,23,,"Headache, Sore Throat",2,,Accountant,No,110/70,37.2°C,Tonsillitis,"Analgesics, Antibiotics"
3,Ernakulam,Female,50,29,,"Fatigue, Joint Pain, Muscle Aches",7,,Nurse,No,125/80,37.8°C,Influenza,"Bed Rest, Antiviral Medication"
4,Malappuram,Male,45,31,,"Chest Pain, Shortness of Breath",1,,Doctor,No,135/85,37.4°C,Coronary Artery Disease,"Aspirin, Statins"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Kottayam,Male,38,26,,"Fever, Sore Throat, Cough",5,,Teacher,No,130/90,37.6°C,Pharyngitis,"Analgesics, Antibiotics"
83,Kannur,Female,58,29,Dust,"Sneezing, Itchy Eyes",3,,Nurse,No,120/80,37.4°C,Allergic Rhinitis,Antihistamines
84,Pathanamthitta,Male,35,27,,"Fatigue, Shortness of Breath",7,,Pharmacist,No,115/75,37.0°C,Asthma,"Bronchodilators, Inhalers"
85,Kasaragod,Female,50,28,,"Cough, Sore Throat, Runny Nose",5,,Accountant,No,125/80,36.8°C,Upper Respiratory Infection,"Rest, Fluids, Cough Syrup"


In [5]:
feature_columns = [col for col in data.columns if col not in target_columns]

# Handle missing values if any
data.fillna(0, inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
for col in feature_columns:
    if data[col].dtype == "object":
        data[col] = label_encoder.fit_transform(data[col])

feature_columns


['Location',
 'Gender',
 'Age',
 'BMI',
 'Allergy',
 'Symptoms',
 'Duration of Symptoms',
 'Current Disease Info',
 'Occupation',
 'Smoker',
 'BP Range',
 'Temp Range']

In [7]:
# Scale numerical features
scaler = StandardScaler()
data[feature_columns] = scaler.fit_transform(data[feature_columns])

# Split data into input features and target variables
X = data[feature_columns]
y = data[target_columns]



X
y

Unnamed: 0,Disease,Prescription
0,Common Cold,"Rest, Plenty of Fluids"
1,Allergic Rhinitis,Antihistamines
2,Tonsillitis,"Analgesics, Antibiotics"
3,Influenza,"Bed Rest, Antiviral Medication"
4,Coronary Artery Disease,"Aspirin, Statins"
...,...,...
82,Pharyngitis,"Analgesics, Antibiotics"
83,Allergic Rhinitis,Antihistamines
84,Asthma,"Bronchodilators, Inhalers"
85,Upper Respiratory Infection,"Rest, Fluids, Cough Syrup"


In [8]:
# Step 2: Model Architecture
model = keras.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(len(feature_columns),)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(len(target_columns), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [12]:
X_test
y_train




# Step 3: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Convert target variable to integer values
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert integer labels to one-hot encoded vectors
num_classes = len(label_encoder.classes_)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=num_classes)

# Step 3: Model Training
model.fit(X_train, y_train_one_hot, epochs=10, batch_size=32, validation_data=(X_test, y_test_one_hot))



ValueError: y should be a 1d array, got an array of shape (78, 2) instead.

In [None]:

# Step 4: Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


# Step 5: Model Fine-tuning
# Adjust hyperparameters, modify architecture, or introduce regularization techniques


# Step 6: Model Deployment
model.save("dnn_model.h5")  # Save the trained model for future use

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Load and preprocess the data
data = pd.read_csv('your_dataset.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == np.object:
        X[column] = encoder.fit_transform(X[column])

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Step 2: Define the model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_encoded.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 3: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Step 4: Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


  if X[column].dtype == np.object:


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == object:
        X[column] = encoder.fit_transform(X[column])

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Step 2: Define the model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_encoded.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 3: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Step 4: Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = encoder.fit_transform(X[column])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 7.235806941986084
Test Accuracy: 0.2222222238779068


In [None]:
model.save('model.h5')


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == object:
        X[column] = encoder.fit_transform(X[column])

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2, 0.2, 0.2, 0.15, 0.10, 0.40, 0.15, 0.15, 0.05, 0.15, 0.15, 0.15]

# Normalize priority percentages
priority_weights = np.array(priority_percentages) / np.sum(priority_percentages)

# Multiply features with priority weights
weighted_X = X * priority_weights

# Step 2: Define the model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=weighted_X.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_encoded.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 3: Model Training
X_train, X_test, y_train, y_test = train_test_split(weighted_X, y_encoded, test_size=0.1, random_state=42)

model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Step 4: Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = encoder.fit_transform(X[column])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 5.8635640144348145
Test Accuracy: 0.03703703731298447


In [44]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == object:
        X[column] = encoder.fit_transform(X[column])

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2, 0.2, 0.2, 0.15, 0.10, 0.40, 0.15, 0.15, 0.05, 0.15, 0.15, 0.15]

# Normalize priority percentages
priority_weights = np.array(priority_percentages) / np.sum(priority_percentages)

# Multiply features with priority weights
weighted_X = X * priority_weights

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(weighted_X, y_encoded, test_size=0.1, random_state=42)
model.fit(X_train, y_train)

# Step 3: Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", accuracy)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new).toarray()
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = encoder.fit_transform(X[column])


Test Accuracy: 0.9629629629629629
Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 21
BMI: 25
Allergy: None
Symptoms: cough,fever
Duration of Symptoms: 4
Current Disease Info: None
Occupation: Engineer
Smoker (yes/no): no
BP Range: 120/80
Temp Range: 37.5°C


ValueError: y should be a 1d array, got an array of shape (1, 12) instead.

In [43]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X).toarray()

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(weighted_X, y_encoded, test_size=0.1, random_state=42)
model.fit(X_train, y_train)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Test the model on new data
new_data = pd.read_csv('new_data.csv')  # Load your new data from a CSV file
X_new = new_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]

# Preprocess the new data
X_new_encoded = encoder.transform(X_new).toarray()
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


ValueError: operands could not be broadcast together with shapes (267,147) (12,) 

In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X).toarray()

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new).toarray()
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


ValueError: operands could not be broadcast together with shapes (267,147) (12,) 

In [46]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data['Disease']

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X).toarray()

# One-hot encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define priority percentages for each column
priority_percentages = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(weighted_X, y_encoded, test_size=0.1, random_state=42)
model.fit(X_train, y_train)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Test the model on new data
new_data = pd.read_csv('new_data.csv')  # Load your new data from a CSV file
X_new = new_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]

# Preprocess the new data
X_new_encoded = encoder.transform(X_new).toarray()
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Convert predictions back to disease labels
predicted_diseases = label_encoder.inverse_transform(predictions)

# Print the predictions
print(predicted_diseases)


ValueError: operands could not be broadcast together with shapes (267,147) (12,) 

In [47]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
data.fillna(value='NA', inplace=True)

X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X).toarray()

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
smoker = 1 if smoker.lower() == "yes" else 0
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new).toarray()
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded.reshape(1, -1) * priority_weights_new.reshape(1, -1)

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


ValueError: operands could not be broadcast together with shapes (267,147) (12,) 

In [48]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
data.fillna(value='NA', inplace=True)

X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X).toarray()

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
smoker = 1 if smoker.lower() == "yes" else 0
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new).toarray()
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded.reshape(1, -1) * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 21
BMI: 25
Allergy: None
Symptoms: mild fever,headache
Duration of Symptoms: 4
Current Disease Info: None
Occupation: Engineer
Smoker (yes/no): no
BP Range: 120/80
Temp Range: 37.2°C


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [49]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
data.fillna(value='NA', inplace=True)

X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)  # Set sparse=False and dtype=int
X_encoded = encoder.fit_transform(X)

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
smoker = 1 if smoker.lower() == "yes" else 0
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new)
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 21
BMI: 25
Allergy: None
Symptoms: mild fever,cold
Duration of Symptoms: 4
Current Disease Info: None
Occupation: Engineer
Smoker (yes/no): no
BP Range: 120/80
Temp Range: 37.0°C


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [50]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
data.fillna(value='NA', inplace=True)

X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)  # Set sparse=False and dtype=int
X_encoded = encoder.fit_transform(X)

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
smoker = 1 if smoker.lower() == "yes" else 0
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

# Replace NaN values with 'NA'
input_data.fillna(value='NA', inplace=True)

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new)
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 21
BMI: 25
Allergy: None
Symptoms: mild cold,fever
Duration of Symptoms: 4
Current Disease Info: None
Occupation: Engineer
Smoker (yes/no): no
BP Range: 37.0°C
Temp Range: 37.0°C


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [51]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
data.fillna(value='NA', inplace=True)

X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)  # Set sparse=False and dtype=int
X_encoded = encoder.fit_transform(X)

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
smoker = 1 if smoker.lower() == "yes" else 0
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

# Replace NaN values with 'NA'
input_data.fillna(value='NA', inplace=True)

X_new = input_data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new)

# Create priority weights for new data
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)

# Apply priority weights to new data
weighted_X_new = X_new_encoded * priority_weights_new.reshape(1, -1)

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 35
BMI: 23
Allergy: None
Symptoms: mild fever,cold
Duration of Symptoms: 4
Current Disease Info: None
Occupation: Engineer
Smoker (yes/no): no
BP Range: 120/80
Temp Range: 36.9°C


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [52]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

X = data_filled[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data_filled[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)  # Set sparse=False and dtype=int
X_encoded = encoder.fit_transform(X)

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

# Handle missing values in new data
input_data_filled = pd.DataFrame(imputer.transform(input_data), columns=input_data.columns)

X_new = input_data_filled[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new)

# Create priority weights for new data
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)

# Apply priority weights to new data
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 21
BMI: 25
Allergy: None
Symptoms: mild cold,fever
Duration of Symptoms: 4
Current Disease Info: None
Occupation: Engineer
Smoker (yes/no): no
BP Range: 120/80
Temp Range: 37.2°C


Feature names seen at fit time, yet now missing:
- Disease
- Prescription



ValueError: X has 12 features, but SimpleImputer is expecting 14 features as input.

In [53]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

X = data_filled[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data_filled[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)  # Set sparse=False and dtype=int
X_encoded = encoder.fit_transform(X)

# One-hot encoding the target variable
label_encoder = LabelEncoder()
y_encoded = y.apply(label_encoder.fit_transform)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Prompt user to input new data
print("Please provide the following information:")
location = input("Location: ")
gender = input("Gender: ")
age = float(input("Age: "))
bmi = float(input("BMI: "))
allergy = input("Allergy: ")
symptoms = input("Symptoms: ")
duration = float(input("Duration of Symptoms: "))
disease_info = input("Current Disease Info: ")
occupation = input("Occupation: ")
smoker = input("Smoker (yes/no): ")
bp_range = input("BP Range: ")
temp_range = input("Temp Range: ")

# Preprocess the new data
input_data = pd.DataFrame({
    'Location': [location],
    'Gender': [gender],
    'Age': [age],
    'BMI': [bmi],
    'Allergy': [allergy],
    'Symptoms': [symptoms],
    'Duration of Symptoms': [duration],
    'Current Disease Info': [disease_info],
    'Occupation': [occupation],
    'Smoker': [smoker],
    'BP Range': [bp_range],
    'Temp Range': [temp_range]
})

# Handle missing values in new data
input_data_filled = pd.DataFrame(imputer.transform(input_data), columns=input_data.columns)

X_new = input_data_filled[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new)

# Create priority weights for new data
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)

# Apply priority weights to new data
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


Please provide the following information:
Location: Ernakulam
Gender: Male
Age: 21
BMI: 25
Allergy: None
Symptoms: mild fever,headache
Duration of Symptoms: 4
Current Disease Info: None
Occupation: engineer
Smoker (yes/no): no
BP Range: 120/80
Temp Range: 37.2°C


Feature names seen at fit time, yet now missing:
- Disease
- Prescription



ValueError: X has 12 features, but SimpleImputer is expecting 14 features as input.

In [54]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')

# Handling Missing Values (if any)
imputer = SimpleImputer(strategy='most_frequent')
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

X = data_filled[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data_filled[['Disease', 'Prescription']]

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)  # Set sparse=False and dtype=int
X_encoded = encoder.fit_transform(X)

# One-hot encoding the target variable
label_encoder = LabelEncoder()
y_encoded = y.apply(label_encoder.fit_transform)

# Define priority percentages for each column
priority_percentages = [0.2] * X_encoded.shape[1]

# Multiply features with priority weights
weighted_X = X_encoded * priority_percentages

# Step 2: Define and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(weighted_X, y_encoded)

# Step 3: Save the model
joblib.dump(model, 'random_forest_model.pkl')

# Step 4: Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Step 5: Load new data from file
new_data = pd.read_csv('new_data.csv')

# Handle missing values in new data
new_data_filled = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)

X_new = new_data_filled[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
X_new_encoded = encoder.transform(X_new)

# Create priority weights for new data
priority_weights_new = np.array(priority_percentages) / np.sum(priority_percentages)

# Apply priority weights to new data
weighted_X_new = X_new_encoded * priority_weights_new

# Make predictions on the new data
predictions = loaded_model.predict(weighted_X_new)

# Print the predictions
print(predictions)


[[0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]
 [0 3]]


In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']
X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
text_columns = ['Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation']

for column in text_columns:
    vectorizer = TfidfVectorizer()
    X_train[column] = vectorizer.fit_transform(X_train[column])
    X_test[column] = vectorizer.transform(X_test[column])

# Step 3: Feature Encoding
categorical_columns = ['Location', 'Gender', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']

for column in categorical_columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

# Step 4: Model Training
model_disease = RandomForestClassifier()
model_prescription = RandomForestClassifier()

model_disease.fit(X_train, y_train_disease)
model_prescription.fit(X_train, y_train_prescription)

# Step 5: Model Evaluation
y_pred_disease = model_disease.predict(X_test)
y_pred_prescription = model_prescription.predict(X_test)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use more advanced NLP techniques like transformers.

# Step 7: Iterate and Refine
# Collect more data, update the model, and repeat the above steps to continuously improve the model's performance.


TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']
X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
text_columns = ['Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation']

for column in text_columns:
    vectorizer = TfidfVectorizer()
    X_train[column] = vectorizer.fit_transform(X_train[column])
    X_test[column] = vectorizer.transform(X_test[column])

# Convert sparse matrix to dense matrix
X_train = X_train.values
X_test = X_test.values

# Step 3: Feature Encoding
categorical_columns = ['Location', 'Gender', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']

for column in categorical_columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

# Step 4: Model Training
model_disease = RandomForestClassifier()
model_prescription = RandomForestClassifier()

model_disease.fit(X_train, y_train_disease)
model_prescription.fit(X_train, y_train_prescription)

# Step 5: Model Evaluation
y_pred_disease = model_disease.predict(X_test)
y_pred_prescription = model_prescription.predict(X_test)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use more advanced NLP techniques like transformers.

# Step 7: Iterate and Refine
# Collect more data, update the model, and repeat the above steps to continuously improve the model's performance.


TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']
X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
text_columns = ['Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation']

for column in text_columns:
    vectorizer = TfidfVectorizer()
    X_train[column] = vectorizer.fit_transform(X_train[column]).toarray()
    X_test[column] = vectorizer.transform(X_test[column]).toarray()

# Step 3: Feature Encoding
categorical_columns = ['Location', 'Gender', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']

for column in categorical_columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

# Step 4: Model Training
model_disease = RandomForestClassifier()
model_prescription = RandomForestClassifier()

model_disease.fit(X_train, y_train_disease)
model_prescription.fit(X_train, y_train_prescription)

# Step 5: Model Evaluation
y_pred_disease = model_disease.predict(X_test)
y_pred_prescription = model_prescription.predict(X_test)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use more advanced NLP techniques like transformers.

# Step 7: Iterate and Refine
# Collect more data, update the model, and repeat the above steps to continuously improve the model's performance.


AttributeError: 'int' object has no attribute 'lower'

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Convert relevant columns to strings
text_columns = ['Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation']
data[text_columns] = data[text_columns].astype(str)

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']
X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
for column in text_columns:
    vectorizer = TfidfVectorizer()
    X_train[column] = vectorizer.fit_transform(X_train[column]).toarray()
    X_test[column] = vectorizer.transform(X_test[column]).toarray()

# Step 3: Feature Encoding
categorical_columns = ['Location', 'Gender', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']

for column in categorical_columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

# Step 4: Model Training
model_disease = RandomForestClassifier()
model_prescription = RandomForestClassifier()

model_disease.fit(X_train, y_train_disease)
model_prescription.fit(X_train, y_train_prescription)

# Step 5: Model Evaluation
y_pred_disease = model_disease.predict(X_test)
y_pred_prescription = model_prescription.predict(X_test)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use more advanced NLP techniques like transformers.

# Step 7: Iterate and Refine
# Collect more data, update the model, and repeat the above steps to continuously improve the model's performance.


ValueError: y contains previously unseen labels: '120/75'

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Convert relevant columns to strings
text_columns = ['Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation']
data[text_columns] = data[text_columns].astype(str)

# Encode target variables
le_disease = LabelEncoder()
le_prescription = LabelEncoder()
data['Disease'] = le_disease.fit_transform(data['Disease'])
data['Prescription'] = le_prescription.fit_transform(data['Prescription'])

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']
X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
for column in text_columns:
    vectorizer = TfidfVectorizer()
    X_train[column] = vectorizer.fit_transform(X_train[column]).toarray()
    X_test[column] = vectorizer.transform(X_test[column]).toarray()

# Step 3: Feature Encoding
categorical_columns = ['Location', 'Gender', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']

for column in categorical_columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

# Step 4: Model Training
model_disease = RandomForestClassifier()
model_prescription = RandomForestClassifier()

model_disease.fit(X_train, y_train_disease)
model_prescription.fit(X_train, y_train_prescription)

# Step 5: Model Evaluation
y_pred_disease = model_disease.predict(X_test)
y_pred_prescription = model_prescription.predict(X_test)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use more advanced NLP techniques like transformers.

# Step 7: Iterate and Refine
# Collect more data, update the model, and repeat the above steps to continuously improve the


ValueError: y contains previously unseen labels: '120/75'

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease'].apply(lambda x: [x])  # Convert single disease label to list
y_prescription = data['Prescription'].apply(lambda x: [x])  # Convert single prescription label to list

X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train['Allergy'] + ' ' + X_train['Symptoms'] + ' ' + X_train['Duration of Symptoms'] + ' ' + X_train['Current Disease Info'] + ' ' + X_train['Occupation'])
X_test_tfidf = vectorizer.transform(X_test['Allergy'] + ' ' + X_test['Symptoms'] + ' ' + X_test['Duration of Symptoms'] + ' ' + X_test['Current Disease Info'] + ' ' + X_test['Occupation'])

# Step 3: Multi-label Encoding
mlb_disease = MultiLabelBinarizer()
mlb_prescription = MultiLabelBinarizer()
y_train_disease_encoded = mlb_disease.fit_transform(y_train_disease)
y_train_prescription_encoded = mlb_prescription.fit_transform(y_train_prescription)

# Step 4: Model Training
classifier_disease = OneVsRestClassifier(RandomForestClassifier())
classifier_prescription = OneVsRestClassifier(RandomForestClassifier())

classifier_disease.fit(X_train_tfidf, y_train_disease_encoded)
classifier_prescription.fit(X_train_tfidf, y_train_prescription_encoded)

# Step 5: Model Evaluation
y_pred_disease_encoded = classifier_disease.predict(X_test_tfidf)
y_pred_prescription_encoded = classifier_prescription.predict(X_test_tfidf)

y_pred_disease = mlb_disease.inverse_transform(y_pred_disease_encoded)
y_pred_prescription = mlb_prescription.inverse_transform(y_pred_prescription_encoded)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter


TypeError: can only concatenate str (not "int") to str

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease'].apply(lambda x: [x])  # Convert single disease label to list
y_prescription = data['Prescription'].apply(lambda x: [x])  # Convert single prescription label to list

X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
vectorizer = TfidfVectorizer()
X_train_text = X_train['Allergy'].astype(str) + ' ' + X_train['Symptoms'].astype(str) + ' ' + X_train['Duration of Symptoms'].astype(str) + ' ' + X_train['Current Disease Info'].astype(str) + ' ' + X_train['Occupation'].astype(str)
X_test_text = X_test['Allergy'].astype(str) + ' ' + X_test['Symptoms'].astype(str) + ' ' + X_test['Duration of Symptoms'].astype(str) + ' ' + X_test['Current Disease Info'].astype(str) + ' ' + X_test['Occupation'].astype(str)

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Step 3: Multi-label Encoding
mlb_disease = MultiLabelBinarizer()
mlb_prescription = MultiLabelBinarizer()
y_train_disease_encoded = mlb_disease.fit_transform(y_train_disease)
y_train_prescription_encoded = mlb_prescription.fit_transform(y_train_prescription)

# Step 4: Model Training
classifier_disease = OneVsRestClassifier(RandomForestClassifier())
classifier_prescription = OneVsRestClassifier(RandomForestClassifier())

classifier_disease.fit(X_train_tfidf, y_train_disease_encoded)
classifier_prescription.fit(X_train_tfidf, y_train_prescription_encoded)

# Step 5: Model Evaluation
y_pred_disease_encoded = classifier_disease.predict(X_test_tfidf)
y_pred_prescription_encoded = classifier_prescription.predict(X_test_tfidf)

y_pred_disease = mlb_disease.inverse_transform(y_pred_disease_encoded)
y_pred_prescription = mlb_prescription.inverse_transform(y_pred_prescription_encoded)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use


ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']

X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
vectorizer = TfidfVectorizer()
X_train_text = X_train['Allergy'].astype(str) + ' ' + X_train['Symptoms'].astype(str) + ' ' + X_train['Duration of Symptoms'].astype(str) + ' ' + X_train['Current Disease Info'].astype(str) + ' ' + X_train['Occupation'].astype(str)
X_test_text = X_test['Allergy'].astype(str) + ' ' + X_test['Symptoms'].astype(str) + ' ' + X_test['Duration of Symptoms'].astype(str) + ' ' + X_test['Current Disease Info'].astype(str) + ' ' + X_test['Occupation'].astype(str)

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Step 3: Multi-label Encoding
mlb_disease = MultiLabelBinarizer()
mlb_prescription = MultiLabelBinarizer()
y_train_disease_encoded = mlb_disease.fit_transform(y_train_disease.apply(lambda x: [x]))
y_train_prescription_encoded = mlb_prescription.fit_transform(y_train_prescription.apply(lambda x: [x]))

# Step 4: Model Training
classifier_disease = OneVsRestClassifier(RandomForestClassifier())
classifier_prescription = OneVsRestClassifier(RandomForestClassifier())

classifier_disease.fit(X_train_tfidf, y_train_disease_encoded)
classifier_prescription.fit(X_train_tfidf, y_train_prescription_encoded)

# Step 5: Model Evaluation
y_pred_disease_encoded = classifier_disease.predict(X_test_tfidf)
y_pred_prescription_encoded = classifier_prescription.predict(X_test_tfidf)

y_pred_disease = mlb_disease.inverse_transform(y_pred_disease_encoded)
y_pred_prescription = mlb_prescription.inverse_transform(y_pred_prescription_encoded)

accuracy_disease = accuracy_score(y_test_disease.apply(lambda x: [x]), y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription.apply(lambda x: [x]), y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve


ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']

X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
vectorizer = TfidfVectorizer()
X_train_text = X_train['Allergy'].astype(str) + ' ' + X_train['Symptoms'].astype(str) + ' ' + X_train['Duration of Symptoms'].astype(str) + ' ' + X_train['Current Disease Info'].astype(str) + ' ' + X_train['Occupation'].astype(str)
X_test_text = X_test['Allergy'].astype(str) + ' ' + X_test['Symptoms'].astype(str) + ' ' + X_test['Duration of Symptoms'].astype(str) + ' ' + X_test['Current Disease Info'].astype(str) + ' ' + X_test['Occupation'].astype(str)

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Step 3: Multi-label Encoding
mlb_disease = MultiLabelBinarizer()
mlb_prescription = MultiLabelBinarizer()
y_train_disease_encoded = mlb_disease.fit_transform(y_train_disease.apply(lambda x: [x]))
y_train_prescription_encoded = mlb_prescription.fit_transform(y_train_prescription.apply(lambda x: [x]))

# Step 4: Model Training
classifier_disease = OneVsRestClassifier(RandomForestClassifier())
classifier_prescription = OneVsRestClassifier(RandomForestClassifier())

classifier_disease.fit(X_train_tfidf, y_train_disease_encoded)
classifier_prescription.fit(X_train_tfidf, y_train_prescription_encoded)

# Step 5: Model Evaluation
y_pred_disease_encoded = classifier_disease.predict(X_test_tfidf)
y_pred_prescription_encoded = classifier_prescription.predict(X_test_tfidf)

y_pred_disease = mlb_disease.inverse_transform(y_pred_disease_encoded)
y_pred_prescription = mlb_prescription.inverse_transform(y_pred_prescription_encoded)

accuracy_disease = accuracy_score(y_test_disease, y_pred_disease)
accuracy_prescription = accuracy_score(y_test_prescription, y_pred_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (54,) + inhomogeneous part.

In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Step 1: Data Preprocessing
data = pd.read_csv('data.csv')

# Handle missing values, if any
data = data.dropna()

# Split the dataset into training and testing sets
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms',
          'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y_disease = data['Disease']
y_prescription = data['Prescription']

X_train, X_test, y_train_disease, y_test_disease, y_train_prescription, y_test_prescription = train_test_split(
    X, y_disease, y_prescription, test_size=0.2, random_state=42)

# Step 2: Text Data Processing
vectorizer = TfidfVectorizer()
X_train_text = X_train['Allergy'].astype(str) + ' ' + X_train['Symptoms'].astype(str) + ' ' + X_train['Duration of Symptoms'].astype(str) + ' ' + X_train['Current Disease Info'].astype(str) + ' ' + X_train['Occupation'].astype(str)
X_test_text = X_test['Allergy'].astype(str) + ' ' + X_test['Symptoms'].astype(str) + ' ' + X_test['Duration of Symptoms'].astype(str) + ' ' + X_test['Current Disease Info'].astype(str) + ' ' + X_test['Occupation'].astype(str)

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Step 3: Multi-label Encoding
mlb_disease = MultiLabelBinarizer()
mlb_prescription = MultiLabelBinarizer()
y_train_disease_encoded = mlb_disease.fit_transform(y_train_disease.apply(lambda x: [x]))
y_train_prescription_encoded = mlb_prescription.fit_transform(y_train_prescription.apply(lambda x: [x]))

# Step 4: Model Training
classifier_disease = OneVsRestClassifier(RandomForestClassifier())
classifier_prescription = OneVsRestClassifier(RandomForestClassifier())

classifier_disease.fit(X_train_tfidf, y_train_disease_encoded)
classifier_prescription.fit(X_train_tfidf, y_train_prescription_encoded)

# Step 5: Model Evaluation
y_pred_disease_encoded = classifier_disease.predict(X_test_tfidf)
y_pred_prescription_encoded = classifier_prescription.predict(X_test_tfidf)

# Convert binary arrays to sequences of labels
y_pred_disease = mlb_disease.inverse_transform(y_pred_disease_encoded)
y_pred_prescription = mlb_prescription.inverse_transform(y_pred_prescription_encoded)

# Calculate accuracy manually
accuracy_disease = sum(set(pred) == set(true) for pred, true in zip(y_pred_disease, y_test_disease)) / len(y_test_disease)
accuracy_prescription = sum(set(pred) == set(true) for pred, true in zip(y_pred_prescription, y_test_prescription)) / len(y_test_prescription)

print(f"Accuracy - Disease: {accuracy_disease}")
print(f"Accuracy - Prescription: {accuracy_prescription}")

# Step 6: Model Improvement
# To improve the model, you can perform hyperparameter tuning, feature selection, or use


Accuracy - Disease: 0.0
Accuracy - Prescription: 0.0


In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Load and preprocess the data
data = pd.read_csv('data.csv')
X = data[['Location', 'Gender', 'Age', 'BMI', 'Allergy', 'Symptoms', 'Duration of Symptoms', 'Current Disease Info', 'Occupation', 'Smoker', 'BP Range', 'Temp Range']]
y = data[['Disease', 'Prescription']]

# Encoding categorical features
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == object:
        X[column] = encoder.fit_transform(X[column])

# One-hot encoding the target variable
y_encoded = pd.get_dummies(y)

# Step 2: Define the model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_encoded.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 3: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Step 4: Model Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Step 5: Save the model
model.save('model.h5')
print("Model saved successfully.")

# Step 6: Load the saved model
from tensorflow.keras.models import load_model

loaded_model = load_model('model.h5')
print("Model loaded successfully.")

# Step 7: Perform predictions using the loaded model
# Assuming you have new data for prediction stored in a DataFrame called 'new_data'
# Preprocess the new data similar to how you preprocessed the training data

# Encoding categorical features
for column in new_data.columns:
    if new_data[column].dtype == object:
        new_data[column] = encoder.transform(new_data[column])

# Make predictions using the loaded model
predictions = loaded_model.predict(new_data)

# Convert the predicted probabilities back to categorical labels
predicted_labels = y_encoded.columns[np.argmax(predictions, axis=1)]
print("Predicted labels:", predicted_labels)


Epoch 1/10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = encoder.fit_transform(X[column])


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 8.95175552368164
Test Accuracy: 0.2222222238779068
Model saved successfully.
Model loaded successfully.


ValueError: y contains previously unseen labels: 'Kozhikode'