In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

In [2]:
# Load the health-recommendation records from the CSV file into a pandas DataFrame
rec_data = pd.read_csv(
    'healthrec.csv',
    encoding='unicode_escape',
)
rec_data

Unnamed: 0,Body weight in range or not,Increased Total Fat (%kcal)?,Reduce Saturated Fatty Acids (%kcal),Increase Polyunsaturated Fatty Acids (%kcal),Limit Cholesterol (mg/day),Limit Simple Sugars,Increase Fiber,Restrict Sodium Chloride(g),Moderate Alcohol Intake,Other Recommendations,Disease,Age
0,No,Yes,Yes,Yes,250-350,Yes,Yes,8,Yes,Reduce additives processed foods,No Disease,69
1,No,No,No,No,No,Yes,NC,12,Yes,Consider high-risk groups,No Disease,69
2,No,Yes,Yes,NS,Yes,Yes,NS,Yes,Yes,"More fish, poultry legumes; less red meat",No Disease,69
3,No,Yes,No,No,No,Yes,No,3 to 8,For weight reduction only,Variety in diet; consider high-risk groups,No Disease,78
4,No,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Variety in diet; consider high-risk groups,No Disease,68
5,No,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Fluoridation of water; adolescent girls and wo...,No Disease,84
6,Yes,Yes,Yes,Yes,<300,Yes,"Directly through vegetables, fruits, and cereals",?6 g/day with a goal of 4.5 g/day,"If you drink, limit to < 1.0 oz alcohol or <2 ...","Avoid dietary supplements, especially in exces...",No Disease,68
7,No,Yes,Yes,Yes,<250,No,No,5 g/day,No,NS,Heart Disease,67
8,No,Yes,Yes,Yes,<300,No,No,?3 g/day,1-2 oz,Protein to make up remainder of calories; wide...,Heart Disease,75
9,No,Yes,Yes,No,Increase,No,Decrease,No,No,"10-12% of calories from protein, of which 30-5...",Heart Disease,74


In [3]:
# Columns whose values are joined into one text profile per record
selected_features = [
    'Body weight in range or not',
    'Age',
    'Disease',
    'Increased Total Fat (%kcal)?',
]
print(selected_features)

['Body weight in range or not', 'Age', 'Disease', 'Increased Total Fat (%kcal)?']


In [4]:
# Replace missing values in the selected columns with an empty string
rec_data[selected_features] = rec_data[selected_features].fillna('')

In [5]:
# Cast the selected columns to text and join them, space-separated, into one string per row
rec_data['combined_features'] = (
    rec_data[selected_features]
    .astype(str)
    .apply(lambda row: ' '.join(row), axis=1)
)


In [6]:
# Show the combined text profile built for every record
print(rec_data['combined_features'])

0                    No 69 No Disease Yes
1                     No 69 No Disease No
2                    No 69 No Disease Yes
3                    No 78 No Disease Yes
4                    No 68 No Disease Yes
5                    No 84 No Disease Yes
6                   Yes 68 No Disease Yes
7                 No 67 Heart Disease Yes
8                 No 75 Heart Disease Yes
9                 No 74 Heart Disease Yes
10                No 74 Heart Disease Yes
11                No 62 Heart Disease Yes
12                 No 74 Heart Disease 35
13       No 74 Heart Disease Reduce to 35
14    No 68 Heart Disease Reduce to 30-35
15                 No 63 Heart Disease 30
16    No 61 Heart Disease Reduce to 20-30
17              No 75 Heart Disease 20-25
18       No 79 Heart Disease Reduce to 35
19                No 82 Heart Disease <30
20                No 84 Heart Disease <30
21                No 87 Heart Disease <30
22               Yes 77 Heart Disease <30
23               Yes 61 Heart Dise

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# TF-IDF vectorizer created with its default settings
vectorizer = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the combined text profiles and encode each record as a TF-IDF vector
feature_vectors = vectorizer.fit_transform(rec_data['combined_features'].values.tolist())


In [10]:
# Print the fitted matrix; the output lists (row, column) positions followed by their weights
print(feature_vectors)

  (0, 32)	0.3460985325880741
  (0, 24)	0.27244367523388385
  (0, 11)	0.6833321234241005
  (0, 29)	0.5822778191397521
  (1, 24)	0.23857984383922723
  (1, 11)	0.5983962415603674
  (1, 29)	0.7648539705068388
  (2, 32)	0.3460985325880741
  (2, 24)	0.27244367523388385
  (2, 11)	0.6833321234241005
  (2, 29)	0.5822778191397521
  (3, 17)	0.7542650149752966
  (3, 32)	0.31123895181057815
  (3, 24)	0.24500272588020053
  (3, 29)	0.5236298944592825
  (4, 10)	0.6833321234241005
  (4, 32)	0.3460985325880741
  (4, 24)	0.27244367523388385
  (4, 29)	0.5822778191397521
  (5, 20)	0.715536382154221
  (5, 32)	0.33115108195238363
  (5, 24)	0.26067726190611784
  (5, 29)	0.5571301570194739
  (6, 10)	0.6500361876395688
  (6, 32)	0.6584691776053742
  :	:
  (25, 28)	0.4214966328005169
  (25, 5)	0.4214966328005169
  (26, 8)	0.6128894065428285
  (26, 23)	0.4076400113830983
  (26, 27)	0.4076400113830983
  (26, 2)	0.3062182529972019
  (26, 31)	0.366465315373747
  (26, 32)	0.252901901428432
  (27, 23)	0.46421821671151

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of record vectors
similarity = cosine_similarity(feature_vectors)

In [12]:
# Display the pairwise similarity matrix
print(similarity)

[[1.         0.91926045 1.         0.47936686 0.53305721 0.51003531
  0.45976761 0.34629328 0.3714534  0.41990693 0.41990693 0.3714534
  0.24007957 0.19072731 0.17568804 0.22652286 0.1652633  0.16817191
  0.17297146 0.22652286 0.24210363 0.22652286 0.17763214 0.18945477
  0.         0.         0.08752898 0.09967752 0.08752898 0.08752898]
 [0.91926045 1.         0.91926045 0.45895312 0.51035707 0.48831555
  0.27366095 0.27403548 0.29394566 0.33228885 0.33228885 0.29394566
  0.28334724 0.2251006  0.20735093 0.26734731 0.19504742 0.19848023
  0.20414476 0.26734731 0.28573608 0.26734731 0.05951254 0.0634735
  0.         0.         0.         0.         0.         0.        ]
 [1.         0.91926045 1.         0.47936686 0.53305721 0.51003531
  0.45976761 0.34629328 0.3714534  0.41990693 0.41990693 0.3714534
  0.24007957 0.19072731 0.17568804 0.22652286 0.1652633  0.16817191
  0.17297146 0.22652286 0.24210363 0.22652286 0.17763214 0.18945477
  0.         0.         0.08752898 0.09967752 0.0

In [13]:
# Show the dimensions of the similarity matrix
print(similarity.shape)

(30, 30)


In [28]:
# Ask the user for their profile; input() returns text, so age is held as a string here
dis_name = input('Enter your disease : ')
bod=input("Is your body weight in range?: ")
age=input("what is your age: ")

Enter your disease : Heart Disease
Is your body weight in range?: No
what is your age: 62


In [29]:
# One [age, body-weight flag, disease] list per row of rec_data
list_of_all_recs = rec_data[['Age','Body weight in range or not','Disease']].values.tolist()
print(list_of_all_recs)

[[69, 'No', 'No Disease'], [69, 'No', 'No Disease'], [69, 'No', 'No Disease'], [78, 'No', 'No Disease'], [68, 'No', 'No Disease'], [84, 'No', 'No Disease'], [68, 'Yes', 'No Disease'], [67, 'No', 'Heart Disease'], [75, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [62, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [68, 'No', 'Heart Disease'], [63, 'No', 'Heart Disease'], [61, 'No', 'Heart Disease'], [75, 'No', 'Heart Disease'], [79, 'No', 'Heart Disease'], [82, 'No', 'Heart Disease'], [84, 'No', 'Heart Disease'], [87, 'No', 'Heart Disease'], [77, 'Yes', 'Heart Disease'], [61, 'Yes', 'Heart Disease'], [65, 'NC', 'Liver Cancer'], [62, 'NC', 'Liver Cancer'], [66, 'Yes', 'Liver Cancer'], [74, 'Yes', 'Liver Cancer'], [73, 'Yes', 'Liver Cancer'], [71, 'Yes', 'Liver Cancer']]


In [31]:
import difflib

# Fuzzy-match each user answer against the values present in the dataset
find_close_match = difflib.get_close_matches(
    dis_name, [disease for _, _, disease in list_of_all_recs]
)
find_close_match_bod = difflib.get_close_matches(
    bod, [weight_flag for _, weight_flag, _ in list_of_all_recs]
)
find_close_match_age = difflib.get_close_matches(
    str(age), [str(rec_age) for rec_age, _, _ in list_of_all_recs]
)

# Keep records whose disease matches and whose body-weight flag or age also matches
records_for_close_matches = [
    record
    for record in list_of_all_recs
    if record[2] in find_close_match
    and (record[1] in find_close_match_bod or str(record[0]) in find_close_match_age)
]

print("Records for close matches:", records_for_close_matches)


Records for close matches: [[67, 'No', 'Heart Disease'], [75, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [62, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [74, 'No', 'Heart Disease'], [68, 'No', 'Heart Disease'], [63, 'No', 'Heart Disease'], [61, 'No', 'Heart Disease'], [75, 'No', 'Heart Disease'], [79, 'No', 'Heart Disease'], [82, 'No', 'Heart Disease'], [84, 'No', 'Heart Disease'], [87, 'No', 'Heart Disease']]


In [35]:
# Choose the candidate whose age is numerically closest to the age the user entered,
# instead of a fixed list position that is only correct for one particular set of answers.
if not records_for_close_matches:
    raise IndexError('No records matched the given disease, body weight and age.')
try:
    close_match = min(
        records_for_close_matches,
        key=lambda record: abs(float(record[0]) - float(age)),
    )
except (TypeError, ValueError):
    # A non-numeric age (typed by the user or stored in the data) cannot be ranked;
    # fall back to the first candidate.
    close_match = records_for_close_matches[0]

In [36]:
# Show the record chosen as the user's closest match
print(close_match)

[62, 'No', 'Heart Disease']


In [37]:
# Index label of the first row whose age, body-weight flag and disease all equal the chosen record
index_of_the_rec = rec_data.loc[
    (rec_data['Age'] == close_match[0])
    & (rec_data['Body weight in range or not'] == close_match[1])
    & (rec_data['Disease'] == close_match[2])
].index[0]
print(index_of_the_rec)

11


In [38]:
# Pair each record's position with its similarity to the matched record
similarity_score = list(enumerate(similarity[index_of_the_rec]))
print(similarity_score)

[(0, 0.37145339739921845), (1, 0.2939456623268836), (2, 0.37145339739921845), (3, 0.3340400353289293), (4, 0.37145339739921845), (5, 0.35541091007757275), (6, 0.3873956700723439), (7, 0.38792585117124134), (8, 0.41611079856269306), (9, 0.47038958025558186), (10, 0.47038958025558186), (11, 1.0), (12, 0.28055649808374566), (13, 0.22288353883046358), (14, 0.20530868762159007), (15, 0.2647141458740305), (16, 0.19312635833578884), (17, 0.1965253536565247), (18, 0.20213409603068133), (19, 0.2647141458740305), (20, 0.28292180119677135), (21, 0.2647141458740305), (22, 0.29356378390351756), (23, 0.3131024489537815), (24, 0.0), (25, 0.3220767173927209), (26, 0.08943587778628158), (27, 0.10184908874648836), (28, 0.08943587778628158), (29, 0.08943587778628158)]


In [39]:
# Order the (position, score) pairs from most to least similar
sorted_similar_recs = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_recs)

[(11, 1.0), (9, 0.47038958025558186), (10, 0.47038958025558186), (8, 0.41611079856269306), (7, 0.38792585117124134), (6, 0.3873956700723439), (0, 0.37145339739921845), (2, 0.37145339739921845), (4, 0.37145339739921845), (5, 0.35541091007757275), (3, 0.3340400353289293), (25, 0.3220767173927209), (23, 0.3131024489537815), (1, 0.2939456623268836), (22, 0.29356378390351756), (20, 0.28292180119677135), (12, 0.28055649808374566), (15, 0.2647141458740305), (19, 0.2647141458740305), (21, 0.2647141458740305), (13, 0.22288353883046358), (14, 0.20530868762159007), (18, 0.20213409603068133), (17, 0.1965253536565247), (16, 0.19312635833578884), (27, 0.10184908874648836), (26, 0.08943587778628158), (28, 0.08943587778628158), (29, 0.08943587778628158), (24, 0.0)]


In [40]:
# Print the recommendations attached to the two most similar records.
# Only the top two entries are looked up, so the remaining records are never scanned.

print('Diet suggested for you : \n')

for i, rec in enumerate(sorted_similar_recs[:2], 1):
  index = rec[0]
  # Label-based lookup of the recommendation text for this record
  rec_from_index = rec_data.loc[index, 'Other Recommendations']
  print(i, '.',rec_from_index)

Diet suggested for you : 

1 . NC
2 . 10-12% of calories from protein, of which 30-50% should be of animal origin


In [58]:

# Get input from user
dis_name = input('Enter your disease: ')
bod = input("Is your body weight in range? (Yes/No): ")
age = input("What is your age: ")

# Candidate records as [age, body-weight flag, disease], one list per row
list_of_all_recs = rec_data[['Age', 'Body weight in range or not', 'Disease']].values.tolist()

# Find close matches for disease, body weight, and age
find_close_match = difflib.get_close_matches(dis_name, [record[2] for record in list_of_all_recs])
find_close_match_bod = difflib.get_close_matches(bod, [record[1] for record in list_of_all_recs])
find_close_match_age = difflib.get_close_matches(str(age), [str(record[0]) for record in list_of_all_recs])

# Keep records whose disease matches and whose body-weight flag or age also matches
records_for_close_matches = [
    record
    for record in list_of_all_recs
    if (record[2] in find_close_match)
    and (record[1] in find_close_match_bod or str(record[0]) in find_close_match_age)
]

# Show the candidates themselves, not just the header
print("Records for close matches:", records_for_close_matches)

if not records_for_close_matches:
    # Nothing resembled the answers given; report it instead of failing on an empty list
    print('No matching records found, so no diet can be suggested.')
else:
    # Get index of the matched record (the first candidate is used)
    first_match = records_for_close_matches[0]
    index_of_the_rec = rec_data[
        (rec_data['Age'] == first_match[0]) &
        (rec_data['Body weight in range or not'] == first_match[1]) &
        (rec_data['Disease'] == first_match[2])
    ].index[0]

    # Calculate similarity score
    feature_vectors = vectorizer.fit_transform(rec_data['combined_features'].values.tolist())
    similarity = cosine_similarity(feature_vectors)
    similarity_score = list(enumerate(similarity[index_of_the_rec]))
    sorted_similar_recs = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    # Print recommendations of the two most similar records
    print('Diet suggested for you:')
    for i, rec in enumerate(sorted_similar_recs[:2], 1):
        index = rec[0]
        rec_from_index = rec_data.loc[index, 'Other Recommendations']
        print(i, '.', rec_from_index)


Enter your disease: no disease
Is your body weight in range? (Yes/No): yes
What is your age: 62
Records for close matches:
Diet suggested for you:
1 . Avoid dietary supplements, especially in excess of RDAs; drink fluoridated water; limit protein intake to moderate levels (less than twice the RDA)
2 . Variety in diet; consider high-risk groups
