In [23]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [24]:
vitamin_diseases = {
    "Vitamin A": ["Night blindness", "Xerophthalmia", "Keratomalacia", "Bitot's spots", "Hyperkeratosis", "Infectious diseases susceptibility", "Impaired immune function", "Dry skin", "Blindness", "Infertility"],
    "Vitamin B1": ["Beriberi", "Wernicke-Korsakoff syndrome", "Confusion", "Muscle weakness", "Peripheral neuropathy", "Cardiovascular problems", "Edema", "Nystagmus", "Ataxia", "Memory loss"],
    "Vitamin B2": ["Ariboflavinosis", "Migraine headaches", "Sore throat", "Red, itchy eyes", "Swollen and cracked lips", "Inflamed tongue", "Cheilosis", "Magenta tongue", "Scrotal dermatitis", "Peripheral neuropathy"],
    "Vitamin B3": ["Pellagra", "Dermatitis", "Diarrhea", "Dementia", "Inflammation of the mouth", "Bright red tongue", "Pigmented rash", "Vomiting", "Hyperpigmentation", "Weakness"],
    "Vitamin B5": ["Paresthesia", "Fatigue", "Irritability", "Sleep disturbances", "Stomach pains", "Vomiting", "Burning feet syndrome", "Upper respiratory infections", "Muscle cramps", "Tingling sensations"],
    "Vitamin B6": ["Anemia", "Dermatitis", "Neuropathy", "Seizures", "Depression", "Confusion", "Weakened immune function", "Cognitive decline", "Microcytic anemia", "Homocysteine buildup"],
    "Vitamin B7": ["Dermatitis", "Enteritis", "Alopecia", "Brittle nails", "Conjunctivitis", "Depression", "Hallucinations", "Insomnia", "Seborrheic dermatitis", "Thinning hair"],
    "Vitamin B9": ["Megaloblastic Anemia", "Neural Tube Defects", "Low birth weight", "Preterm birth", "Fetal growth restriction", "Developmental delay", "Spina bifida", "Anencephaly", "Cleft palate", "Heart defects"],
    "Vitamin B12": ["Pernicious Anemia", "Fatigue", "Shortness of breath", "Pale or jaundiced skin", "Smooth tongue", "Difficulty walking", "Numbness and tingling in hands and feet", "Cognitive changes", "Memory loss", "Depression"],
    "Vitamin C": ["Scurvy", "Fatigue", "Muscle pain", "Swollen, bleeding gums", "Joint pain", "Anemia", "Dry and scaly skin", "Weak immune system", "Bruising easily", "Slow wound healing"],
    "Vitamin D": ["Rickets (in children)", "Osteomalacia (in adults)", "Muscle weakness", "Bone pain", "Soft bones", "Fractures", "Delayed tooth formation", "Impaired growth", "Poor immune function", "Mood disturbances"],
    "Vitamin E": ["Hemolytic Anemia", "Nerve Damage", "Muscle weakness", "Vision problems", "Impaired coordination", "Tremors", "Ataxia", "Peripheral neuropathy", "Retinopathy", "Immune system impairment"],
    "Vitamin K": ["Bleeding disorders", "Easy bruising", "Heavy menstrual bleeding", "Blood in urine or stool", "Nosebleeds", "Hemorrhagic disease of the newborn", "Osteoporosis", "Impaired wound healing", "Hematoma", "Gastrointestinal bleeding"]
}

vitamin_names = {vitamin: vitamin.replace(' ', '-').lower() for vitamin in vitamin_diseases.keys()}

# Create a DataFrame from the dictionary
df = pd.DataFrame([(vitamin_names[vitamin], disease) for vitamin, diseases in vitamin_diseases.items() for disease in diseases], columns=['Vitamin', 'Disease'])

print(df)

       Vitamin                             Disease
0    vitamin-a                     Night blindness
1    vitamin-a                       Xerophthalmia
2    vitamin-a                       Keratomalacia
3    vitamin-a                       Bitot's spots
4    vitamin-a                      Hyperkeratosis
..         ...                                 ...
125  vitamin-k  Hemorrhagic disease of the newborn
126  vitamin-k                        Osteoporosis
127  vitamin-k              Impaired wound healing
128  vitamin-k                            Hematoma
129  vitamin-k           Gastrointestinal bleeding

[130 rows x 2 columns]


In [25]:
df.Vitamin.unique()

array(['vitamin-a', 'vitamin-b1', 'vitamin-b2', 'vitamin-b3',
       'vitamin-b5', 'vitamin-b6', 'vitamin-b7', 'vitamin-b9',
       'vitamin-b12', 'vitamin-c', 'vitamin-d', 'vitamin-e', 'vitamin-k'],
      dtype=object)

In [26]:
age_groups = {
    "Night blindness": "Children",
    "Xerophthalmia": "Children",
    "Keratomalacia": "Children",
    "Bitot's spots": "Children",
    "Hyperkeratosis": "Adults",
    "Infectious diseases susceptibility": "Adults",
    "Impaired immune function": "Adults",
    "Dry skin": "Adults",
    "Blindness": "Adults",
    "Infertility": "Adults",
    "Beriberi": "Adults",
    "Wernicke-Korsakoff syndrome": "Adults",
    "Confusion": "Adults",
    "Muscle weakness": "Adults",
    "Peripheral neuropathy": "Adults",
    "Cardiovascular problems": "Adults",
    "Edema": "Adults",
    "Nystagmus": "Adults",
    "Ataxia": "Adults",
    "Memory loss": "Adults",
    "Ariboflavinosis": "Adults",
    "Migraine headaches": "Adults",
    "Sore throat": "Adults",
    "Red, itchy eyes": "Adults",
    "Swollen and cracked lips": "Adults",
    "Inflamed tongue": "Adults",
    "Cheilosis": "Adults",
    "Magenta tongue": "Adults",
    "Scrotal dermatitis": "Adults",
    "Dermatitis": "Adults",
    "Enteritis": "Adults",
    "Alopecia": "Adults",
    "Brittle nails": "Adults",
    "Conjunctivitis": "Adults",
    "Depression": "Adults",
    "Hallucinations": "Adults",
    "Insomnia": "Adults",
    "Seborrheic dermatitis": "Adults",
    "Thinning hair": "Adults",
    "Pellagra": "Adults",
    "Diarrhea": "Adults",
    "Dementia": "Adults",
    "Inflammation of the mouth": "Adults",
    "Bright red tongue": "Adults",
    "Pigmented rash": "Adults",
    "Vomiting": "Adults",
    "Hyperpigmentation": "Adults",
    "Weakness": "Adults",
    "Paresthesia": "Adults",
    "Fatigue": "Adults",
    "Irritability": "Adults",
    "Sleep disturbances": "Adults",
    "Stomach pains": "Adults",
    "Burning feet syndrome": "Adults",
    "Upper respiratory infections": "Adults",
    "Muscle cramps": "Adults",
    "Tingling sensations": "Adults",
    "Anemia": "Adults",
    "Neuropathy": "Adults",
    "Seizures": "Adults",
    "Depression": "Adults",
    "Confusion": "Adults",
    "Weakened immune function": "Adults",
    "Cognitive decline": "Adults",
    "Microcytic anemia": "Adults",
    "Homocysteine buildup": "Adults",
    "Dermatitis": "Adults",
    "Enteritis": "Adults",
    "Alopecia": "Adults",
    "Brittle nails": "Adults",
    "Conjunctivitis": "Adults",
    "Depression": "Adults",
    "Hallucinations": "Adults",
    "Insomnia": "Adults",
    "Seborrheic dermatitis": "Adults",
    "Thinning hair": "Adults",
    "Megaloblastic Anemia": "Pregnant Women",
    "Neural Tube Defects": "Pregnant Women",
    "Low birth weight": "Infants",
    "Preterm birth": "Infants",
    "Fetal growth restriction": "Infants",
    "Developmental delay": "Children",
    "Spina bifida": "Children",
    "Anencephaly": "Children",
    "Cleft palate": "Children",
    "Heart defects": "Children",
    "Pernicious Anemia": "Elderly",
    "Shortness of breath": "Elderly",
    "Pale or jaundiced skin": "Elderly",
    "Smooth tongue": "Elderly",
    "Difficulty walking": "Elderly",
    "Numbness and tingling in hands and feet": "Elderly",
    "Cognitive changes": "Elderly",
    "Memory loss": "Elderly",
    "Depression": "Elderly",
    "Scurvy": "Adults",
    "Muscle pain": "Adults",
    "Swollen, bleeding gums": "Adults",
    "Joint pain": "Adults",
    "Anemia": "Adults",
    "Muscle pain": "Adults",
    "Swollen, bleeding gums": "Adults",
    "Joint pain": "Adults",
    "Dry and scaly skin": "Adults",
    "Weak immune system": "Adults",
    "Bruising easily": "Adults",
    "Slow wound healing": "Adults",
    "Rickets (in children)": "Children",
    "Osteomalacia (in adults)": "Adults",
    "Muscle weakness": "Adults",
    "Bone pain": "Adults",
    "Soft bones": "Children",
    "Fractures": "Elderly",
    "Delayed tooth formation": "Children",
    "Impaired growth": "Children",
    "Poor immune function": "Adults",
    "Mood disturbances": "Adults",
    "Hemolytic Anemia": "Adults",
    "Nerve Damage": "Adults",
    "Vision problems": "Adults",
    "Impaired coordination": "Adults",
    "Tremors": "Elderly",
    "Ataxia": "Elderly",
    "Peripheral neuropathy": "Adults",
    "Retinopathy": "Adults",
    "Immune system impairment": "Adults",
    "Bleeding disorders": "Adults",
    "Easy bruising": "Adults",
    "Heavy menstrual bleeding": "Women",
    "Blood in urine or stool": "Adults",
    "Nosebleeds": "Adults",
    "Hemorrhagic disease of the newborn": "Newborns",
    "Osteoporosis": "Elderly",
    "Impaired wound healing": "Adults",
    "Hematoma": "Adults",
    "Gastrointestinal bleeding": "Adults"
}

# Create a DataFrame from the age_groups dictionary
age_groups_df = pd.DataFrame(list(age_groups.items()), columns=['Disease', 'Age Group'])
age_groups_df

Unnamed: 0,Disease,Age Group
0,Night blindness,Children
1,Xerophthalmia,Children
2,Keratomalacia,Children
3,Bitot's spots,Children
4,Hyperkeratosis,Adults
...,...,...
110,Hemorrhagic disease of the newborn,Newborns
111,Osteoporosis,Elderly
112,Impaired wound healing,Adults
113,Hematoma,Adults


In [27]:
temp = df.merge(age_groups_df, on = 'Disease')
temp


Unnamed: 0,Vitamin,Disease,Age Group
0,vitamin-a,Night blindness,Children
1,vitamin-a,Xerophthalmia,Children
2,vitamin-a,Keratomalacia,Children
3,vitamin-a,Bitot's spots,Children
4,vitamin-a,Hyperkeratosis,Adults
...,...,...,...
125,vitamin-k,Hemorrhagic disease of the newborn,Newborns
126,vitamin-k,Osteoporosis,Elderly
127,vitamin-k,Impaired wound healing,Adults
128,vitamin-k,Hematoma,Adults


In [28]:
df.Vitamin.unique()

array(['vitamin-a', 'vitamin-b1', 'vitamin-b2', 'vitamin-b3',
       'vitamin-b5', 'vitamin-b6', 'vitamin-b7', 'vitamin-b9',
       'vitamin-b12', 'vitamin-c', 'vitamin-d', 'vitamin-e', 'vitamin-k'],
      dtype=object)

In [29]:
gender_effect = {
    "Vitamin A": "Female",
    "Vitamin B1": "Male",
    "Vitamin B2": "Male",
    "Vitamin B3": "Male",
    "Vitamin B5": "Male",
    "Vitamin B6": "Male",
    "Vitamin B7": "Female",
    "Vitamin B9": "Female",
    "Vitamin B12": "Male",
    "Vitamin C": "Female",
    "Vitamin D": "Male",
    "Vitamin E": "Male",
    "Vitamin K": "Female",
}

vitamin_names = {vitamin: vitamin.replace(' ', '-').lower() for vitamin in gender_effect.keys()}

# Create a DataFrame from the dictionary
gender_effect_df = pd.DataFrame(list(zip(vitamin_names.values(), gender_effect.values())), columns=['Vitamin', 'Gender'])

gender_effect_df

Unnamed: 0,Vitamin,Gender
0,vitamin-a,Female
1,vitamin-b1,Male
2,vitamin-b2,Male
3,vitamin-b3,Male
4,vitamin-b5,Male
5,vitamin-b6,Male
6,vitamin-b7,Female
7,vitamin-b9,Female
8,vitamin-b12,Male
9,vitamin-c,Female


In [44]:
final = temp.merge(gender_effect_df,on = 'Vitamin')
final

Unnamed: 0,Vitamin,Disease,Age Group,Gender
0,vitamin-a,Night blindness,Children,Female
1,vitamin-a,Xerophthalmia,Children,Female
2,vitamin-a,Keratomalacia,Children,Female
3,vitamin-a,Bitot's spots,Children,Female
4,vitamin-a,Hyperkeratosis,Adults,Female
...,...,...,...,...
125,vitamin-k,Hemorrhagic disease of the newborn,Newborns,Female
126,vitamin-k,Osteoporosis,Elderly,Female
127,vitamin-k,Impaired wound healing,Adults,Female
128,vitamin-k,Hematoma,Adults,Female


In [45]:
vitamins_list = gender_effect_df.Vitamin.to_list()
print(vitamins_list)

['vitamin-a', 'vitamin-b1', 'vitamin-b2', 'vitamin-b3', 'vitamin-b5', 'vitamin-b6', 'vitamin-b7', 'vitamin-b9', 'vitamin-b12', 'vitamin-c', 'vitamin-d', 'vitamin-e', 'vitamin-k']


In [46]:
vitamins_links = {
    'vitamin-a': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-a'],
    'vitamin-b1': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-b1'],
    'vitamin-b2': ['https://www.hsph.harvard.edu/nutritionsource/riboflavin-vitamin-b2'],
    'vitamin-b3': ['https://www.hsph.harvard.edu/nutritionsource/niacin-vitamin-b3'],
    'vitamin-b5': ['https://www.hsph.harvard.edu/nutritionsource/pantothenic-acid-vitamin-b5'],
    'vitamin-b6': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-b6'],
    'vitamin-b7': ['https://www.hsph.harvard.edu/nutritionsource/biotin-vitamin-b7'],
    'vitamin-b9': ['https://www.hsph.harvard.edu/nutritionsource/folic-acid'],
    'vitamin-b12': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-b12'],
    'vitamin-c': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-c'],
    'vitamin-d': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-d'],
    'vitamin-e': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-e'],
    'vitamin-k': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-k']
}

# Print the filtered dictionary
vitamins_links

{'vitamin-a': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-a'],
 'vitamin-b1': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-b1'],
 'vitamin-b2': ['https://www.hsph.harvard.edu/nutritionsource/riboflavin-vitamin-b2'],
 'vitamin-b3': ['https://www.hsph.harvard.edu/nutritionsource/niacin-vitamin-b3'],
 'vitamin-b5': ['https://www.hsph.harvard.edu/nutritionsource/pantothenic-acid-vitamin-b5'],
 'vitamin-b6': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-b6'],
 'vitamin-b7': ['https://www.hsph.harvard.edu/nutritionsource/biotin-vitamin-b7'],
 'vitamin-b9': ['https://www.hsph.harvard.edu/nutritionsource/folic-acid'],
 'vitamin-b12': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-b12'],
 'vitamin-c': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-c'],
 'vitamin-d': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-d'],
 'vitamin-e': ['https://www.hsph.harvard.edu/nutritionsource/vitamin-e'],
 'vitamin-k': ['https://www.hsph.harvard.edu/nutriti

In [47]:
urls = pd.DataFrame(vitamins_links).transpose()
urls = urls.reset_index().rename(columns={'index': 'Vitamin',
                                   0:'links'})
urls

Unnamed: 0,Vitamin,links
0,vitamin-a,https://www.hsph.harvard.edu/nutritionsource/v...
1,vitamin-b1,https://www.hsph.harvard.edu/nutritionsource/v...
2,vitamin-b2,https://www.hsph.harvard.edu/nutritionsource/r...
3,vitamin-b3,https://www.hsph.harvard.edu/nutritionsource/n...
4,vitamin-b5,https://www.hsph.harvard.edu/nutritionsource/p...
5,vitamin-b6,https://www.hsph.harvard.edu/nutritionsource/v...
6,vitamin-b7,https://www.hsph.harvard.edu/nutritionsource/b...
7,vitamin-b9,https://www.hsph.harvard.edu/nutritionsource/f...
8,vitamin-b12,https://www.hsph.harvard.edu/nutritionsource/v...
9,vitamin-c,https://www.hsph.harvard.edu/nutritionsource/v...


In [48]:
final = final.merge(urls,on= 'Vitamin')
final

Unnamed: 0,Vitamin,Disease,Age Group,Gender,links
0,vitamin-a,Night blindness,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
1,vitamin-a,Xerophthalmia,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
2,vitamin-a,Keratomalacia,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
3,vitamin-a,Bitot's spots,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
4,vitamin-a,Hyperkeratosis,Adults,Female,https://www.hsph.harvard.edu/nutritionsource/v...
...,...,...,...,...,...
125,vitamin-k,Hemorrhagic disease of the newborn,Newborns,Female,https://www.hsph.harvard.edu/nutritionsource/v...
126,vitamin-k,Osteoporosis,Elderly,Female,https://www.hsph.harvard.edu/nutritionsource/v...
127,vitamin-k,Impaired wound healing,Adults,Female,https://www.hsph.harvard.edu/nutritionsource/v...
128,vitamin-k,Hematoma,Adults,Female,https://www.hsph.harvard.edu/nutritionsource/v...


In [87]:
final[final.Vitamin == 'vitamin-k']['links'].iloc[0]

'https://www.hsph.harvard.edu/nutritionsource/vitamin-k'

In [88]:
vitamin = input()

In [89]:
url = final[final.Vitamin == vitamin]['links'].iloc[0]
response = requests.get(url)
if response.status_code == 200:
    # Store the HTML content in a variable
    webpage_content = response.text
    print("Webpage content stored in 'webpage_content' variable.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
#-------------------------------
soup = BeautifulSoup(webpage_content, 'html.parser')
food_sources_heading = soup.find('h3', string='Food Sources')
if food_sources_heading:
    ul_tag = food_sources_heading.find_next('ul')
    if ul_tag:
        list_items = ul_tag.find_all('li')
        food_items = [item.get_text(strip=True) for item in list_items]
        print(*food_items,sep = '\n')
    else:
        print("No unordered list found after 'Food Sources' heading.")
else:
    print("Food Sources heading not found.")

Webpage content stored in 'webpage_content' variable.
PhylloquinoneGreen leafyvegetablesincluding collard and turnip greens,kale, spinach, broccoli,Brussels sprouts, cabbage, lettucesSoybean and canola oilSalad dressings made with soybean or canola oilFortified meal replacement shakes
Green leafyvegetablesincluding collard and turnip greens,kale, spinach, broccoli,Brussels sprouts, cabbage, lettuces
Soybean and canola oil
Salad dressings made with soybean or canola oil
Fortified meal replacement shakes
MenaquinonesNatto (fermentedsoybeans)Smaller amounts in meat,cheese,eggs
Natto (fermentedsoybeans)
Smaller amounts in meat,cheese,eggs


In [132]:
food_items

['PhylloquinoneGreen leafyvegetablesincluding collard and turnip greens,kale, spinach, broccoli,Brussels sprouts, cabbage, lettucesSoybean and canola oilSalad dressings made with soybean or canola oilFortified meal replacement shakes',
 'Green leafyvegetablesincluding collard and turnip greens,kale, spinach, broccoli,Brussels sprouts, cabbage, lettuces',
 'Soybean and canola oil',
 'Salad dressings made with soybean or canola oil',
 'Fortified meal replacement shakes',
 'MenaquinonesNatto (fermentedsoybeans)Smaller amounts in meat,cheese,eggs',
 'Natto (fermentedsoybeans)',
 'Smaller amounts in meat,cheese,eggs']

In [90]:
import requests
from bs4 import BeautifulSoup

vitamin = input()
url = final[final.Vitamin == vitamin]['links'].iloc[0]
response = requests.get(url)

if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text content without tags
    text_content = soup.get_text(separator=' ', strip=True)
    
    # Print the extracted text
    print(text_content)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Vitamin K | The Nutrition Source | Harvard T.H. Chan School of Public Health Menu Close Menu Skip to content Information For: Prospective Students Current Students Alumni Faculty & Staff Friends & Supporters Search for: Harvard T.H. Chan School of Public Health Email People Departments Calendar Careers my.harvard Giving About Faculty & Research Admissions & Aid Academics Executive/Continuing Ed News The Nutrition Source Home > The Nutrition Source > Vitamin K The Nutrition Source Menu Search for: Home Nutrition News What Should I Eat? Healthy Eating Plate & Pyramid Healthy Eating Plate Translations Kid’s Healthy Eating Plate Whole Grains Protein Vegetables and Fruits Fats and Cholesterol Types of Fat Cholesterol Dietary Fat and Disease Vitamins and Minerals Healthy Drinks Water Other Healthy Beverage Options Drinks to Consume in Moderation Sugary Drinks Sports Drinks Energy Drinks Public Health Concerns: Sugary Drinks Low-Calorie Sweeteners Salt and Sodium Take Action: How to Reduce Yo

In [97]:
final.Vitamin.unique()

array(['vitamin-a', 'vitamin-b1', 'vitamin-b2', 'vitamin-b3',
       'vitamin-b5', 'vitamin-b6', 'vitamin-b7', 'vitamin-b9',
       'vitamin-b12', 'vitamin-c', 'vitamin-d', 'vitamin-e', 'vitamin-k'],
      dtype=object)

In [129]:
vitamin = input()
url = final[final.Vitamin == vitamin]['links'].iloc[0]
response = requests.get(url)

if response.status_code == 200:
    # Store the HTML content in a variable
    webpage_content = response.text
    print("Webpage content stored in 'webpage_content' variable.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Webpage content stored in 'webpage_content' variable.


In [130]:
from bs4 import BeautifulSoup

# Assuming you already have the 'webpage_content' variable with HTML content
soup = BeautifulSoup(webpage_content, 'html.parser')

# Find the <h3>Signs of Deficiency and Toxicity</h3> tag
deficiency_toxicity_tag = soup.find('h3', string='Signs of Deficiency and Toxicity')
deficiency_toxicity_tag_2 = soup.find('h3', string='Signs of Deficiency')
deficiency_toxicity_tag_3 = soup.find('h3', string='Signs of Deficiency and Toxicity ')

if deficiency_toxicity_tag:
    deficiency_toxicity_text = ''
    
    # Iterate over the next siblings until the next <h3> tag
    next_tag = deficiency_toxicity_tag.find_next_sibling()
    while next_tag and next_tag.name != 'h3':
        if next_tag.name == 'p':
            deficiency_toxicity_text += next_tag.get_text() + '\n'
        elif next_tag.name == 'ul':
            for li in next_tag.find_all('li'):
                deficiency_toxicity_text += f"- {li.get_text()}\n"
        
        next_tag = next_tag.find_next_sibling()

    print('Signs of Deficiency and Toxicity Text:')
    print(deficiency_toxicity_text.strip())
elif deficiency_toxicity_tag_2:
    deficiency_toxicity_text = ''
    
    # Iterate over the next siblings until the next <h3> tag
    next_tag = deficiency_toxicity_tag_2.find_next_sibling()
    while next_tag and next_tag.name != 'h3':
        if next_tag.name == 'p':
            deficiency_toxicity_text += next_tag.get_text() + '\n'
        elif next_tag.name == 'ul':
            for li in next_tag.find_all('li'):
                deficiency_toxicity_text += f"- {li.get_text()}\n"
        
        next_tag = next_tag.find_next_sibling()

    print('Signs of Deficiency and Toxicity Text:')
    print(deficiency_toxicity_text.strip())

elif deficiency_toxicity_tag_3:
    deficiency_toxicity_text = ''
    
    # Iterate over the next siblings until the next <h3> tag
    next_tag = deficiency_toxicity_tag_3.find_next_sibling()
    while next_tag and next_tag.name != 'h3':
        if next_tag.name == 'p':
            deficiency_toxicity_text += next_tag.get_text() + '\n'
        elif next_tag.name == 'ul':
            for li in next_tag.find_all('li'):
                deficiency_toxicity_text += f"- {li.get_text()}\n"
        
        next_tag = next_tag.find_next_sibling()

    print('Signs of Deficiency and Toxicity Text:')
    print(deficiency_toxicity_text.strip())

else:
    print('No <h3> tag found with text "Signs of Deficiency and Toxicity"')


Signs of Deficiency and Toxicity Text:
Vitamin K deficiency in adults is rare, but may occur in people taking medications that block vitamin K metabolism such as antibiotics, or in those with conditions that cause malabsorption of food and nutrients. A deficiency is also possible in newborn infants because vitamin K does not cross the placenta, and breast milk contains a low amount. The limited amount of blood clotting proteins at birth increases the risk of bleeding in infants if they are not given vitamin K supplements. The following are the most common signs of a deficiency.
- A longer time for blood to clot or a prolonged prothrombin time (as measured in a physician’s office)
- Bleeding
- Hemorrhaging
- Osteopenia or osteoporosis


In [136]:
deficiency_toxicity_text.split('\n')

['Vitamin K deficiency in adults is rare, but may occur in people taking medications that block vitamin K metabolism such as antibiotics, or in those with conditions that cause malabsorption of food and nutrients. A deficiency is also possible in newborn infants because vitamin K does not cross the placenta, and breast milk contains a low amount. The limited amount of blood clotting proteins at birth increases the risk of bleeding in infants if they are not given vitamin K supplements. The following are the most common signs of a deficiency.',
 '- A longer time for blood to clot or a prolonged prothrombin time (as measured in a physician’s office)',
 '- Bleeding',
 '- Hemorrhaging',
 '- Osteopenia or osteoporosis',
 '']

In [131]:
def get_deficiency_toxicity_text(soup, tags):
    for tag in tags:
        deficiency_toxicity_tag = soup.find('h3', string=tag)
        if deficiency_toxicity_tag:
            deficiency_toxicity_text = ''
            
            # Iterate over the next siblings until the next <h3> tag
            next_tag = deficiency_toxicity_tag.find_next_sibling()
            while next_tag and next_tag.name != 'h3':
                if next_tag.name == 'p':
                    deficiency_toxicity_text += next_tag.get_text() + '\n'
                elif next_tag.name == 'ul':
                    for li in next_tag.find_all('li'):
                        deficiency_toxicity_text += f"- {li.get_text()}\n"
                
                next_tag = next_tag.find_next_sibling()

            print(f'Signs of Deficiency and Toxicity Text (tag "{tag}"):')
            print(deficiency_toxicity_text.strip())
            return
    print('No matching <h3> tag found for Signs of Deficiency and Toxicity')

if __name__ == "__main__":
    vitamin = input()
    url = final[final.Vitamin == vitamin]['links'].iloc[0]
    response = requests.get(url)
    if response.status_code == 200:
        webpage_content = response.text
        print("------Webpage content stored in 'webpage_content' variable.------")

        soup = BeautifulSoup(webpage_content, 'html.parser')

        tags_to_search = ['Signs of Deficiency and Toxicity', 'Signs of Deficiency', 'Signs of Deficiency and Toxicity ']
        get_deficiency_toxicity_text(soup, tags_to_search)
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Webpage content stored in 'webpage_content' variable.
Signs of Deficiency and Toxicity Text (tag "Signs of Deficiency and Toxicity "):
Because pantothenic acid is found in a wide variety of foods, a deficiency is rare except in people who have other nutrient deficiencies, as seen with severe malnutrition. Other rare cases are seen in persons with genetic mutations in which pantothenic acid cannot be metabolized.
Symptoms of deficiency may include:
- Headache
- Fatigue
- Irritability, restlessness
- Disturbed sleep
- Nausea, vomiting, stomach cramps
- Numbness or burning sensation in hands or feet
- Muscle cramps
A toxic level of pantothenic acid has not been observed from food sources. With very large daily doses of 10 grams a day, stomach upset or mild diarrhea has been reported. [2] However, this is rare and a Tolerable Upper Intake Level for pantothenic acid has not been established.


In [108]:
print(webpage_content)

<!DOCTYPE html>
<html lang="en-US" class="no-js">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<link rel="profile" href="http://gmpg.org/xfn/11">
	<link rel="shortcut icon" type="image/x-icon" href="https://www.hsph.harvard.edu/nutritionsource/wp-content/themes/theme-main-2016/assets/images/favicon.ico" />
	<meta name="msapplication-TileColor" content="#A51C30">
	<meta name="msapplication-TileImage" content="https://www.hsph.harvard.edu/nutritionsource/wp-content/themes/theme-main-2016/assets/images/ms/ms-tile-icon.png">
	<link rel="apple-touch-icon" href="https://www.hsph.harvard.edu/nutritionsource/wp-content/themes/theme-main-2016/assets/images/ios/touch-icon-iphone.png">
	<link rel="apple-touch-icon" sizes="76x76" href="https://www.hsph.harvard.edu/nutritionsource/wp-content/themes/theme-main-2016/assets/images/ios/touch-icon-ipad.png">
	<link rel="apple-touch-icon" sizes="120x120" href="https://www.hsph.harvard.edu/nutriti

In [14]:
url = vitamins_links['vitamin-e']
response = requests.get(url)

if response.status_code == 200:
    # Store the HTML content in a variable
    webpage_content = response.text

soup = BeautifulSoup(webpage_content, 'html.parser')


In [15]:
from bs4 import BeautifulSoup

# Assuming you already have the 'webpage_content' variable with HTML content
soup = BeautifulSoup(webpage_content, 'html.parser')

# Find the <div> element with class "entry-content"
entry_content_div = soup.find('div', class_='entry-content')

if entry_content_div:
    # Extract text inside the <div> element
    main_content_text = entry_content_div.get_text(separator='\n').strip()

    # Remove the "Related" section and its content
    main_content_text = main_content_text.split("Related", 1)[0].strip()

    print('Main Content:')
    print(main_content_text)
else:
    print('No <div> element with class "entry-content" found')


Main Content:
Vitamin E is a fat-soluble vitamin with several forms, but alpha-tocopherol is the only one used by the human body. Its main role is to act as an antioxidant, scavenging loose electrons—so-called “free radicals”—that can damage cells. [1] It also enhances immune function and prevents clots from forming in heart arteries. Antioxidant vitamins, including vitamin E, came to public attention in the 1980s when scientists began to understand that free radical damage was involved in the early stages of artery-clogging atherosclerosis, and might also contribute to cancer, vision loss, and a host of other chronic conditions. Vitamin E has the ability to protect cells from free radical damage as well as reduce the production of free radicals in certain situations. However, conflicting study results have dimmed some of the promise of using high dose vitamin E to prevent chronic diseases.


Recommended Amounts


The Recommended Dietary Allowance (RDA) for vitamin E for males and fema

In [16]:
main_content_text

'Vitamin E is a fat-soluble vitamin with several forms, but alpha-tocopherol is the only one used by the human body. Its main role is to act as an antioxidant, scavenging loose electrons—so-called “free radicals”—that can damage cells. [1] It also enhances immune function and prevents clots from forming in heart arteries. Antioxidant vitamins, including vitamin E, came to public attention in the 1980s when scientists began to understand that free radical damage was involved in the early stages of artery-clogging atherosclerosis, and might also contribute to cancer, vision loss, and a host of other chronic conditions. Vitamin E has the ability to protect cells from free radical damage as\xa0well as reduce the production of free radicals in certain situations. However, conflicting study results have dimmed some of the promise of using high dose vitamin E to prevent chronic diseases.\n\n\nRecommended Amounts\n\n\nThe Recommended Dietary Allowance (RDA) for vitamin E for males and females 

<h1>Text Summaries for vitamins</h1>

In [5]:
# Importing dependencies from transformers
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load tokenizer 
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [7]:
# Load model 
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
vitamins_links = {
    'vitamin-a': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-a',
    'vitamin-b1': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-b1',
    'vitamin-b2': 'https://www.hsph.harvard.edu/nutritionsource/riboflavin-vitamin-b2',
    'vitamin-b3': 'https://www.hsph.harvard.edu/nutritionsource/niacin-vitamin-b3',
    'vitamin-b5': 'https://www.hsph.harvard.edu/nutritionsource/pantothenic-acid-vitamin-b5',
    'vitamin-b6': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-b6',
    'vitamin-b7': 'https://www.hsph.harvard.edu/nutritionsource/biotin-vitamin-b7',
    'vitamin-b9': 'https://www.hsph.harvard.edu/nutritionsource/folic-acid',
    'vitamin-b12': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-b12',
    'vitamin-c': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-c',
    'vitamin-d': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-d',
    'vitamin-e': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-e',
    'vitamin-k': 'https://www.hsph.harvard.edu/nutritionsource/vitamin-k'
}

In [38]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import requests
from bs4 import BeautifulSoup

def split_text_into_chunks(text, chunk_size):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

def summarize_text(text, model, tokenizer):
    tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt", max_length=1024)
    summary = model.generate(**tokens)
    return tokenizer.decode(summary[0], skip_special_tokens=True)

# Your URL
url = vitamins_links['vitamin-e']
response = requests.get(url)

if response.status_code == 200:
    # Store the HTML content in a variable
    webpage_content = response.text

    soup = BeautifulSoup(webpage_content, 'html.parser')

    # Assuming you already have the 'webpage_content' variable with HTML content
    entry_content_div = soup.find('div', class_='entry-content')

    if entry_content_div:
        main_content_text = entry_content_div.get_text(separator='\n').strip()
        main_content_text = main_content_text.split("Related", 1)[0].strip()

        # Split the text into chunks of 1000 words (adjust as needed)
        chunk_size = 700
        text_chunks = split_text_into_chunks(main_content_text, chunk_size)

        # Summarize each chunk
        summaries = [summarize_text(chunk, model, tokenizer) for chunk in text_chunks]

        # Concatenate the summaries
        final_summary = ' '.join(summaries)

#         print('Main Content:')
#         print(main_content_text)
        print('\nSummary:')
        print(final_summary)
    else:
        print('No <div> element with class "entry-content" found')
else:
    print('Error fetching the webpage')



Summary:
What is vitamin E? Vitamin E has been linked to a number of health benefits. Vitamin E supplements have been shown to reduce the risk of heart attack and stroke. Vitamin E supplements do not appear to reduce the risk of heart attack, stroke, or cardiovascular death. Vitamin E supplements do not prevent heart disease, according to two large clinical trials. There is no evidence that taking vitamin E reduces the risk of heart disease. Vitamin E supplements do not appear to protect against cancer and heart disease, according to two large clinical trials. Vitamin E has been shown to reduce the risk of heart disease in animal studies. Vitamin E supplements did not reduce the risk of prostate cancer in a large clinical trial. A long-running trial of vitamin E and prostate cancer prevention has ended with disappointing results. Vitamin E supplements may reduce the risk of prostate cancer, according to two large clinical trials. A large trial of a drug to prevent prostate cancer has 

In [39]:
print(*final_summary.split('.'),sep='\n')

What is vitamin E? Vitamin E has been linked to a number of health benefits
 Vitamin E supplements have been shown to reduce the risk of heart attack and stroke
 Vitamin E supplements do not appear to reduce the risk of heart attack, stroke, or cardiovascular death
 Vitamin E supplements do not prevent heart disease, according to two large clinical trials
 There is no evidence that taking vitamin E reduces the risk of heart disease
 Vitamin E supplements do not appear to protect against cancer and heart disease, according to two large clinical trials
 Vitamin E has been shown to reduce the risk of heart disease in animal studies
 Vitamin E supplements did not reduce the risk of prostate cancer in a large clinical trial
 A long-running trial of vitamin E and prostate cancer prevention has ended with disappointing results
 Vitamin E supplements may reduce the risk of prostate cancer, according to two large clinical trials
 A large trial of a drug to prevent prostate cancer has found that

In [42]:
final

Unnamed: 0,Vitamin,Disease,Age Group,Gender
0,vitamin-a,Night blindness,Children,Female
1,vitamin-a,Xerophthalmia,Children,Female
2,vitamin-a,Keratomalacia,Children,Female
3,vitamin-a,Bitot's spots,Children,Female
4,vitamin-a,Hyperkeratosis,Adults,Female
...,...,...,...,...
125,vitamin-k,Hemorrhagic disease of the newborn,Newborns,Female
126,vitamin-k,Osteoporosis,Elderly,Female
127,vitamin-k,Impaired wound healing,Adults,Female
128,vitamin-k,Hematoma,Adults,Female


In [12]:
import requests
from bs4 import BeautifulSoup

def extract_important_text(title):
    url = f'https://en.wikipedia.org/wiki/{title}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract the main content of the page
        main_content = soup.find('div', {'id': 'mw-content-text'})

        if main_content:
            important_text = ' '.join([p.get_text() for p in main_content.find_all('p')])
            return important_text
        else:
            return 'Error: Main content not found on the Wikipedia page.'
    else:
        return f'Error fetching the Wikipedia page. Status code: {response.status_code}'

# Example usage
disease_name = 'Night blindness'
important_text = extract_important_text(disease_name)


In [13]:
print(important_text)

Nyctalopia (/ˌnɪktəˈloʊpiə/; from Ancient Greek  νύκτ- (núkt-) 'night',  ἀλαός (alaós) 'blind, invisible', and  ὄψ (óps) 'eye'),[1] also called night-blindness, is a condition making it difficult or impossible to see in relatively low light. It is a symptom of several eye diseases. Night blindness may exist from birth, or be caused by injury or malnutrition (for example, vitamin A deficiency). It can be described as insufficient adaptation to darkness.
 The most common cause of nyctalopia is retinitis pigmentosa, a disorder in which the rod cells in the retina gradually lose their ability to respond to the light. Patients with this genetic condition have progressive nyctalopia and eventually, their daytime vision may also be affected. In X-linked congenital stationary night blindness, from birth the rods either do not work at all, or work very little, but the condition does not get worse.
 Another cause of night blindness is a deficiency of retinol, or vitamin A1, found in fish oils, l

In [9]:
def split_text_into_chunks(text, chunk_size):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

def summarize_text(text, model, tokenizer):
    tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt", max_length=1024)
    summary = model.generate(**tokens)
    return tokenizer.decode(summary[0], skip_special_tokens=True)


# Split the text into chunks of 1000 words (adjust as needed)
chunk_size = 500
text_chunks = split_text_into_chunks(important_text, chunk_size)

        # Summarize each chunk
summaries = [summarize_text(chunk, model, tokenizer) for chunk in text_chunks]

        # Concatenate the summaries
final_summary = ' '.join(summaries)

print('\nSummary:')
print(final_summary)


Summary:
Night blindness, also known as night blindness, is a condition making it difficult or impossible to see in relatively low light. Night blindness is a condition in which the retina does not respond to light. Night blindness is the result of loss of peripheral vision, or the ability to see in dim light. Poor night vision is caused by a deficiency in rhodopsin. Night blindness can be caused by a number of factors, including: The word "nyctalopia", meaning "night blindness", comes from the Latin nyctalopie, meaning "night vision". Night blindness is a word from Latin  (nuktlps), meaning "to see in the dark". Congenital stationary night blindness is an ophthalmologic disorder in horses with leopard spotting patterns, such as the Appaloosa. Night blindness is a common condition in horses. The Lp gene has been pinpointed for the first time in a horse.


In [65]:
final_summary

'Night blindness, also known as night blindness, is a condition making it difficult or impossible to see in relatively low light. Night blindness is a condition in which the retina does not respond to light. Night blindness is the result of loss of peripheral vision, or the ability to see in dim light. Poor night vision is caused by a deficiency in rhodopsin. Night blindness can be caused by a number of factors, including: The word "nyctalopia", meaning "night blindness", comes from the Latin nyctalopie, meaning "night vision". Night blindness is a word from Latin  (nuktlps), meaning "to see in the dark". Congenital stationary night blindness is an ophthalmologic disorder in horses with leopard spotting patterns, such as the Appaloosa. Night blindness is a common condition in horses. The Lp gene has been pinpointed for the first time in a horse.'

In [73]:
final

Unnamed: 0,Vitamin,Disease,Age Group,Gender,links
0,vitamin-a,Night blindness,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
1,vitamin-a,Xerophthalmia,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
2,vitamin-a,Keratomalacia,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
3,vitamin-a,Bitot's spots,Children,Female,https://www.hsph.harvard.edu/nutritionsource/v...
4,vitamin-a,Hyperkeratosis,Adults,Female,https://www.hsph.harvard.edu/nutritionsource/v...
...,...,...,...,...,...
125,vitamin-k,Hemorrhagic disease of the newborn,Newborns,Female,https://www.hsph.harvard.edu/nutritionsource/v...
126,vitamin-k,Osteoporosis,Elderly,Female,https://www.hsph.harvard.edu/nutritionsource/v...
127,vitamin-k,Impaired wound healing,Adults,Female,https://www.hsph.harvard.edu/nutritionsource/v...
128,vitamin-k,Hematoma,Adults,Female,https://www.hsph.harvard.edu/nutritionsource/v...


In [None]:
import wikipediaapi
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

def extract_important_text_wiki(title):
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page_py = wiki_wiki.page(title)

    if page_py.exists():
        important_text = page_py.text
        return important_text
    else:
        return 'Error: Wikipedia page not found.'

# Example usage
disease_name = 'nightblindness'
important_text = extract_important_text_wiki(disease_name)

def summarize_text(text, model, tokenizer):
    tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt", max_length=1024)
    summary = model.generate(**tokens)
    return tokenizer.decode(summary[0], skip_special_tokens=True)

# Split the text into chunks of 500 words (adjust as needed)
chunk_size = 500
text_chunks = split_text_into_chunks(important_text, chunk_size)

# Summarize each chunk
summaries = [summarize_text(chunk, model, tokenizer) for chunk in text_chunks]

# Concatenate the summaries
final_summary = ' '.join(summaries)

print('\nSummary:')
print(final_summary)


<h1>Saving Files</h1>

In [72]:
import pickle

In [74]:
with open('vitamins_info.pkl','wb') as file:
    pickle.dump(final,file)