In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
from io import StringIO

# Folder holding the per-country HTML exports that the loop below walks through
directory = 'Resources/IDF'

# Empty accumulator that already carries the output schema; per-country
# results are appended to it as each file is processed
combined_data = pd.DataFrame(
    columns=[
        "Country",
        "Year",
        "Percent Population with Diabetes",
    ],
)

# Function to load the full text of one HTML file
def read_html_file(file_path):
    """Return the entire contents of ``file_path`` decoded as UTF-8 text.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    str
        The file's text content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Function to turn the first HTML table of a page into yearly diabetes percentages
def clean_and_extract_data_from_html(html_content):
    """Compute the yearly percentage of the population with diabetes.

    The first ``<table>`` found in ``html_content`` is loaded into a
    DataFrame. The code addresses the metric-name column by the label ``0``
    and takes the year labels from the first row (columns 1 onwards), i.e.
    it expects the table to be read without a header row.

    Rows whose metric name starts with 'People with diabetes' and
    'Total adult population' are reshaped to one row per year, coerced to
    numbers (non-numeric cells become NaN), joined on the year, and divided
    to obtain a percentage.

    Parameters
    ----------
    html_content : str
        HTML markup containing the table.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year' and 'Percent Population with Diabetes', with rows
        containing NaN removed. If any step raises, the error is printed
        and an empty DataFrame is returned instead.
    """

    def _to_long(metric_rows, year_by_column, value_label):
        # Unpivot the year columns into rows, one per (metric row, year column)
        long_rows = pd.melt(
            metric_rows,
            id_vars=[0],
            value_vars=year_by_column.index,
            var_name='Year',
            value_name=value_label,
        )
        # After melting, 'Year' holds column labels; swap in the real years
        long_rows['Year'] = long_rows['Year'].map(lambda column: year_by_column[column])
        # The metric-name column has served its purpose
        long_rows = long_rows.drop(columns=[0])
        # Anything that is not a number becomes NaN
        long_rows[value_label] = pd.to_numeric(long_rows[value_label], errors='coerce')
        return long_rows

    try:
        first_table = BeautifulSoup(html_content, 'html.parser').find('table')
        raw = pd.read_html(StringIO(str(first_table)))[0]

        # Pick out the two metrics of interest by the text in column 0
        metric_names = raw[0]
        diabetes_pattern = re.compile(r'People with diabetes.*')
        population_pattern = re.compile(r'Total adult population.*')
        diabetes_rows = raw[metric_names.str.match(diabetes_pattern, na=False)]
        population_rows = raw[metric_names.str.match(population_pattern, na=False)]

        # First row, minus the metric-name cell: column label -> year
        year_by_column = raw.iloc[0, 1:]

        diabetes_long = _to_long(diabetes_rows, year_by_column, 'People with Diabetes')
        population_long = _to_long(population_rows, year_by_column, 'Total Population')

        # Line the two metrics up year by year
        joined = diabetes_long.merge(population_long, on='Year')

        # Both counts are scaled by the same factor before taking the ratio
        diabetes_count = joined['People with Diabetes'] * 1000
        population_count = joined['Total Population'] * 1000
        joined['Percent Population with Diabetes'] = diabetes_count / population_count * 100

        # Keep only the output columns and discard years without a usable value
        return joined[['Year', 'Percent Population with Diabetes']].dropna()
    except Exception as e:
        print(f"Error processing file: {e}")
        return pd.DataFrame()

# List all files in the directory; sorted so the output row order does not
# depend on the arbitrary order returned by os.listdir
files = sorted(
    f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))
)

# Process each file, collecting the per-country frames so that they are
# concatenated once instead of re-copying the accumulated data every iteration
country_frames = []
for file in files:
    file_path = os.path.join(directory, file)
    # The country name is the part of the file name before the first '-'
    country = file.split('-')[0]

    html_content = read_html_file(file_path)
    country_data = clean_and_extract_data_from_html(html_content)
    # Files that failed to parse (or yielded no usable rows) come back empty
    if not country_data.empty:
        country_data['Country'] = country
        country_frames.append(country_data)

# Combine everything with a fresh 0..n-1 index and the documented column order.
# When no file produced data, combined_data stays the empty frame defined above.
if country_frames:
    combined_data = pd.concat(country_frames, ignore_index=True)[
        ["Country", "Year", "Percent Population with Diabetes"]
    ]

# Save the combined data to a CSV file.
# The path is built from components: the former literal 'Resources\combined...'
# contained an invalid '\c' escape and, outside Windows, named a file in the
# working directory rather than inside Resources/, where it is read back later.
output_file = os.path.join('Resources', 'combined_diabetes_data.csv')
combined_data.to_csv(output_file, index=False)

print("Combined Data:")
combined_data.head()




In [None]:

# Load the data from the CSV files located in the Resources directory
caloric_supply = pd.read_csv('Resources/daily-per-capita-caloric-supply.csv')
protein_supply = pd.read_csv('Resources/daily-per-capita-protein-supply.csv')
animal_plant_protein = pd.read_csv('Resources/daily-protein-supply-from-animal-and-plant-based-foods.csv')
meat_consumption = pd.read_csv('Resources/daily-meat-consumption-per-person.csv')
diabetes_data = pd.read_csv('Resources/combined_diabetes_data.csv')

# Rename the necessary columns to ensure consistency across the frames.
# 'Year' is used under its existing name in every frame, so it needs no renaming.
caloric_supply.rename(columns={'Entity': 'Country', 'Daily caloric supply (OWID based on UN FAO & historical sources)': 'Daily Caloric Supply'}, inplace=True)
protein_supply.rename(columns={'Entity': 'Country', 'Total | 00002901 || Food available for consumption | 0674pc || grams of protein per day per capita': 'Daily Protein Supply'}, inplace=True)
animal_plant_protein.rename(columns={
    'Entity': 'Country',
    'Animal Products | 00002941 || Food available for consumption | 0674pc || grams of protein per day per capita': 'Animal Protein Supply',
    'Vegetal Products | 00002903 || Food available for consumption | 0674pc || grams of protein per day per capita': 'Vegetal Protein Supply'
}, inplace=True)
meat_consumption.rename(columns={'Entity': 'Country', 'Meat, total | 00002943 || Food available for consumption | 0645pc || kilograms per year per capita': 'Daily Meat Consumption'}, inplace=True)

# Convert 'Daily Meat Consumption' from kg/year to g/day for consistency
meat_consumption['Daily Meat Consumption'] = meat_consumption['Daily Meat Consumption'] * 1000 / 365

# Rename columns in diabetes_data
diabetes_data.rename(columns={'Percent Population with Diabetes': 'Diabetes Prevalence'}, inplace=True)

# Merge the dataframes on 'Country' and 'Year'.
# Outer joins keep every (Country, Year) pair; incomplete rows are removed below.
merged_df = caloric_supply.merge(protein_supply, on=['Country', 'Year'], how='outer')
merged_df = merged_df.merge(animal_plant_protein, on=['Country', 'Year'], how='outer')
merged_df = merged_df.merge(meat_consumption[['Country', 'Year', 'Daily Meat Consumption']], on=['Country', 'Year'], how='outer')
merged_df = merged_df.merge(diabetes_data[['Country', 'Year', 'Diabetes Prevalence']], on=['Country', 'Year'], how='outer')

# Columns that make up the final dataset
required_columns = [
    'Country',
    'Year',
    'Daily Caloric Supply',
    'Daily Protein Supply',
    'Animal Protein Supply',
    'Vegetal Protein Supply',
    'Daily Meat Consumption',
    'Diabetes Prevalence',
]

# Select only the required columns and drop rows with any missing data.
# dropna() returns a new frame, which avoids mutating a selection of merged_df
# in place (the source of pandas' SettingWithCopyWarning).
final_df = merged_df[required_columns].dropna()

# Save the final DataFrame to a new CSV file
final_df.to_csv('Resources/final_data.csv', index=False)

print("Final DataFrame columns:", final_df.columns)
print("Number of rows in Final DataFrame:", len(final_df))


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
import matplotlib.pyplot as plt

# Load the dataset from Resources/final_data.csv
file_path = 'Resources/final_data.csv'
data = pd.read_csv(file_path)

# Select only the relevant features: three input columns and one regression target.
# Any other columns present in the CSV are not used by the model.
features = ['Daily Caloric Supply', 'Animal Protein Supply', 'Vegetal Protein Supply']
target = 'Diabetes Prevalence'

# Preprocess the data
data = data.dropna()  # Remove missing values for simplicity

# Define the features (X) and target (y)
X = data[features]
y = data[target]

# Split the data into training and testing sets (80% / 20%);
# random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model: a fully connected network with three hidden ReLU layers
# (80, 64, 64 units). The last two hidden layers are each followed by
# Dropout(0.3) and BatchNormalization. The single linear output unit makes
# this a regression model.
# NOTE(review): newer Keras releases may prefer an explicit Input layer over
# `input_dim` -- confirm against the installed version.
model = Sequential([
    Dense(80, input_dim=X_train_scaled.shape[1], activation='relu'),  # input width = number of feature columns
    Dense(64, activation='relu'),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    BatchNormalization(),
    Dense(1, activation='linear')
])

# Compile the model with the Adam optimizer (initial learning rate 0.001)
# and mean squared error as the training loss
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Learning-rate schedule: flat at first, exponential decay afterwards
def scheduler(epoch, lr):
    """Return the learning rate to use for the given epoch.

    Parameters
    ----------
    epoch : int
        Epoch index.
    lr : float
        Current learning rate.

    Returns
    -------
    float
        ``lr`` unchanged while ``epoch`` is below 10; otherwise ``lr``
        multiplied by ``exp(-0.1)`` (about 0.905).
    """
    return lr if epoch < 10 else lr * np.exp(-0.1)

# Wrap the schedule function in a Keras callback so it can be passed to fit()
callback = LearningRateScheduler(scheduler)

# Train the model for 100 epochs in mini-batches of 32.
# The test split doubles as validation data here, so the 'Validation' curve
# plotted below is computed on the same rows as the test metrics.
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_test_scaled, y_test), callbacks=[callback])

# Save the model.
# NOTE(review): the fitted `scaler` is not saved in this cell; inputs must be
# standardized the same way before the saved model is reused -- confirm whether
# it is persisted elsewhere.
model.save('Resources/diabetes_prevalence_model.h5')

# Predict on the test set; flatten() turns the predictions into a 1-D array
# so they line up element-wise with y_test
y_pred = model.predict(X_test_scaled).flatten()

# Evaluate the model on test data
mse = mean_squared_error(y_test, y_pred)
mae = np.mean(np.abs(y_test - y_pred))  # mean absolute error, computed by hand
r2 = r2_score(y_test, y_pred)
print(f"Test MSE: {mse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test R²: {r2:.4f}")

# Plot training & validation loss values per epoch, as recorded by fit()
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
