# In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt

def clone_repo(repo_url):
    """Clone *repo_url* into the current directory unless it already exists.

    The local directory name is the last path component of the URL with a
    trailing ``.git`` suffix removed. That name is passed to ``git clone``
    explicitly, so the returned value always matches the checkout location.

    Args:
        repo_url: URL (or path) of the git repository to clone.

    Returns:
        The name of the local directory that holds the checkout.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
        OSError: If the ``git`` executable cannot be started.
    """
    # Local import keeps this function self-contained.
    import subprocess

    # Ignore trailing slashes so "…/repo.git/" still yields "repo".
    repo_name = repo_url.rstrip('/').split('/')[-1]
    # Strip only a trailing ".git"; a name such as "my.github-tools" must
    # stay intact.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]

    if not os.path.exists(repo_name):
        print(f"Cloning repository from {repo_url}...")
        # Argument list (no shell) avoids command injection through the URL;
        # "--" stops git from parsing the URL as an option; check=True turns
        # a failed clone into an exception instead of a silent no-op.
        subprocess.run(["git", "clone", "--", repo_url, repo_name], check=True)
    else:
        print(f"Repository {repo_name} already cloned.")
    return repo_name

# Fetch the Health and Obesity Trends repository
repo_url = "https://github.com/ine-rmotr-projects/RDP-health-and-obesity-trends.git"
repo_name = clone_repo(repo_url)

# Locate the dataset: probe each known relative location inside the checkout
# and keep the first path that exists (None when nothing matches).
possible_files = ["obesity-cleaned.csv", "data/obesity-cleaned.csv"]
candidate_paths = [os.path.join(repo_name, relative) for relative in possible_files]
data_file = next((path for path in candidate_paths if os.path.exists(path)), None)

# Stop early with a clear error rather than failing later inside read_csv
if data_file is None:
    raise FileNotFoundError("Data file not found in the repository. Please check the repository structure.")

# Load the dataset into a DataFrame
data = pd.read_csv(data_file)

# Preview the first rows
print("Preview of the dataset:")
print(data.head())

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
print("\nDataset information:")
data.info()

# Count missing values per column
missing_values = data.isnull().sum()
print("\nMissing values:")
print(missing_values)

# Analyze the dataset: average the "ObesityRate" column per "Year", first
# overall and then split by a second column.
def _mean_rate_table(frame, category):
    """Return mean ObesityRate indexed by Year, one column per *category* value."""
    per_group = frame.groupby(["Year", category])["ObesityRate"].mean()
    return per_group.unstack()


# 1. Overall obesity trends by year (kept as a one-column DataFrame)
yearly = data.groupby("Year")
obesity_trends = yearly[["ObesityRate"]].mean()
print("\nAverage obesity trends by year:")
print(obesity_trends)

# 2. Obesity trends by gender
gender_trends = _mean_rate_table(data, "Gender")
print("\nObesity trends by gender:")
print(gender_trends)

# 3. Obesity trends by region
region_trends = _mean_rate_table(data, "Region")
print("\nObesity trends by region:")
print(region_trends)

# Visualize the trends using explicit Axes objects
def _finish_axes(ax, title, **legend_options):
    """Apply the shared title, axis labels, grid and legend, then show the figure."""
    ax.set_title(title)
    ax.set_xlabel("Year")
    ax.set_ylabel("Obesity Rate")
    ax.grid()
    ax.legend(**legend_options)
    plt.show()


# Overall obesity trends
overall_fig, overall_ax = plt.subplots(figsize=(10, 6))
overall_ax.plot(obesity_trends, marker='o', label="Overall Obesity")
_finish_axes(overall_ax, "Obesity Trends Over the Years")

# Gender trends: DataFrame.plot creates its own figure and returns the Axes
gender_ax = gender_trends.plot(kind='line', figsize=(10, 6), marker='o')
_finish_axes(gender_ax, "Obesity Trends by Gender", title="Gender")

# Region trends
region_ax = region_trends.plot(kind='line', figsize=(12, 8), marker='o')
_finish_axes(region_ax, "Obesity Trends by Region", title="Region")

# Optional: write the DataFrame to a CSV inside the cloned repository
# directory, omitting the index column.
output_file = os.path.join(repo_name, "processed_obesity_trends.csv")
data.to_csv(path_or_buf=output_file, index=False)
print(f"Processed data saved to {output_file}")



# Output:
# Cloning repository from https://github.com/ine-rmotr-projects/RDP-health-and-obesity-trends.git...
#
# FileNotFoundError: Data file not found: RDP-health-and-obesity-trends/data/obesity-cleaned.csv