In [1]:
import streamlit as st
import pandas as pd
import seaborn as sns
import plotly.express as px
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression


In [2]:
data=pd.read_csv('Life-Expectancy-Data-Averaged.csv')
data.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status,Life_expectancy
0,Afghanistan,Asia,2007.5,71.08125,98.6125,265.804969,0.016125,64.5625,24.375,22.4625,55.375,55.125,0.0225,408.5625,27.450625,16.58125,15.58125,2.9,1.0,59.65625
1,Albania,Rest of Europe,2007.5,15.25625,17.14375,83.132969,4.696875,98.0,95.9375,25.85625,98.125,98.0625,0.025625,3071.125,2.969375,1.61875,1.7,9.24375,1.0,75.95
2,Algeria,Africa,2007.5,26.75625,31.19375,113.439281,0.400625,88.3125,93.25,24.86875,91.75,91.875,0.021875,3745.125,34.820625,6.09375,5.975,6.99375,1.0,73.7875
3,Angola,Africa,2007.5,88.76875,144.1625,297.844063,4.935625,68.8125,64.0,22.51875,35.75,55.5625,1.30375,2647.8125,21.62375,6.19375,6.66875,4.60625,1.0,52.825
4,Antigua and Barbuda,Central America and Caribbean,2007.5,9.475,11.51875,142.478813,7.755,98.25,75.4375,25.85,96.9375,98.3125,0.125,14678.75,0.085,3.425,3.375,9.01875,1.0,75.35


In [3]:
# Summary Statistics
data.describe()


Unnamed: 0,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status,Life_expectancy
count,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0
mean,2007.5,30.363792,42.938268,192.251775,4.820882,84.292598,77.344972,25.032926,86.499651,86.271648,0.894288,11540.92493,36.675915,4.865852,4.899825,7.632123,0.793296,68.856075
std,0.0,26.725485,42.916952,111.659044,3.914554,13.820223,17.315208,2.16549,13.581153,13.931532,2.311895,16893.054182,136.655286,4.111094,4.195663,3.126912,0.406077,9.197699
min,2007.5,2.38125,3.0,57.710313,2.5e-05,30.6875,16.25,20.2125,35.75,31.3125,0.01,263.9375,0.085,0.1,0.1,1.3375,0.0,45.60625
25%,2007.5,8.159375,9.775,107.046906,1.317813,78.21875,64.0,23.225,80.53125,80.8125,0.08,1409.90625,2.108125,1.75625,1.73125,4.946875,1.0,62.303125
50%,2007.5,19.36875,23.1375,164.432406,4.209375,88.0,83.0,25.65,92.375,92.0625,0.164375,4402.625,7.660625,3.55625,3.71875,7.83125,1.0,71.50625
75%,2007.5,48.959375,68.321875,247.523922,7.843438,94.375,92.25,26.425,96.0625,95.78125,0.51625,12037.78125,22.745313,7.165625,7.05625,10.365625,1.0,74.9375
max,2007.5,115.71875,178.725,572.974312,15.1,98.875,99.0,31.6875,98.9375,99.0,18.164375,102972.6875,1321.239375,27.1,27.94375,13.26875,1.0,82.45625


In [4]:
# Calculate key metrics
avg_life = data['Life_expectancy'].mean()
avg_gdp = data['GDP_per_capita'].mean()
avg_schooling = data['Schooling'].mean()
countries_count = len(data['Country'].unique())

# Display the key metrics
print(f"Average Life Expectancy: {avg_life:.1f} years")
print(f"Average GDP per Capita: ${avg_gdp:,.0f}")
print(f"Average Years of Schooling: {avg_schooling:.1f}")
print(f"Countries Analyzed: {countries_count}")


Average Life Expectancy: 68.9 years
Average GDP per Capita: $11,541
Average Years of Schooling: 7.6
Countries Analyzed: 179


In [5]:
# Select relevant numerical columns for the pairplot
pairplot_vars = ['Life_expectancy', 'GDP_per_capita', 'Schooling', 'Adult_mortality', 'BMI', 'Alcohol_consumption']

# Scatter matrix (pairplot)
fig = px.scatter_matrix(
    data,
    dimensions=pairplot_vars,
    title="Relationships between Key Variables",
    color='Economy_status',
    opacity=0.7
)
fig.update_layout(height=800, width=800)
fig.update_traces(diagonal_visible=False)

# Show the plot
fig.show()


In [6]:
# Calculate global region statistics
region_stats = data.groupby('Region')['Life_expectancy'].agg([
    ('Mean', 'mean'),
    ('Min', 'min'),
    ('Max', 'max')
]).round(1)

# Display global region metrics
global_avg = data['Life_expectancy'].mean()
highest_region = region_stats['Mean'].idxmax()
lowest_region = region_stats['Mean'].idxmin()

print(f"Global Average Life Expectancy: {global_avg:.1f} years")
print(f"Highest Region: {highest_region}")
print(f"Lowest Region: {lowest_region}")


Global Average Life Expectancy: 68.9 years
Highest Region: North America
Lowest Region: Africa


In [7]:
# Box plot of life expectancy by region
fig_box = px.box(
    data,
    x='Region',
    y='Life_expectancy',
    color='Region',
    title='Life Expectancy Distribution by Region',
    labels={'Life_expectancy': 'Life Expectancy (years)'}
)
fig_box.update_layout(xaxis_tickangle=-45)

# Show the box plot
fig_box.show()


In [8]:
# Bar plot of average life expectancy by region
fig_avg = px.bar(
    region_stats,
    y='Mean',
    color=region_stats.index,
    title='Average Life Expectancy by Region',
    labels={'Mean': 'Life Expectancy (years)', 'index': 'Region'}
)
fig_avg.update_layout(xaxis_tickangle=-45, showlegend=False)

# Show the bar plot
fig_avg.show()


In [9]:
# Box plot of infant deaths by region
fig_infant = px.box(
    data,
    x='Region',
    y='Infant_deaths',
    color='Region',
    title='Infant Deaths Distribution by Region',
    labels={'Infant_deaths': 'Infant Deaths'}
)
fig_infant.update_layout(xaxis_tickangle=-45)

# Show the box plot
fig_infant.show()


In [10]:
# Global BMI Overview
global_bmi_mean = data['BMI'].mean()
highest_bmi = data.groupby('Region')['BMI'].mean().max()
lowest_bmi = data.groupby('Region')['BMI'].mean().min()

# Print the overview metrics
print(f"Global Average BMI: {global_bmi_mean:.1f}")
print(f"Highest Regional BMI: {highest_bmi:.1f}")
print(f"Lowest Regional BMI: {lowest_bmi:.1f}")


Global Average BMI: 25.0
Highest Regional BMI: 27.4
Lowest Regional BMI: 23.2


In [11]:
# Interactive region selection (if implemented in a streamlit-like environment)
selected_region = 'All Regions'  # Change based on selection

# Filter the data based on region selection
filtered_data = data if selected_region == 'All Regions' else data[data['Region'] == selected_region]


In [12]:
import plotly.express as px

# BMI Distribution by Region (Box plot)
fig_bmi_region = px.box(
    filtered_data,
    x='Region',
    y='BMI',
    color='Region',
    title='BMI Distribution by Region',
    labels={'BMI': 'Body Mass Index (BMI)'}
)
fig_bmi_region.update_layout(xaxis_tickangle=-45)

# Show the figure (use in a notebook-like environment)
fig_bmi_region.show()


In [13]:
# BMI by Economic Status (Violin plot)
fig_bmi_economy = px.violin(
    filtered_data,
    x='Economy_status',
    y='BMI',
    color='Economy_status',
    box=True,
    title='BMI by Economic Status'
)

fig_bmi_economy.show()


In [14]:
# BMI vs Life Expectancy (Scatter plot)
fig_bmi_life_exp = px.scatter(
    filtered_data,
    x='BMI',
    y='Life_expectancy',
    color='Economy_status',
    size='GDP_per_capita',
    hover_data=['Country'],
    title='BMI vs Life Expectancy',
    labels={'BMI': 'Body Mass Index (BMI)', 'Life_expectancy': 'Life Expectancy (years)'}
)

fig_bmi_life_exp.show()


In [15]:
# BMI vs Adult Mortality (Scatter plot)
fig_bmi_adult = px.scatter(
    filtered_data,
    x='BMI',
    y='Adult_mortality',
    color='Economy_status',
    size='GDP_per_capita',
    hover_data=['Country'],
    title='BMI vs Adult Mortality Rate',
    labels={'BMI': 'Body Mass Index (BMI)', 'Adult_mortality': 'Adult Mortality Rate'}
)

fig_bmi_adult.show()


In [16]:
# Calculate the BMI threshold (75th percentile)
bmi_threshold = data['BMI'].quantile(0.75)

# Identify high-risk countries based on BMI threshold
high_risk_bmi = data[data['BMI'] > bmi_threshold]

# High-Risk Countries by Region
risk_by_region_bmi = high_risk_bmi.groupby('Region').size().sort_values(ascending=False)

# Plot the high-risk countries by region (Bar plot)
fig_risk_bmi = px.bar(
    risk_by_region_bmi,
    title='Number of High-Risk Countries by Region (High BMI)',
    labels={'value': 'Number of Countries', 'Region': 'Region'}
)

fig_risk_bmi.show()

# Risk Factors Comparison
risk_factors_bmi = high_risk_bmi[['GDP_per_capita', 'Schooling', 'Adult_mortality']].mean()
safe_factors_bmi = data[data['BMI'] <= bmi_threshold][['GDP_per_capita', 'Schooling', 'Adult_mortality']].mean()

comparison_bmi = pd.DataFrame({
    'High Risk (BMI > 75th percentile)': risk_factors_bmi,
    'Low Risk (BMI <= 75th percentile)': safe_factors_bmi
}).round(2)

# Display the comparison table
print(comparison_bmi)


                 High Risk (BMI > 75th percentile)  \
GDP_per_capita                            17193.83   
Schooling                                     9.29   
Adult_mortality                             133.37   

                 Low Risk (BMI <= 75th percentile)  
GDP_per_capita                             9642.56  
Schooling                                     7.07  
Adult_mortality                             212.03  


In [17]:
# Key Findings: Correlation Analysis
bmi_life_exp_corr = data['BMI'].corr(data['Life_expectancy'])
bmi_adult_corr = data['BMI'].corr(data['Adult_mortality'])

# Display correlations and key findings
print(f"BMI vs Life Expectancy Correlation: {bmi_life_exp_corr:.3f}")
print(f"BMI vs Adult Mortality Correlation: {bmi_adult_corr:.3f}")

# Additional Key Observations
print("\nKey Findings:")
print(f"- Highest BMI Region: {data.groupby('Region')['BMI'].mean().idxmax()}")
print(f"- Regions with High BMI often have lower life expectancy and higher mortality rates.")


BMI vs Life Expectancy Correlation: 0.594
BMI vs Adult Mortality Correlation: -0.522

Key Findings:
- Highest BMI Region: North America
- Regions with High BMI often have lower life expectancy and higher mortality rates.


In [18]:
# Global GDP Analysis
global_gdp_mean = data['GDP_per_capita'].mean()

# Life expectancy for high GDP countries (above median GDP)
high_gdp_life = data[data['GDP_per_capita'] > data['GDP_per_capita'].median()]['Life_expectancy'].mean()

# Life expectancy for low GDP countries (below or equal to median GDP)
low_gdp_life = data[data['GDP_per_capita'] <= data['GDP_per_capita'].median()]['Life_expectancy'].mean()

# Print the global statistics
print(f"Global Average GDP: ${global_gdp_mean:,.0f}")
print(f"Life Expectancy (High GDP): {high_gdp_life:.1f} years")
print(f"Life Expectancy (Low GDP): {low_gdp_life:.1f} years")


Global Average GDP: $11,541
Life Expectancy (High GDP): 75.0 years
Life Expectancy (Low GDP): 62.8 years


In [19]:
import plotly.express as px

# GDP vs Life Expectancy scatter plot
fig_gdp_life = px.scatter(
    data,
    x='GDP_per_capita',
    y='Life_expectancy',
    color='Economy_status',
    title='GDP per Capita vs Life Expectancy',
    labels={
        'GDP_per_capita': 'GDP per Capita ($)',
        'Life_expectancy': 'Life Expectancy (years)'
    },
    hover_data=['Country']
)

# Display the scatter plot
fig_gdp_life.show()


In [20]:
# Life Expectancy by Economic Status (Box plot)
fig_economy = px.box(
    data,
    x='Economy_status',
    y='Life_expectancy',
    color='Economy_status',
    title='Life Expectancy Distribution by Economic Status',
    labels={'Life_expectancy': 'Life Expectancy (years)'}
)

# Display the box plot
fig_economy.show()


In [21]:
# Select the region for detailed analysis (use a specific region or 'All Regions')
selected_region = 'All Regions'  # Change based on your selection

# Filter the data based on region selection
filtered_data = data if selected_region == 'All Regions' else data[data['Region'] == selected_region]


In [22]:
# Regional GDP Statistics
regional_gdp_mean = filtered_data['GDP_per_capita'].mean()

# Highest GDP country in the selected region
highest_gdp_country = filtered_data.loc[filtered_data['GDP_per_capita'].idxmax(), 'Country']
highest_gdp = filtered_data['GDP_per_capita'].max()

# Lowest GDP country in the selected region
lowest_gdp_country = filtered_data.loc[filtered_data['GDP_per_capita'].idxmin(), 'Country']
lowest_gdp = filtered_data['GDP_per_capita'].min()

# Print regional statistics
print(f"Regional Average GDP: ${regional_gdp_mean:,.0f}")
print(f"Highest GDP: ${highest_gdp:,.0f} ({highest_gdp_country})")
print(f"Lowest GDP: ${lowest_gdp:,.0f} ({lowest_gdp_country})")


Regional Average GDP: $11,541
Highest GDP: $102,973 (Luxembourg)
Lowest GDP: $264 (Somalia)


In [23]:
# Top 10 GDP countries in the selected region
top_gdp = filtered_data.nlargest(10, 'GDP_per_capita')

# Bar plot of Top 10 GDP countries in the selected region
fig_top_gdp = px.bar(
    top_gdp,
    x='Country',
    y='GDP_per_capita',
    color='Economy_status',
    title=f'Top 10 Countries by GDP per Capita ({selected_region})',
    labels={'GDP_per_capita': 'GDP per Capita ($)'}
)

fig_top_gdp.update_layout(xaxis_tickangle=-45)

# Display the bar plot
fig_top_gdp.show()


In [24]:
# GDP vs Life Expectancy in the selected region (Scatter plot)
fig_region_gdp = px.scatter(
    filtered_data,
    x='GDP_per_capita',
    y='Life_expectancy',
    color='Economy_status',
    title=f'GDP vs Life Expectancy in {selected_region}',
    labels={
        'GDP_per_capita': 'GDP per Capita ($)',
        'Life_expectancy': 'Life Expectancy (years)'
    },
    hover_data=['Country']
)

# Display the scatter plot for the selected region
fig_region_gdp.show()


In [25]:
# Select numerical columns for correlation analysis
numerical_cols = [col for col in ['Life_expectancy', 'Adult_mortality', 
                                  'Alcohol_consumption', 'BMI', 
                                  'GDP_per_capita', 'Schooling'] 
                  if col in data.columns]

# Check if we have enough features
if len(numerical_cols) < 2:
    raise ValueError("Not enough numerical features available in the dataset.")

# Calculate the correlation matrix
correlation_matrix = data[numerical_cols].corr()

# Plot the correlation heatmap
fig_corr = px.imshow(
    correlation_matrix,
    title='Feature Correlation Heatmap',
    color_continuous_scale='RdBu',
    aspect='auto',
    labels=dict(color="Correlation")
)
fig_corr.show()


In [26]:
# Define selected features
selected_features = ['Adult_mortality', 'Schooling', 'GDP_per_capita', 'BMI']
if not all(feature in data.columns for feature in selected_features):
    raise ValueError("Some required features are missing from the dataset.")

# Prepare features (X) and target (y)
X = data[selected_features].copy()
y = data['Life_expectancy'].copy()

# Handle missing values
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [27]:
# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Display performance metrics
print(f"R² Score: {r2:.3f}")
print(f"RMSE: {rmse:.2f} years")
print(f"MAE: {mae:.2f} years")


R² Score: 0.940
RMSE: 2.31 years
MAE: 1.79 years


In [28]:
# Predict life expectancy based on input
input_data = np.array([[150, 12, 12000, 25]])  # Replace with actual input values
input_scaled = scaler.transform(input_data)
prediction = model.predict(input_scaled)[0]

# Validate and display prediction
if 0 <= prediction <= 100:
    print(f"Predicted Life Expectancy: {prediction:.1f} years")
else:
    print("Invalid prediction value. Please check input values.")


Predicted Life Expectancy: 75.0 years



X does not have valid feature names, but StandardScaler was fitted with feature names



In [29]:
# Calculate feature importance
importance_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': np.abs(model.coef_)
}).sort_values('Importance', ascending=False)

# Plot feature importance
fig_importance = px.bar(
    importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance in Prediction'
)
fig_importance.show()
