In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Load the deaths-by-cause dataset from a CSV file in the working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError, i.e. the
# file was not found the last time the notebook was run — confirm the path.

risk_data = pd.read_csv('causes_death.csv')

# Preview the first rows
risk_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'causes_death.csv'

In [None]:
# Every raw column name follows the same pattern,
#   'Deaths - <cause> - Sex: Both - Age: Under 5 (Number)',
# so the old-name -> short-name mapping is generated from the list of causes
# rather than being written out pair by pair.
causes_of_death = [
    'Invasive Non-typhoidal Salmonella (iNTS)',
    'Interpersonal violence',
    'Nutritional deficiencies',
    'Acute hepatitis',
    'Neoplasms',
    'Measles',
    'Digestive diseases',
    'Cirrhosis and other chronic liver diseases',
    'Chronic kidney disease',
    'Cardiovascular diseases',
    'Congenital birth defects',
    'Lower respiratory infections',
    'Neonatal preterm birth',
    'Environmental heat and cold exposure',
    'Neonatal sepsis and other neonatal infections',
    'Exposure to forces of nature',
    'Diabetes mellitus',
    'Neonatal encephalopathy due to birth asphyxia and trauma',
    'Meningitis',
    'Other neonatal disorders',
    'Whooping cough',
    'Diarrheal diseases',
    'Fire, heat, and hot substances',
    'Road injuries',
    'Tuberculosis',
    'HIV/AIDS',
    'Drowning',
    'Malaria',
    'Syphilis',
]

column_mapping = {
    f'Deaths - {cause} - Sex: Both - Age: Under 5 (Number)': cause
    for cause in causes_of_death
}

# Apply the mapping to the column labels of risk_data (in place)
risk_data.rename(columns=column_mapping, inplace=True)

# Show the frame with the shortened column names
risk_data.head()



In [None]:
# Preview the last rows of the renamed frame
risk_data.tail()

In [None]:
# Inspect the 'Entity' column: as seen above, it also contains non-country
# entries (such as World Bank groupings). List every distinct value to find
# out which other non-country entries are present.
column_name = risk_data['Entity']  

# Print the distinct entity names
print(column_name.unique()) 

In [None]:
# Remove the non-country entries: they do not refer to a specific country and
# are therefore irrelevant to a country-level analysis.
# NOTE(review): besides regional/income aggregates, this list also contains
# names such as 'North Korea', 'Taiwan', 'Palestine' and 'American Samoa' that
# do not look like aggregates — confirm these exclusions are intentional.

non_countries = [
    'African Region (WHO)', 'American Samoa', 'East Asia & Pacific (WB)',
    'Eastern Mediterranean Region (WHO)', 'England', 'Europe & Central Asia (WB)',
    'European Region (WHO)', 'G20', 'Latin America & Caribbean (WB)',
    'Micronesia (country)', 'Middle East & North Africa (WB)', 'North America (WB)',
    'North Korea', 'Northern Ireland', 'Northern Mariana Islands', 'OECD Countries',
    'Palestine', 'Region of the Americas (WHO)', 'Scotland', 'South Asia (WB)',
    'South-East Asia Region (WHO)', 'Sub-Saharan Africa (WB)', 'Taiwan',
    'Western Pacific Region (WHO)', 'World', 'World Bank High Income',
    'World Bank Low Income', 'World Bank Lower Middle Income',
    'World Bank Upper Middle Income', 'Wales'
]

# Keep only the rows whose 'Entity' is NOT in the exclusion list
risk_data = risk_data[~risk_data['Entity'].isin(non_countries)]



In [None]:
# Column dtypes and non-null counts after filtering the entities
risk_data.info()

In [None]:
# Load the per-capita health expenditure vs. child mortality dataset

child_data = pd.read_csv('per-capita-total-expenditure-on-health-vs-child-mortality.csv')

# Preview the first rows
child_data.head()

In [None]:
# Coerce 'Year' to a numeric dtype; entries that cannot be parsed become NaN
child_data['Year'] = pd.to_numeric(child_data['Year'], errors='coerce')

# Restrict the data to the years 1990 through 2020. Series.between is
# inclusive on both ends, and rows with a NaN year are left out.
child_data_filtered = child_data[child_data['Year'].between(1990, 2020)]

# Preview the filtered frame
child_data_filtered.head()

In [None]:
# Drop the columns that are not needed for the analysis.
# child_data_filtered was obtained by boolean-indexing child_data, so dropping
# with inplace=True would modify a slice of another frame (the situation pandas
# reports as SettingWithCopyWarning). Re-binding the name to the frame returned
# by drop() avoids mutating the slice.
child_data_filtered = child_data_filtered.drop(
    columns=['Current health expenditure per capita, PPP (current international $)', 'Continent']
)

# Preview the frame after dropping the columns
child_data_filtered.head()

In [None]:
# Column dtypes and non-null counts of the filtered child-mortality data
child_data_filtered.info()

In [None]:
# Inner-join the cause-of-death data with the child-mortality data: only rows
# whose ('Entity', 'Code', 'Year') combination occurs in both frames are kept
merged_df = risk_data.merge(child_data_filtered, how='inner', on=['Entity', 'Code', 'Year'])

# Preview the joined frame
merged_df.head()


In [None]:
# Column dtypes and non-null counts after the first merge
merged_df.info()

In [None]:
# Load the mortality-rates-by-sex dataset (the file is decoded as Latin-1)

sex_data = pd.read_csv('mortality_rates_sexes.csv', encoding='latin1')

# Preview the first rows
sex_data.head()

In [None]:
# Discard the columns that are not used in the analysis
sex_data = sex_data.drop(columns=['Population (historical estimates)', 'Continent'])

# Preview the frame after dropping the columns
sex_data.head()

In [None]:
# Inner-join the running merge with the by-sex mortality rates on the shared
# ('Entity', 'Code', 'Year') key
merged_df2 = merged_df.merge(sex_data, how='inner', on=['Entity', 'Code', 'Year'])

# Preview the joined frame
merged_df2.head()



In [None]:
# Column dtypes and non-null counts after the second merge
merged_df2.info()

In [None]:
# Load the deaths-by-age dataset (the file is decoded as Latin-1)

age_data = pd.read_csv('deaths_ages.csv', encoding='latin1')

# Preview the first rows
age_data.head()

In [None]:
# Column dtypes and non-null counts of the deaths-by-age data
age_data.info()

In [None]:
# Inner-join the running merge with the deaths-by-age data on the shared
# ('Entity', 'Code', 'Year') key
merged_df3 = merged_df2.merge(age_data, how='inner', on=['Entity', 'Code', 'Year'])

# Preview the joined frame
merged_df3.head()

In [None]:
# Column dtypes and non-null counts after the third merge
merged_df3.info()

In [None]:
# 'Code' was used as a merge key above and is dropped here.
# Note: this cell re-binds merged_df3, so running it a second time raises a
# KeyError because 'Code' is no longer present.
merged_df3 = merged_df3.drop(columns=['Code'])


In [None]:
# Preview the frame without the 'Code' column
merged_df3.head()

In [None]:
# Count the missing values (null / NaN) in every column
null_values = merged_df3.isnull().sum()

# Display the count of null values for each column
print(null_values)

In [None]:
# The three under-5 mortality-rate columns (overall, female, male) to check
columns_with_null = ['Mortality rate, under-5 (per 1,000 live births)', 'Mortality rate, under-5, female (per 1,000 live births)', 'Mortality rate, under-5, male (per 1,000 live births)']

# Select the rows in which at least one of those columns is null
null_rows = merged_df3[merged_df3[columns_with_null].isnull().any(axis=1)]

# Display the rows where any of the specified columns have null values
print(null_rows)

In [None]:
# Drop every row that has a missing value in ANY column (not only the three
# mortality-rate columns inspected above)
merged_df3 = merged_df3.dropna()

# Preview the cleaned frame
merged_df3.head()

In [None]:
# Column dtypes and non-null counts after dropping incomplete rows
merged_df3.info()

In [None]:
# Load the GDP dataset (the file is decoded as Latin-1)

GDP_data = pd.read_csv('gdp.csv', encoding='latin1')

# Preview the first rows
GDP_data.head()

In [None]:
import pandas as pd

# Reshape GDP_data from wide to long: 'Country Name' is kept as the identifier
# and every other column becomes rows, with the former column label stored in
# 'Year' and its cell value stored in 'GDP'.
# NOTE(review): this assumes every column other than 'Country Name' is a year
# column; any other identifier column would end up in 'Year' — verify against
# the CSV.
melted_df = pd.melt(GDP_data, id_vars=['Country Name'], var_name='Year', value_name='GDP')

# Print the first rows of the long-format frame
print(melted_df.head())

In [None]:
# Count the missing values (null / NaN) in every column of the long GDP frame
null_values = melted_df.isnull().sum()

# Display the count of null values for each column
print(null_values)

In [None]:
# Drop the rows of the long GDP frame that contain any missing value
gdp_df = melted_df.dropna()

# Preview the cleaned GDP frame
gdp_df.head()

In [None]:
# Column dtypes and non-null counts of the cleaned GDP frame
gdp_df.info()

In [None]:
# The 'Year' values come from the var_name column produced by pd.melt; cast
# them to int before 'Year' is used as a merge key. assign() returns a new
# frame, so the result of dropna() is not modified through column assignment.
gdp_df = gdp_df.assign(Year=gdp_df['Year'].astype(int))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would additionally print "None".
gdp_df.info()

In [None]:
# Inner-join the merged mortality data with the GDP data, matching
# ('Entity', 'Year') on the left with ('Country Name', 'Year') on the right
final_df = pd.merge(merged_df3, gdp_df, left_on=['Entity','Year'], right_on=['Country Name','Year'])

# Drop 'Country Name': after the join it duplicates 'Entity'
final_df.drop('Country Name', axis=1, inplace=True)

# Display the merged DataFrame
final_df.head()

In [None]:
# Column dtypes and non-null counts of the final merged frame
final_df.info()

In [None]:
# Count the missing values (null / NaN) in every column of the final frame
null_values = final_df.isnull().sum()

# Display the count of null values for each column
print(null_values)

In [None]:
from sklearn.ensemble import IsolationForest
# Handling Outliers

# Columns written by this cell. They are excluded from the model input so that
# re-running the cell does not feed the previous run's flags back in as features.
outlier_columns = ['Outlier_Scores', 'Is_Outlier']

# As before, the first column of final_df is skipped (presumably 'Entity', the
# only label column — verify); every remaining column is used as a feature.
feature_columns = [col for col in final_df.columns[1:] if col not in outlier_columns]

# Initializing the IsolationForest model with a contamination parameter of 0.05
model = IsolationForest(contamination=0.05, random_state=0)

# fit_predict returns -1 for outliers and 1 for inliers. Despite its name,
# 'Outlier_Scores' therefore holds these labels, not continuous scores.
# (The frame is converted to NumPy to avoid a feature-name warning.)
final_df['Outlier_Scores'] = model.fit_predict(final_df[feature_columns].to_numpy())

# Binary flag: 1 for outliers, 0 for inliers
final_df['Is_Outlier'] = (final_df['Outlier_Scores'] == -1).astype(int)

# Display the first few rows of the dataframe
final_df.head()

In [None]:
# Share of rows per 'Is_Outlier' value, expressed in percent
outlier_percentage = final_df['Is_Outlier'].value_counts(normalize=True) * 100

# Horizontal bar chart of the two shares
plt.figure(figsize=(12, 4))
ax = outlier_percentage.plot.barh(color='#000080')

# Write each percentage at the end of its bar
for index, value in enumerate(outlier_percentage):
    ax.text(value, index, f'{value:.2f}%', fontsize=15)

ax.set_title('Percentage of Inliers and Outliers')
ax.set_xticks(np.arange(0, 115, 5))
ax.set_xlabel('Percentage (%)')
ax.set_ylabel('Is Outlier')
ax.invert_yaxis()
plt.show()

In [None]:
# Collect the rows flagged as outliers (Is_Outlier == 1) for inspection
outliers_data = final_df[final_df['Is_Outlier'] == 1]
outliers_data

In [None]:
# Build the cleaned dataset in one chain: keep only the rows flagged as
# inliers, discard the two helper columns added for outlier detection, and
# renumber the rows from 0.
mortality_df = (
    final_df.loc[final_df['Is_Outlier'] == 0]
    .drop(columns=['Outlier_Scores', 'Is_Outlier'])
    .reset_index(drop=True)
)

In [None]:
# Column dtypes and non-null counts of the outlier-free dataset
mortality_df.info()

In [None]:
#Correlation Analysis
from matplotlib.colors import LinearSegmentedColormap

# Switch the seaborn background style
sns.set_style('whitegrid')

# Pairwise correlations of every column except the 'Entity' label column
corr = mortality_df.drop(columns=['Entity']).corr()

# Custom colormap interpolated between these five colours
colors = ['#0047AB', '#2D65C9', '#5993E5', '#B3D0F4', '#000080']
my_cmap = LinearSegmentedColormap.from_list('custom_map', colors, N=256)

# The matrix is symmetric about its main diagonal, so hide everything strictly
# above the diagonal (True = hidden) and show only the lower triangle
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

# Draw the annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, mask=mask, cmap=my_cmap, annot=True, center=0, fmt='.2f', linewidths=2)
plt.title('Correlation Matrix', fontsize=14)
plt.show()

In [None]:
# Correlation matrix of mortality_df without the 'Entity' label column
# (the same computation as the heatmap above, shown here as numbers)
corr_matrix = mortality_df.drop(columns=['Entity']).corr()

# Print correlation matrix
print(corr_matrix)


In [None]:
# Store 'Year' as a categorical column in mortality_df
mortality_df['Year'] = mortality_df['Year'].astype('category')


In [None]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
# Initialize the StandardScaler
scaler = StandardScaler()

# Columns left unscaled: the 'Entity' label and the categorical 'Year'
columns_to_exclude = ['Entity', 'Year']

# Every other column is scaled. Index.difference returns the labels sorted,
# which is harmless here because the same index is used on both sides of the
# assignment below.
columns_to_scale = mortality_df.columns.difference(columns_to_exclude)

# Work on a copy so mortality_df keeps the unscaled values
mortality_df_scaled = mortality_df.copy()

# Applying the scaler to the necessary columns in the dataset
mortality_df_scaled[columns_to_scale] = scaler.fit_transform(mortality_df_scaled[columns_to_scale])

# Display the first few rows of the scaled data
mortality_df_scaled.head()

In [None]:
# Display the scaled frame
mortality_df_scaled

In [None]:
#Dimensionality Reduction(PCA)
from sklearn.decomposition import PCA
# Define the number of principal components to keep
n_components = 6  
# PCA input: the scaled features without the 'Entity' and 'Year' columns
mortality_df_scaled1 = mortality_df_scaled.drop(columns=['Entity', 'Year'])
# Perform PCA
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(mortality_df_scaled1)

# Create a DataFrame for the principal components, named PC1 .. PC6
columns = [f'PC{i}' for i in range(1, n_components + 1)]
df_pca = pd.DataFrame(data=X_pca, columns=columns)

# Add the 'Year' column back. The assignment aligns on the index, which is
# assumed to be the same default 0..n-1 index in both frames — TODO confirm
# if the upstream cells change.
df_pca['Year'] = mortality_df_scaled['Year']

# Display the DataFrame with principal components
print(df_pca.head())

In [None]:
# Move 'Year' into the index so that only the PC columns remain as data
# columns. This mutates df_pca in place, so running the cell a second time
# fails because 'Year' is no longer a column.
df_pca.set_index('Year', inplace=True)
print(df_pca.head())

In [None]:
# Importing necessary libraries
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

#K-Means Clustering
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
# Set plot style, and background color (a global seaborn setting, so it also
# affects the plots drawn by later cells)
sns.set(style='darkgrid', rc={'axes.facecolor': '#fcf0dc'})

# Set the color palette for the plot
sns.set_palette(['#ff6200'])

# Instantiate the clustering model; n_clusters is not given here because the
# elbow visualizer is handed the range of k values below
km = KMeans(init='k-means++', n_init=10, max_iter=100, random_state=0)

# Create a figure and axis with the desired size
fig, ax = plt.subplots(figsize=(12, 5))

# Instantiate the KElbowVisualizer with the model and range of k values, and disable the timing plot
visualizer = KElbowVisualizer(km, k=(2, 15), timings=False, ax=ax)

# Fit the data to the visualizer
visualizer.fit(df_pca)

# Finalize and render the figure
visualizer.show();

In [None]:
def silhouette_analysis(df, start_k, stop_k, figsize=(15, 16)):
    """
    Perform Silhouette analysis for a range of k values and visualize the results.

    The figure consists of one full-width panel showing the average silhouette
    score for every k, followed by one silhouette plot per k, two per row.

    Parameters
    ----------
    df : array-like
        Data to cluster; passed unchanged to KMeans, silhouette_score and
        SilhouetteVisualizer.
    start_k, stop_k : int
        First and last number of clusters to evaluate (both inclusive).
    figsize : tuple, default (15, 16)
        Size of the whole figure, forwarded to plt.figure.

    Raises
    ------
    ValueError
        If start_k is smaller than 2 or stop_k is smaller than start_k.
    """
    # Imported locally so the function does not depend on imports that are
    # executed in a later notebook cell.
    import matplotlib.gridspec as gridspec
    from sklearn.metrics import silhouette_score

    if start_k < 2 or stop_k < start_k:
        raise ValueError('silhouette_analysis expects 2 <= start_k <= stop_k')

    k_values = range(start_k, stop_k + 1)

    # Set the size of the figure
    plt.figure(figsize=figsize)

    # One row for the score curve plus one row per PAIR of silhouette plots.
    # (Allocating one row per k left half of the figure empty and raised an
    # IndexError when only a single k was requested.)
    n_rows = 1 + (len(k_values) + 1) // 2
    grid = gridspec.GridSpec(n_rows, 2)

    # The score curve spans both columns of the first row
    first_plot = plt.subplot(grid[0, :])

    # First plot: Silhouette scores for different k values
    sns.set_palette(['darkorange'])

    silhouette_scores = []
    for k in k_values:
        km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=0)
        labels = km.fit_predict(df)
        silhouette_scores.append(silhouette_score(df, labels))

    best_k = start_k + silhouette_scores.index(max(silhouette_scores))

    first_plot.plot(k_values, silhouette_scores, marker='o')
    first_plot.set_xticks(k_values)
    first_plot.set_xlabel('Number of clusters (k)')
    first_plot.set_ylabel('Silhouette score')
    first_plot.set_title('Average Silhouette Score for Different k Values', fontsize=15)

    # Add the optimal k value text to the plot. The box is positioned in axes
    # coordinates so it stays inside the panel for any k range or score range.
    optimal_k_text = f'The k value with the highest Silhouette score is: {best_k}'
    first_plot.text(0.98, 0.05, optimal_k_text, fontsize=12, verticalalignment='bottom',
                    horizontalalignment='right', transform=first_plot.transAxes,
                    bbox=dict(facecolor='#fcc36d', edgecolor='#ff6200', boxstyle='round, pad=0.5'))

    # Second part: one silhouette plot per k value, filled row by row
    colors = sns.color_palette("bright")

    for position, k in enumerate(k_values):
        km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=0)
        row_idx, col_idx = divmod(position, 2)

        # Row 0 holds the score curve, so the silhouette plots start at row 1
        ax = plt.subplot(grid[row_idx + 1, col_idx])

        visualizer = SilhouetteVisualizer(km, colors=colors, ax=ax)
        visualizer.fit(df)

        # Reuse the score from the first pass: the model parameters and the
        # random_state are identical, so recomputing it would only cost time.
        score = silhouette_scores[position]
        ax.text(0.97, 0.02, f'Silhouette Score: {score:.2f}', fontsize=12,
                ha='right', transform=ax.transAxes, color='red')

        ax.set_title(f'Silhouette Plot for {k} Clusters', fontsize=15)

    plt.tight_layout()
    plt.show()

In [None]:
# gridspec and silhouette_score are the names silhouette_analysis looks up at
# call time, so these imports must run before the call below
import matplotlib.gridspec as gridspec
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# Evaluate k = 3 .. 12 on the PCA-reduced data
silhouette_analysis(df_pca, 3, 12, figsize=(20, 50))

In [None]:
from collections import Counter
# k = 3 was chosen as the number of clusters

# Fit on the principal-component columns only. The 'cluster' column that this
# cell appends to df_pca is excluded, so re-running the cell does not feed the
# previous labels back into KMeans.
pca_features = df_pca.drop(columns=['cluster'], errors='ignore')

kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=100, random_state=0)
kmeans.fit(pca_features)

# Size of each raw KMeans cluster (kept for inspection; not used for relabelling)
cluster_frequencies = Counter(kmeans.labels_)

# Fixed relabelling of the raw KMeans labels: 1 -> 2, 0 -> 1, 2 -> 0.
# This cell previously also built a frequency-ordered mapping, but it was
# overwritten by this one on the very next statement and never took effect,
# so only the mapping that was actually applied is kept.
# NOTE(review): the mapping is tied to the raw labels this particular fit
# yields (random_state=0) — confirm it still matches if the data change.
label_mapping = {1: 2, 0: 1, 2: 0}

# Apply the mapping to get the new labels
new_labels = np.array([label_mapping[label] for label in kmeans.labels_])

# Append the new cluster labels back to the original dataset
mortality_df['cluster'] = new_labels

# Append the new cluster labels to the PCA version of the dataset
df_pca['cluster'] = new_labels

In [None]:
# Percentage of rows (one row per Entity/Year observation) in each cluster
cluster_percentage = (df_pca['cluster'].value_counts(normalize=True) * 100).reset_index()
cluster_percentage.columns = ['Cluster', 'Percentage']
cluster_percentage = cluster_percentage.sort_values(by='Cluster')

# The palette is defined locally instead of reading the global `colors`, which
# other cells rebind; these are the three colours the bars received on a
# top-to-bottom run of the notebook.
cluster_palette = ['#0047AB', '#2D65C9', '#5993E5']

# Create a horizontal bar plot
plt.figure(figsize=(10, 4))
sns.barplot(x='Percentage', y='Cluster', data=cluster_percentage, orient='h', palette=cluster_palette)

# Adding percentages on the bars
for index, value in enumerate(cluster_percentage['Percentage']):
    plt.text(value+0.5, index, f'{value:.2f}%')

# The rows are country-year observations; the earlier title said "Customers"
plt.title('Distribution of Observations Across Clusters', fontsize=14)
plt.xticks(ticks=np.arange(0, 50, 5))
plt.xlabel('Percentage (%)')

# Show the plot
plt.show()

In [None]:
# Column dtypes and non-null counts now that 'cluster' has been added
mortality_df.info()

In [None]:
# 'Year' was turned into a categorical column earlier; convert it back to a
# numeric dtype before the per-cluster means are computed
mortality_df['Year'] = pd.to_numeric(mortality_df['Year'], errors='coerce')

# Mean of every column per cluster, excluding the 'Entity' label column
cluster_averages = mortality_df.drop(columns=['Entity']).groupby('cluster').mean()

# One colour per cluster. Note that this rebinds the global name `colors`.
colors = plt.cm.tab10(np.linspace(0, 1, len(cluster_averages)))

# One separate bar chart per column, with one bar per cluster
# (the enumerate counter `i` is not used inside the loop)
for i, column in enumerate(cluster_averages.columns):
    plt.figure(figsize=(8, 6))
    plt.bar(cluster_averages.index, cluster_averages[column], color=colors)
    plt.title(f'Average {column} by Cluster')
    plt.xlabel('Cluster')
    plt.ylabel(f'Average {column}')
    plt.xticks(cluster_averages.index)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

In [None]:
# Sort the DataFrame by 'Mortality Rate' column
mortality_df_sorted = mortality_df.sort_values(by='Mortality rate, under-5 (per 1,000 live births)')

# Group the sorted DataFrame by 'cluster'
grouped = mortality_df_sorted.groupby('cluster')

# List the top 5 countries in each cluster
for cluster, group in grouped:
    print(f"Cluster {cluster}:")
    top_5_countries = group.head(5)
    print(top_5_countries)
    print()

In [None]:
# Create the 'Risk' column from the cluster label: cluster 0 is labelled
# 'High', every other cluster 'Low'
# NOTE(review): this depends on which relabelled cluster ends up as 0 — verify
# against the per-cluster averages before trusting the labels.
mortality_df['Risk'] = np.where(mortality_df['cluster'] == 0, 'High', 'Low')

# Print the DataFrame to verify the changes
print(mortality_df.head())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Data Preparation
# NOTE(review): 'Risk' is derived from the cluster labels, which were computed
# from these same features (via scaling and PCA), so the scores below measure
# how well the clustering can be reproduced — confirm this is the intent.
X = mortality_df.drop(['Risk', 'Entity','cluster'], axis=1)  # Features (excluding the 'Risk', 'Entity' and 'cluster' columns)
y = mortality_df['Risk']  # Target variable ('Risk')

# Step 2: Split Data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Feature Scaling
# The scaler is fitted on the training split only and then applied to the test
# split. Note that `scaler` rebinds the name used by the clustering section.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Model Training
# Note that `model` rebinds the name that previously held the IsolationForest.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Step 5: Model Evaluation
y_pred = model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Other evaluation metrics
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)



In [None]:
# Names of the model inputs and the fitted logistic-regression coefficients
# (one coefficient per feature, taken from the first row of coef_)
feature_names = X.columns
coefficients = model.coef_[0]

# One row per feature, ranked by the magnitude of its coefficient so that the
# most influential features come first
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)

# Print the ranked table
print(feature_importance_df)
