<a href="https://colab.research.google.com/github/Denianjas/setup-environment-praktikum/blob/main/analisis_data_kematian_rokok_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📊 Analisis Data Kematian Akibat Rokok (WHO)
Dataset ini berisi data jumlah kematian akibat rokok di beberapa negara.
Kita akan memprediksi jumlah kematian dengan regresi linear dan mengelompokkan negara dengan K-Means.

In [None]:
# 📥 1. Import Library yang Dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# 📂 2. Load the dataset
# Read the CSV (path relative to the working directory) into the frame `df`.
csv_path = 'Data_Kematian_Rokok_WHO.csv'
df = pd.read_csv(csv_path)
# Preview the first 5 rows as the cell's displayed output.
df.head(5)

In [None]:
# 🔍 3. Data exploration
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
df.info()
# Descriptive statistics as computed by describe().
print(df.describe())
# Pairwise relationships between variables, with KDE curves on the diagonal.
sns.pairplot(df, diag_kind='kde')
plt.show()

In [None]:
# 🔢 4. Predict the number of deaths with linear regression
# Predictors and target, selected by column name.
feature_columns = ['Year', 'Smokers_Percentage']
X = df[feature_columns]
y = df['Deaths_Tobacco']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Mean absolute error and mean squared error on the held-out rows.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'MAE: {mae}, MSE: {mse}')

In [None]:
# 📈 5. Visualise predictions against actual values
# Explicit figure/axes interface: one 8x5 inch figure holding a single axes.
fig, ax = plt.subplots(figsize=(8, 5))
# Actual values on the x-axis, model predictions on the y-axis.
ax.scatter(y_test, y_pred, color='blue', alpha=0.5)
ax.set_xlabel('Angka Kematian Aktual')
ax.set_ylabel('Prediksi Angka Kematian')
ax.set_title('Prediksi vs Aktual')
plt.show()

In [None]:
# 🏢 6. Cluster countries by tobacco-related deaths
# Build a separate frame indexed by country. set_index() returns a new frame,
# so `df` is left untouched and no in-place mutation is needed.
df_cluster = df[['Country', 'Deaths_Tobacco']].set_index('Country')
# NOTE(review): if the data holds several rows per country (the regression cell
# uses a 'Year' column), each row is clustered separately - confirm intended.

# n_init is passed explicitly: its default differs between scikit-learn
# releases, which otherwise yields a FutureWarning and version-dependent labels.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# Fit on the death counts only, named explicitly so the feature set does not
# depend on which columns the frame happens to contain.
df_cluster['Cluster'] = kmeans.fit_predict(df_cluster[['Deaths_Tobacco']])

# One point per row: country on the x-axis, deaths on the y-axis, coloured by cluster.
sns.scatterplot(x=df_cluster.index, y=df_cluster['Deaths_Tobacco'], hue=df_cluster['Cluster'], palette='viridis', s=100)
plt.xticks(rotation=90)  # rotate the country labels on the x-axis
plt.title('Clustering Negara Berdasarkan Kematian Akibat Rokok')
plt.show()