In [None]:
# Proyek Analisis Data Kualitas Air Sungai di China

In [None]:
## 1. Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [None]:
## 2. Load Dataset
# Read the water-pollution CSV into the working frame `df`, then preview the
# first five rows to confirm the file parsed.
# NOTE(review): the "/content/" prefix looks like a Google Colab path; confirm
# the file location before running in another environment.
df = pd.read_csv(
    filepath_or_buffer="/content/china_water_pollution_data.csv",
)
df.head(n=5)

In [None]:
## 3. Assessing Data
# Structural overview of the frame (column dtypes and non-null counts).
print("\n--- Info Dataset ---")
df.info()

# Number of missing cells in each column.
print("\n--- Missing Values ---", df.isna().sum(), sep="\n")

# Number of rows that exactly repeat an earlier row.
print("\n--- Duplicated Rows ---", df.duplicated().sum(), sep="\n")

In [None]:
## 4. Cleaning Data
# Build the cleaned frame as a single method chain rather than a series of
# in-place mutations, so the cell yields the same result each time it is run.
df = (
    df
    # Remove rows that exactly repeat an earlier row.
    .drop_duplicates()
    # Parse the Date column; values that cannot be parsed become NaT
    # because of errors='coerce'.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    # Discard the rows whose date failed to parse.
    .dropna(subset=['Date'])
    # Forward-fill the remaining missing values. DataFrame.ffill() is used
    # because fillna(method='ffill') is deprecated in recent pandas releases.
    # NOTE(review): the fill follows row order, so a value may be carried from
    # one station's row into the next station's row; confirm this is intended.
    .ffill()
    # Lower-case the coordinate column names; the map cell reads
    # "latitude" / "longitude".
    .rename(columns={"Latitude": "latitude", "Longitude": "longitude"})
)

In [None]:
## 5. EDA (Exploratory Data Analysis)
### 5.1 Summary statistics of the water-quality parameters
# DataFrame.describe() does not accept a `numeric_only` keyword (passing it
# raises TypeError), so the numeric columns are selected first and then
# summarised. This also keeps the datetime Date column out of the table.
df.select_dtypes(include="number").describe()

In [None]:
### 5.2 Stations with the most records
# Count rows per monitoring station; value_counts() sorts descending, so the
# first ten entries are the ten most frequent stations.
station_counts = df['Monitoring_Station'].value_counts().head(10)

# Horizontal bar chart; the axis labels are set on the Axes returned by pandas.
station_counts.plot.barh(
    title='Top 10 Station dengan Jumlah Data Terbanyak',
    figsize=(8, 5),
).set(xlabel="Jumlah Data", ylabel="Monitoring Station")
plt.show()

In [None]:
### 5.3 Which parameter has the highest mean
# Column-wise mean of four measured parameters; `param_means` is reused by the
# interactive pie chart in section 6.3.
# NOTE(review): the column names suggest these parameters use different units,
# so the pie slices compare unlike quantities; confirm this is the intended view.
param_means = df.loc[
    :,
    [
        'Water_Temperature_C',
        'Dissolved_Oxygen_mg_L',
        'Turbidity_NTU',
        'Nitrate_mg_L',
    ],
].mean()

# Pie chart of the means; the y-axis label is blanked so that only the title
# and the slice labels are shown.
param_means.plot.pie(
    autopct='%1.1f%%',
    title='Proporsi Rata-rata Parameter Air',
).set_ylabel('')
plt.show()

In [None]:
## 6. Interactive visualisation
### 6.1 Line chart: water temperature over time
# Plotly connects the points of a line in the order the rows are supplied, with
# no automatic reordering. Sorting by Date first keeps each station's line
# moving left to right instead of doubling back when rows are out of order.
# A stable sort preserves the existing row order among equal dates.
fig = px.line(
    df.sort_values('Date', kind='stable'),
    x='Date',
    y='Water_Temperature_C',
    color='Monitoring_Station',
    title='Tren Suhu Air per Station',
)
fig.show()

In [None]:
### 6.2 Bar chart: top 5 stations by mean water temperature
# Mean water temperature per station, ordered from highest to lowest, keeping
# the first five entries.
top5_temp = (
    df.groupby('Monitoring_Station')['Water_Temperature_C']
    .mean()
    .sort_values(ascending=False)
    .iloc[:5]
)

# Horizontal bars: mean temperature on x, station name on y.
fig2 = px.bar(
    top5_temp,
    x=top5_temp.to_numpy(),
    y=top5_temp.index,
    orientation='h',
    title='Top 5 Station - Rata-rata Suhu Air',
)
fig2.show()

In [None]:
### 6.3 Pie chart: composition of the parameter means
# Turn the `param_means` Series (computed in section 5.3) into a two-column
# frame: the index becomes 'Parameter' and the values become 'Rata-rata'.
pie_df = param_means.rename_axis('Parameter').reset_index(name='Rata-rata')

fig3 = px.pie(
    pie_df,
    names='Parameter',
    values='Rata-rata',
    title='Komposisi Rata-rata Parameter Air',
)
fig3.show()

In [None]:
### 6.4 Map: monitoring station locations
# Plot only the rows that have both coordinates present, coloured by station.
# NOTE(review): newer Plotly releases appear to deprecate scatter_mapbox in
# favour of scatter_map; confirm the installed version before migrating.
fig4 = px.scatter_mapbox(
    data_frame=df.loc[df[['latitude', 'longitude']].notna().all(axis=1)],
    lat="latitude",
    lon="longitude",
    color="Monitoring_Station",
    mapbox_style="carto-positron",
    zoom=3,
    title="Peta Lokasi Monitoring Station",
)
fig4.show()

In [None]:
## 7. Correlation between parameters
# Pairwise correlation of five measured parameters, drawn as an annotated
# heatmap on a 10x6 figure. The title is set on the Axes seaborn returns.
plt.figure(figsize=(10, 6))
sns.heatmap(
    data=df[
        [
            'Water_Temperature_C',
            'pH',
            'Dissolved_Oxygen_mg_L',
            'Turbidity_NTU',
            'Nitrate_mg_L',
        ]
    ].corr(),
    annot=True,
    cmap='coolwarm',
).set_title("Heatmap Korelasi antar Parameter Kualitas Air")
plt.show()

In [None]:
## 8. Simpulan Awal
# - Station dengan data terbanyak: lihat entri pertama `station_counts` (bagian 5.2)
# - Parameter dominan: lihat `param_means.idxmax()` (bagian 5.3)
# - Korelasi negatif kuat: suhu & oksigen terlarut (semakin panas, oksigen berkurang)
# Simpulan dan insight lanjutan dapat dikembangkan sesuai kebutuhan.