In [1]:
import pandas as pd
import numpy as np


In [2]:

file_path = 'data_kotor.csv'
try:
    df = pd.read_csv(file_path)
    print("--- DATA AWAL (SEBELUM DIBERSIHKAN) ---")
    print(df.head())
    print("\n--- Info Awal (Tipe Data & Missing) ---")
    df.info()
    print("\n--- Jumlah Duplikat Awal ---")
    print(f"Total baris duplikat: {df.duplicated().sum()}")
except Exception as e:
    print(f"Error saat membaca file: {e}")
    exit()


--- DATA AWAL (SEBELUM DIBERSIHKAN) ---
   Rank  Peak All Time Peak  Actual gross Adjusted gross (in 2022 dollars)  \
0     1     1             2  $780,000,000                     $780,000,000   
1     2     1          7[2]  $579,800,000                     $579,800,000   
2     3  1[4]          2[5]  $411,000,000                     $560,622,615   
3     4  2[7]         10[7]  $397,300,000                     $454,751,555   
4     5  2[4]           NaN  $345,675,146                     $402,844,849   

         Artist                   Tour title    Year(s)  Shows Average gross  \
0  Taylor Swift              The Eras Tour †  2023–2024     56   $13,928,571   
1       Beyoncé       Renaissance World Tour       2023     56   $10,353,571   
2       Madonna  Sticky & Sweet Tour ‡[4][a]  2008–2009     85    $4,835,294   
3          Pink  Beautiful Trauma World Tour  2018–2019    156    $2,546,795   
4  Taylor Swift      Reputation Stadium Tour       2018     53    $6,522,173   

  Ref.  
0

In [3]:


print("\n--- MEMULAI PEMBERSIHAN ---")


df_clean = df.copy()


if 'Ref.' in df_clean.columns:
    df_clean = df_clean.drop(columns=['Ref.'])
    print("1. Kolom 'Ref.' telah dihapus.")


if 'Peak' in df_clean.columns:
    # Ubah jadi string -> split -> ambil bagian pertama
    df_clean['Peak'] = df_clean['Peak'].astype(str).str.split('[').str[0]
    # Ubah jadi numerik
    df_clean['Peak'] = pd.to_numeric(df_clean['Peak'], errors='coerce')

if 'All Time Peak' in df_clean.columns:
    df_clean['All Time Peak'] = df_clean['All Time Peak'].astype(str).str.split('[').str[0]
    df_clean['All Time Peak'] = pd.to_numeric(df_clean['All Time Peak'], errors='coerce')
    

currency_cols = ['Actual gross', 'Adjusted gross (in 2022 dollars)', 'Average gross']
for col in currency_cols:
    if col in df_clean.columns:
        cleaned_col = df_clean[col].astype(str).str.replace('$', '')
        cleaned_col = cleaned_col.str.replace(',', '')
        
        df_clean[col] = pd.to_numeric(cleaned_col, errors='coerce')

if 'Shows' in df_clean.columns:
     df_clean['Shows'] = pd.to_numeric(df_clean['Shows'], errors='coerce')

print("2. Kolom teks/mata uang telah dibersihkan dan diubah ke numerik (tanpa 're').")


for col in df_clean.columns:
    if pd.api.types.is_numeric_dtype(df_clean[col]):
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)
    elif pd.api.types.is_object_dtype(df_clean[col]):
        mode_val = df_clean[col].mode()
        if not mode_val.empty:
            df_clean[col] = df_clean[col].fillna(mode_val[0])
        else:
            df_clean[col] = df_clean[col].fillna("Unknown")

print("3. Missing values (NaN) telah diisi (imputed).")
print("\n--- Cek Missing Values (Setelah) ---")
print(df_clean.isnull().sum())

baris_duplikat_awal = df_clean.duplicated().sum()
df_clean = df_clean.drop_duplicates(keep='first')
baris_duplikat_akhir = df_clean.duplicated().sum()

print(f"4. Data duplikat telah ditangani. (Dihapus: {baris_duplikat_awal} baris)")
print(f"   Jumlah duplikat sekarang: {baris_duplikat_akhir}")




--- MEMULAI PEMBERSIHAN ---
1. Kolom 'Ref.' telah dihapus.
2. Kolom teks/mata uang telah dibersihkan dan diubah ke numerik (tanpa 're').
3. Missing values (NaN) telah diisi (imputed).

--- Cek Missing Values (Setelah) ---
Rank                                0
Peak                                0
All Time Peak                       0
Actual gross                        0
Adjusted gross (in 2022 dollars)    0
Artist                              0
Tour title                          0
Year(s)                             0
Shows                               0
Average gross                       0
dtype: int64
4. Data duplikat telah ditangani. (Dihapus: 1 baris)
   Jumlah duplikat sekarang: 0


In [4]:

output_file = 'student_scores_cleaned.csv'
try:
    df_clean.to_csv(output_file, index=False)
    print(f"\n--- BERHASIL DISIMPAN ---")
    print(f"Data bersih telah disimpan ke: {output_file}")

    print("\n--- DATA AKHIR (SETELAH DIBERSIHKAN) ---")
    print(df_clean.head())
    print("\n--- Info Akhir (Tipe Data & Missing) ---")
    df_clean.info()

except Exception as e:
    print(f"Error saat menyimpan file: {e}")


--- BERHASIL DISIMPAN ---
Data bersih telah disimpan ke: student_scores_cleaned.csv

--- DATA AKHIR (SETELAH DIBERSIHKAN) ---
   Rank  Peak  All Time Peak  Actual gross Adjusted gross (in 2022 dollars)  \
0     1   1.0            2.0  $780,000,000                     $780,000,000   
1     2   1.0            7.0  $579,800,000                     $579,800,000   
2     3   1.0            2.0  $411,000,000                     $560,622,615   
3     4   2.0           10.0  $397,300,000                     $454,751,555   
4     5   2.0            8.5  $345,675,146                     $402,844,849   

         Artist                   Tour title    Year(s)  Shows  Average gross  
0  Taylor Swift              The Eras Tour †  2023–2024     56       13928571  
1       Beyoncé       Renaissance World Tour       2023     56       10353571  
2       Madonna  Sticky & Sweet Tour ‡[4][a]  2008–2009     85        4835294  
3          Pink  Beautiful Trauma World Tour  2018–2019    156        2546795 