In [1]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture as gm
from sklearn.impute import SimpleImputer as si

In [2]:
# Load the candidate dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('calon_tni2.csv')

# Plain-text dump of the whole frame.
print(df)

# Banner, then the number of missing (NaN) entries in each column, then a closing rule.
print("\n\n", "=" * 40)
print("=== Menampilkan missing value per kolom===", "=" * 40, sep="\n")
missingData = df.isna().sum()
print(missingData, "=" * 40, sep="\n")

    nama                    alamat  tinggi jenis_kelamin        gaji
0   Andi        Jl. Merdeka No. 10   175.0             L     5000000
1   Budi        Jl. Sudirman No. 5   170.0             L     4500000
2  Citra        Jl. Thamrin No. 20   180.0             P     6000000
3   Dewi  Jl. Gatot Subroto No. 15   180.0             P     5500000
4    Eka     Jl. Rasuna Said No. 8   183.0             L     7000000
5    NaN       Jl. Kuningan No. 12     NaN             L     4800000
6   Gina          Jl. Cikini No. 3   170.0             P     9000000
7   Hadi        Jl. Palmerah No. 7   168.0             L  1000000000
8  Indah         Jl. Senayan No. 9   178.0             P     2000000
9  Jilan        Jl. Kuningan No. 2   160.0             P     3000000


=== Menampilkan missing value per kolom===
nama             1
alamat           0
tinggi           1
jenis_kelamin    0
gaji             0
dtype: int64


In [18]:
# Descriptive statistics for the two numeric columns of the raw frame `df`
# (computed before any cleaning, so the extreme salary is still included).
# Each entry pairs the printed label with the NumPy reducer that produces the value.
# np.std is called with its default ddof=0, i.e. the population standard deviation.
ringkasan = (
    ("Rata-rata", np.mean),
    ("Max", np.max),
    ("Min", np.min),
    ("Standar deviasi", np.std),
)

for urutan, kolom in enumerate(("tinggi", "gaji")):
    if urutan:
        print()  # blank separator line between the per-column groups
    for label, fungsi in ringkasan:
        print(f"{label} {kolom}:", fungsi(df[kolom]))

Rata-rata tinggi: 173.77777777777777
Max tinggi: 183.0
Min tinggi: 160.0
Standar deviasi tinggi: 6.9406656385517405

Rata-rata gaji: 104680000.0
Max gaji: 1000000000
Min gaji: 2000000
Standar deviasi gaji: 298445726.3892381


In [21]:
print("\n\n","="*40)
print("=== Hapus informasi missing value ===")
print("="*40)
# Drop every row that has at least one missing field; `df` itself is left untouched.
df_cleaned = df.dropna()
print(df_cleaned)

print("\n\n","="*40)
print("=== Menghapus data yang merusak ===")
print("="*40)
# Remove salary outliers with Tukey's IQR rule instead of comparing against one
# hard-coded value: a row is kept only when its `gaji` lies within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The previous filter (`gaji != 1000000000`) only
# caught that exact number, so any other extreme salary would pass unnoticed.
IQR_FACTOR = 1.5  # conventional Tukey fence multiplier
q1, q3 = df_cleaned["gaji"].quantile([0.25, 0.75])
iqr = q3 - q1
batas_bawah = q1 - IQR_FACTOR * iqr
batas_atas = q3 + IQR_FACTOR * iqr
# `between` is inclusive on both ends by default.
df_no_outliers = df_cleaned[df_cleaned["gaji"].between(batas_bawah, batas_atas)]
print(df_no_outliers)



=== Hapus informasi missing value ===
    nama                    alamat  tinggi jenis_kelamin        gaji
0   Andi        Jl. Merdeka No. 10   175.0             L     5000000
1   Budi        Jl. Sudirman No. 5   170.0             L     4500000
2  Citra        Jl. Thamrin No. 20   180.0             P     6000000
3   Dewi  Jl. Gatot Subroto No. 15   180.0             P     5500000
4    Eka     Jl. Rasuna Said No. 8   183.0             L     7000000
6   Gina          Jl. Cikini No. 3   170.0             P     9000000
7   Hadi        Jl. Palmerah No. 7   168.0             L  1000000000
8  Indah         Jl. Senayan No. 9   178.0             P     2000000
9  Jilan        Jl. Kuningan No. 2   160.0             P     3000000


=== Menghapus data yang merusak ===
    nama                    alamat  tinggi jenis_kelamin     gaji
0   Andi        Jl. Merdeka No. 10   175.0             L  5000000
1   Budi        Jl. Sudirman No. 5   170.0             L  4500000
2  Citra        Jl. Thamrin No. 20

In [28]:
# Toy 2-D dataset with missing entries.
# Missing values are written as np.nan in an explicit float array. The earlier
# version used None, which produced an object-dtype array and relied on an
# implicit object -> float conversion before imputation could work.
data = np.array([
    [1, 2],
    [2, 3],
    [np.nan, 6],
    [6, 8],
    [np.nan, np.nan],
    [3, 4]
], dtype=float)

# Replace each missing entry with the mean of the observed values in its column.
# Note that the all-missing row becomes exactly the vector of column means.
imputer = si(strategy='mean')
dataImputed = imputer.fit_transform(data)
print(dataImputed)

# Fit a two-component Gaussian mixture; random_state pins the initialisation
# so repeated runs give the same parameters.
GaMi = gm(n_components=2, max_iter=100, random_state=42)
GaMi.fit(dataImputed)

print("Rerata atau mean dari tiap komponen Gaussian:")
print(GaMi.means_)
print("\nCovariance dari tiap komponen Gaussian:")
print(GaMi.covariances_)

[[1.  2. ]
 [2.  3. ]
 [3.  6. ]
 [6.  8. ]
 [3.  4.6]
 [3.  4. ]]
Rerata atau mean dari tiap komponen Gaussian:
[[4.4999995  6.99999967]
 [2.25000062 3.40000076]]

Covariance dari tiap komponen Gaussian:
[[[2.250001   1.5       ]
  [1.5        1.000001  ]]

 [[0.68750322 0.80000272]
  [0.80000272 0.98000434]]]
