### Pertama kita import dulu library yang akan digunakan disini kita hanya menggunakan pandas

In [1]:
import pandas as pd


### kemudian kita bikin dataframe dengan isi dari dataset yang kita punya

In [2]:
df = pd.read_csv("Star9999_raw.csv")

### Penjelasan tiap Kolom
#### Vmag disini adalah data untuk besarnya bintang
#### Plx jarak bintang dengan bumi
#### e_Plx perkiraan selisih dari jarak asli bintang dengan bumi
#### B-V adalah indeks warna bintang semakin rendah maka semakin panas bintangnya
#### SpType adalah kolom kategorikal bintang

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9999 non-null   int64  
 1   Vmag        9999 non-null   float64
 2   Plx         9999 non-null   object 
 3   e_Plx       9999 non-null   object 
 4   B-V         9999 non-null   object 
 5   SpType      9722 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 468.8+ KB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.1,3.54,1.39,0.482,F5
1,1,9.27,21.9,3.1,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.37,F0V
4,4,8.55,2.87,1.11,0.902,G8III


### Setelah kita cek datanya ternyata terjadi kesalahan tipe data yang seharusnya float malah menjadi object sehingga kita perlu mengubahnya kembali dari object ke float menggunakan pd.to_numeric()

In [5]:
df = df.drop(columns=['Unnamed: 0'])

In [6]:
df['Plx'] = pd.to_numeric(df['Plx'], errors='coerce')
df['e_Plx'] = pd.to_numeric(df['e_Plx'], errors='coerce')
df['B-V'] = pd.to_numeric(df['B-V'], errors='coerce')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    9999 non-null   float64
 1   Plx     9984 non-null   float64
 2   e_Plx   9984 non-null   float64
 3   B-V     9907 non-null   float64
 4   SpType  9722 non-null   object 
dtypes: float64(4), object(1)
memory usage: 390.7+ KB


# Cleaning

In [8]:
df.isna().sum()

Vmag        0
Plx        15
e_Plx      15
B-V        92
SpType    277
dtype: int64

#### melakukan import SimpleImputer dari library sklearn

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
si_mean = SimpleImputer(strategy='mean')
df[['Plx','e_Plx','B-V']] = si_mean.fit_transform(df[['Plx','e_Plx','B-V']])

#### disini kita menggunakan strategy mean untuk mencari rata rata didalam modul SimpleImputer dimana akan diterapkan ke kolom Plx, e_Plx, B-V untuk mengisi kolom kolom yang kosong

In [11]:
df.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,9.1,3.54,1.39,0.482,F5
1,9.27,21.9,3.1,0.999,K3V
2,6.61,2.81,0.63,-0.019,B9
3,8.06,7.75,0.97,0.37,F0V
4,8.55,2.87,1.11,0.902,G8III


In [12]:
si_modus = SimpleImputer(strategy='most_frequent')
df[['SpType']] = si_modus.fit_transform(df[['SpType']])

#### disini kita menggunakan strategy most_frequent untuk mencari modus / data yang paling sering muncul yang digunakan untuk mengisi kekosongan data didalam kolom SpType

In [13]:
df.isna().sum()

Vmag      0
Plx       0
e_Plx     0
B-V       0
SpType    0
dtype: int64

# Duplikat

In [14]:
df.duplicated().sum()

0

#### Karena data yang saya punya tidak memiliki duplikat jadinya saya harus menduplikat dulu datanya

In [15]:
df = pd.concat([df]*2, ignore_index=True)

In [16]:
df.duplicated().sum()

9999

In [17]:
df.drop_duplicates(inplace = True)

#### Setelah menduplikat data kemudian saya menggunakan function drop_duplicates(inplace = True) untuk menghapus data yang terduplikat tadi

In [18]:
df.duplicated().sum()

0

### Mengubah tipe data

In [19]:
df['e_Plx'] = df['e_Plx'].astype(int)

#### untuk mengubah tipe data disini saya menggunakan function astype bawaan dari pandas

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 0 to 9998
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    9999 non-null   float64
 1   Plx     9999 non-null   float64
 2   e_Plx   9999 non-null   int32  
 3   B-V     9999 non-null   float64
 4   SpType  9999 non-null   object 
dtypes: float64(3), int32(1), object(1)
memory usage: 429.6+ KB


# STANDARISASI

#### Pertama kita lakukan import modul StandardScaler dari library sklearn

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
df_old = df

#### disini saya membuat backup dataframe sebelum distandarisasi

In [23]:
df.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,9.1,3.54,1,0.482,F5
1,9.27,21.9,3,0.999,K3V
2,6.61,2.81,0,-0.019,B9
3,8.06,7.75,0,0.37,F0V
4,8.55,2.87,1,0.902,G8III


In [24]:
ss = StandardScaler()
df[['Vmag','Plx','e_Plx','B-V']] = ss.fit_transform(df[['Vmag','Plx','e_Plx','B-V']])

#### dibawah ini adalah nilai standarisasi dari tiap kolom

In [25]:
for i in df:
    if df[i].dtypes == int or df[i].dtypes == float:
        print("Standarisasi dari", i, "adalah" ,df[i].std())
    else:
        continue

Standarisasi dari Vmag adalah 1.000050008751563
Standarisasi dari Plx adalah 1.000050008751563
Standarisasi dari e_Plx adalah 1.000050008751563
Standarisasi dari B-V adalah 1.000050008751563


# Normalisasi

#### Sebelum melakukan normalisasi data kita perlu menginmport modul MinMaxScaler dari library skelarn

In [26]:
from sklearn.preprocessing import MinMaxScaler

In [27]:
mm = MinMaxScaler()
df[['Vmag','Plx','e_Plx','B-V']] = mm.fit_transform(df[['Vmag','Plx','e_Plx','B-V']])

#### dibawah ini adalah nilai normalisasi dari tiap kolom

In [28]:
for i in df:
    if df[i].dtypes == int or df[i].dtypes == float:
        print("Normalisasi dari", i, "adalah" ,df[i].mean())
    else:
        continue

Normalisasi dari Vmag adalah 0.6263330701244851
Normalisasi dari Plx adalah 0.08655757803827767
Normalisasi dari e_Plx adalah 0.024538564967607875
Normalisasi dari B-V adalah 0.3458854573583165


# One Hot Encoding

#### Sebelum melakukan One Hot Encoding kita perlu mengimport modulnya terlebih dahulu

In [29]:
from sklearn.preprocessing import OneHotEncoder

In [30]:
df['SpType'].value_counts()

K0         1007
G5          623
G0          386
A0          382
F5          368
           ... 
F0Vw...       1
F3V...        1
F6V:          1
K5:           1
K1/K2II       1
Name: SpType, Length: 885, dtype: int64

#### karena data kategorikal saya banyak sehingga saya perlu membuang datanya agar data kategorikalnya tidak terlalu banyak

In [31]:
# Menghitung data yang sama
value_counts = df['SpType'].value_counts()

# Menghapus data yang memiliki value kurang dari 120
to_remove = value_counts[value_counts <= 120].index

# Menyimpan data yang memiliki value lebih dari 120
df = df[~df.SpType.isin(to_remove)]
SpType_value = df['SpType'].value_counts().sort_index()

In [32]:
df['SpType'].value_counts()

K0       1007
G5        623
G0        386
A0        382
F5        368
F8        363
K0III     293
K2        277
F0        219
K5        205
A2        202
K1III     200
F5V       183
K2III     157
F2        155
F3V       146
G8III     135
F7V       122
Name: SpType, dtype: int64

#### Setelah dirasa cukup baru saya melakukan OneHotEncoder

In [33]:
ohe = OneHotEncoder(sparse_output = False)
oh_encoded = ohe.fit_transform(df[['SpType']])

In [34]:
Kolom_Kategori = []
for i in SpType_value.index :
    Kolom_Kategori.append(i)
print(Kolom_Kategori)
df_new = df.reset_index()

['A0', 'A2', 'F0', 'F2', 'F3V', 'F5', 'F5V', 'F7V', 'F8', 'G0', 'G5', 'G8III', 'K0', 'K0III', 'K1III', 'K2', 'K2III', 'K5']


#### disini saya membuat data set baru dimana saya perlu mereset indexnya karena sebelumnya saya menghapus data-data yang memiliki jumlah kategori kurang dari 120

In [35]:
encoded = pd.DataFrame(oh_encoded,
                      columns=Kolom_Kategori)
encoded

Unnamed: 0,A0,A2,F0,F2,F3V,F5,F5V,F7V,F8,G0,G5,G8III,K0,K0III,K1III,K2,K2III,K5
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5419,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### disini saya menggabungkan dataset baru (setelah reset_index) dengan OneHotEncoding saya juga menghapus kolom index

In [36]:
df = pd.concat([df_new,encoded],axis=1)
pd.set_option('display.max_columns', None)
df = df.drop(columns=['index'])
df.head()

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,A0,A2,F0,F2,F3V,F5,F5V,F7V,F8,G0,G5,G8III,K0,K0III,K1III,K2,K2III,K5
0,0.674727,0.072745,0.027778,0.263306,F5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.631825,0.0705,0.027778,0.394803,G8III,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.716849,0.120326,0.027778,0.344083,G0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.634945,0.077,0.0,0.446462,G5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.537441,0.075258,0.0,0.137758,A2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split Data

#### Sebelum melatih dataset kita terlebih dahlu memanggil modul train_test_split dari sklearn

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X = df.drop(columns='SpType')
Y = df['SpType']

#### disini saya melatih dataset tanpa SpType dimana test_size nya 0.3 atau 30

In [39]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3)

In [40]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(3796, 22)
(1627, 22)
(3796,)
(1627,)
