In [None]:
!kaggle competitions download -c plant-pathology-2021-fgvc8

In [77]:
import pandas as pd
import numpy as np
import sklearn.feature_selection as featselect 
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [78]:
df_train = pd.read_csv("./dataset/raw/train.csv")
df_test = pd.read_csv("./dataset/raw/test.csv")

In [79]:
df_train.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [80]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144368 entries, 0 to 144367
Data columns (total 94 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   id       144368 non-null  int64
 1   feat_1   144368 non-null  int64
 2   feat_2   144368 non-null  int64
 3   feat_3   144368 non-null  int64
 4   feat_4   144368 non-null  int64
 5   feat_5   144368 non-null  int64
 6   feat_6   144368 non-null  int64
 7   feat_7   144368 non-null  int64
 8   feat_8   144368 non-null  int64
 9   feat_9   144368 non-null  int64
 10  feat_10  144368 non-null  int64
 11  feat_11  144368 non-null  int64
 12  feat_12  144368 non-null  int64
 13  feat_13  144368 non-null  int64
 14  feat_14  144368 non-null  int64
 15  feat_15  144368 non-null  int64
 16  feat_16  144368 non-null  int64
 17  feat_17  144368 non-null  int64
 18  feat_18  144368 non-null  int64
 19  feat_19  144368 non-null  int64
 20  feat_20  144368 non-null  int64
 21  feat_21  144368 non-null  int64
 

In [81]:
def missing_precentage(df):
    missing = df.isnull().sum()
    percentage = missing / len(df) * 100

    # Gabungkan ke 1 tabel
    missing_table = pd.DataFrame({
        "Missing Values": missing,
        "Percentage (%)": percentage
    })

    return missing_table

In [82]:
missing_precentage(df_train)

Unnamed: 0,Missing Values,Percentage (%)
id,0,0.0
feat_1,0,0.0
feat_2,0,0.0
feat_3,0,0.0
feat_4,0,0.0
...,...,...
feat_90,0,0.0
feat_91,0,0.0
feat_92,0,0.0
feat_93,0,0.0


In [83]:
missing_precentage(df_test)

Unnamed: 0,Missing Values,Percentage (%)
id,0,0.0
feat_1,0,0.0
feat_2,0,0.0
feat_3,0,0.0
feat_4,0,0.0
...,...,...
feat_89,0,0.0
feat_90,0,0.0
feat_91,0,0.0
feat_92,0,0.0


In [84]:
df_train["target"].value_counts()

target
Class_2    16122
Class_6    14135
Class_8     8464
Class_3     8004
Class_9     4955
Class_7     2839
Class_5     2739
Class_4     2691
Class_1     1929
Name: count, dtype: int64

In [85]:
def feature_selection(df, target_col, 
                               var_threshold=0.01,
                               mi_threshold=0.01,
                               rf_threshold=0.01,
                               min_methods=2,
                               random_state=42):
    """
    Ensemble feature selection untuk target string multi-class.

    Args:
        df: DataFrame input
        target_col: nama kolom target (string)
        var_threshold: threshold variance
        mi_threshold: threshold mutual info
        rf_threshold: threshold RF importance
        min_methods: minimal metode yang harus memilih fitur
        random_state: seed

    Returns:
        final_features: list fitur terpilih
        scores: DataFrame ringkasan tiap fitur dan metode
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Simpan skor tiap metode
    feature_scores = pd.DataFrame(index=X.columns)
    
    # ---- 1. Variance Threshold ----
    sel_var = featselect.VarianceThreshold(threshold=var_threshold)
    sel_var.fit(X.fillna(0))  # isi NaN agar aman
    var_selected = X.columns[sel_var.get_support()]
    feature_scores['Variance'] = X.var()
    feature_scores['Var_selected'] = feature_scores.index.isin(var_selected)
    
    # ---- 2. Mutual Information ----
    mi = featselect.mutual_info_classif(X.fillna(0), y, discrete_features='auto', random_state=random_state)
    mi_series = pd.Series(mi, index=X.columns)
    mi_selected = mi_series[mi_series >= mi_threshold].index
    feature_scores['Mutual_Info'] = mi_series
    feature_scores['MI_selected'] = feature_scores.index.isin(mi_selected)
    
    # ---- 3. Random Forest Classifier ----
    rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rf.fit(X.fillna(0), y)
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    rf_selected = importances[importances >= rf_threshold].index
    feature_scores['RF_importance'] = importances
    feature_scores['RF_selected'] = feature_scores.index.isin(rf_selected)
    
    # ---- Hitung jumlah metode yang memilih ----
    feature_scores['Methods_selected'] = feature_scores[['Var_selected', 'MI_selected', 'RF_selected']].sum(axis=1)
    
    # ---- Ambil fitur yang lolos ≥ min_methods ----
    final_features = feature_scores[feature_scores['Methods_selected'] >= min_methods].index.tolist()
    feature_scores['Final_selected'] = feature_scores.index.isin(final_features)
    
    return final_features, feature_scores


In [86]:

selected_features, summary = feature_selection(df_train, target_col='target',
                                                        var_threshold=0.1,
                                                        mi_threshold=0.1,
                                                        rf_threshold=0.1,
                                                        min_methods=2)

print("Fitur terpilih:", selected_features)
print("Jumlah Fitur:", selected_features.count )
print(summary.head())

Fitur terpilih: ['id', 'feat_3', 'feat_4', 'feat_11', 'feat_14', 'feat_15', 'feat_25', 'feat_26', 'feat_27', 'feat_34', 'feat_35', 'feat_40', 'feat_46', 'feat_54', 'feat_60', 'feat_67', 'feat_69', 'feat_80', 'feat_86', 'feat_88', 'feat_90']
Jumlah Fitur: <built-in method count of list object at 0x00000133809A0500>
            Variance  Var_selected  Mutual_Info  MI_selected  RF_importance  \
id      3.190791e+08          True     1.950258         True       0.376254   
feat_1  2.326630e+00          True     0.030226        False       0.002918   
feat_2  1.567688e+00          True     0.079375        False       0.003107   
feat_3  8.613156e+00          True     0.140222         True       0.007238   
feat_4  7.772970e+00          True     0.136391         True       0.007846   

        RF_selected  Methods_selected  Final_selected  
id             True                 3            True  
feat_1        False                 1           False  
feat_2        False                 1    

In [96]:
# --- Ambil Top 9 Features ---
top9_features = selected_features[:9]   # ambil 9 fitur teratas

# Simpan dataset dengan top 9 features
df_train_top9 = df_train[['id', 'target'] + top9_features]
df_test_top9  = df_test[['id'] + top9_features]   # test tidak ada target

# Simpan ke CSV
df_train_top9.to_csv("./dataset/train_top9.csv", index=False)
df_test_top9.to_csv("./dataset/test_top9.csv", index=False)

In [75]:
df_train = df_train[selected_features + ["target"]]
df_test = df_test[selected_features]

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61878 entries, 0 to 61877
Data columns (total 22 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       61878 non-null  int64 
 1   feat_3   61878 non-null  int64 
 2   feat_4   61878 non-null  int64 
 3   feat_11  61878 non-null  int64 
 4   feat_14  61878 non-null  int64 
 5   feat_15  61878 non-null  int64 
 6   feat_25  61878 non-null  int64 
 7   feat_26  61878 non-null  int64 
 8   feat_27  61878 non-null  int64 
 9   feat_34  61878 non-null  int64 
 10  feat_35  61878 non-null  int64 
 11  feat_40  61878 non-null  int64 
 12  feat_46  61878 non-null  int64 
 13  feat_54  61878 non-null  int64 
 14  feat_60  61878 non-null  int64 
 15  feat_67  61878 non-null  int64 
 16  feat_69  61878 non-null  int64 
 17  feat_80  61878 non-null  int64 
 18  feat_86  61878 non-null  int64 
 19  feat_88  61878 non-null  int64 
 20  feat_90  61878 non-null  int64 
 21  target   61878 non-null  object
dty

In [91]:
id_target_columns = ["id", "target"]
id_columns = ["id"]
X_selected_train =  df_train[selected_features]
X_selected_test =  df_test[selected_features]

pca = PCA(n_components = 9, random_state = 42)
X_reduced_train = pca.fit_transform(X_selected_train)
X_reduced_test = pca.transform(X_selected_test)

df_reduced_train = pd.DataFrame(X_reduced_train, columns=[f'PC{i+1}' for i in range(X_reduced_train.shape[1])])
df_reduced_train[id_target_columns] = df_train[id_target_columns].values


df_reduced_test = pd.DataFrame(X_reduced_test, columns=[f'PC{i+1}' for i in range(X_reduced_test.shape[1])])
df_reduced_test[id_columns] = df_test[id_columns].values

print(df_reduced_train.head())
print(df_reduced_test.head())


            PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0 -30938.499058 -0.435573  2.241465  6.232274 -3.530157 -4.074177  5.400965   
1 -30937.499666 -0.225300  0.966979 -0.071118 -4.114746 -3.823691  0.452058   
2 -30936.499406 -1.713545  1.982236  4.522753 -3.796163 -3.930054  0.338950   
3 -30935.499299  1.555503  2.408057  4.510723 -2.602892 -3.416053  0.454611   
4 -30934.499669 -0.025284  1.508311 -1.199180 -4.192118 -3.811726  0.516887   

        PC8       PC9 id   target  
0 -5.694393  6.107224  1  Class_1  
1 -2.766964 -0.935552  2  Class_1  
2 -2.276769 -0.784661  3  Class_1  
3 -2.050847 -0.688174  4  Class_1  
4 -2.859962 -1.091252  5  Class_1  
            PC1        PC2       PC3       PC4       PC5        PC6  \
0 -30938.500768  -1.372003  1.097334 -1.500701  4.035793   0.110703   
1 -30937.497803  26.695465  7.390134  6.751880 -0.599168   1.884083   
2 -30936.498469  14.936058  5.549211  3.683317 -2.173078  -4.063749   
3 -30935.500889   1.27291

In [95]:
df_reduced_train.to_csv("./dataset/cleaned/train_reduced.csv")
df_reduced_test.to_csv("./dataset/cleaned/test_reduced.csv")