**Import Libraries**

In [1]:
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from rgf.sklearn import RGFClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

**Load Dataset**

In [2]:
# Read the labelled training file and the unlabelled test file.
df, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
# Last expression of the cell: render the training frame.
df

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,...,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,CreditScore
0,22,14,8557.39,797.115833,8,6,22,2,37,15.0,...,0,1767.29,25.400823,143,2,13.168404,91.952879,3.0,264.590301,2
1,37,1,19718.92,1676.243333,10,9,19,6,27,11.0,...,2,2057.56,28.642449,197,2,64.066440,107.668408,3.0,285.889485,2
2,33,13,32045.78,2677.481667,6,9,30,7,10,10.0,...,2,1333.18,30.053861,76,2,169.770374,62.681178,4.0,285.296615,2
3,42,4,62976.28,5321.023333,0,3,12,0,9,0.0,...,1,68.66,40.661773,191,1,0.000000,70.780837,4.0,711.321496,1
4,39,2,57818.72,4864.226667,7,7,21,2,56,16.0,...,0,2348.77,37.882655,174,2,73.709570,395.136222,3.0,307.576874,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85265,23,6,76230.76,6159.563333,6,10,31,3,35,15.0,...,2,1961.73,34.222384,242,2,15026.000000,176.547124,1.0,539.747953,2
85266,46,0,110607.09,8997.257500,4,4,2,3,2,9.0,...,1,1174.05,24.781202,265,1,154.824136,232.178801,5.0,752.722813,0
85267,50,8,96275.84,8101.986667,6,6,19,1,7,18.0,...,2,999.36,28.339005,321,0,64.961337,129.831967,,855.405363,0
85268,55,9,69388.26,5543.355000,2,4,6,0,8,1.0,...,1,819.98,40.497795,266,1,0.000000,184.212607,1.0,640.122893,2


**Check some information about the data**

**Show The Number Of Observations**

In [370]:
# Categorical variables in df: col_2, col_13, col_17 and col_20 (values listed below)
# col_2: 0, 1, 2, ..., 15
# col_13: 0, 1, 2, 3
# col_17: 0, 1, 2
# col_20: 0, 1, 2, 3, 4, 5

#print('df shape:', df.shape)
#print('df_test shape:', df_test.shape)


21

**Missing Values**

In [3]:
# Number of missing entries per column of the training frame
# (use df_test.isna().sum() to run the same check on the test frame).
df.isna().sum()

col_1              0
col_2              0
col_3              0
col_4          12817
col_5              0
col_6              0
col_7              0
col_8              0
col_9              0
col_10          6004
col_11          1799
col_12          1661
col_13             0
col_14             0
col_15             0
col_16             0
col_17             0
col_18             0
col_19          3836
col_20          6502
col_21          1024
CreditScore        0
dtype: int64

In [4]:
# Impute the columns that contain missing values
# (col_4, col_10, col_11, col_12, col_19, col_20, col_21) with the column median.
# The median is used for every column in the list, including the categorical
# col_20, where a mean would generally not be a valid category value.
# If validation results are poor, filling with the most frequent value is an
# alternative worth trying.
col_median_fill = ['col_4', 'col_10', 'col_11', 'col_12', 'col_19', 'col_20', 'col_21']

# The fill values are computed from the training frame only and reused for the
# test frame, so both frames are imputed with identical statistics.
train_medians = df[col_median_fill].median()

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: pandas deprecates that form because it may operate on a
# temporary copy and leave the frame unchanged.
df[col_median_fill] = df[col_median_fill].fillna(train_medians)
df_test[col_median_fill] = df_test[col_median_fill].fillna(train_medians)


In [5]:
# Re-count missing entries per column after imputation.
df.isna().sum()

col_1          0
col_2          0
col_3          0
col_4          0
col_5          0
col_6          0
col_7          0
col_8          0
col_9          0
col_10         0
col_11         0
col_12         0
col_13         0
col_14         0
col_15         0
col_16         0
col_17         0
col_18         0
col_19         0
col_20         0
col_21         0
CreditScore    0
dtype: int64

**One Hot Encoding on Categorical Variable**

In [6]:
# One-hot encode the categorical columns col_2, col_13, col_17 and col_20.
col_to_encoded = ['col_2', 'col_13', 'col_17', 'col_20']
df_preprocessed = pd.get_dummies(df, columns=col_to_encoded)
df_test_preprocessed = pd.get_dummies(df_test, columns=col_to_encoded)

# get_dummies only creates columns for the categories present in the frame it
# is given, so the two frames can end up with different columns. Put the test
# frame on the training feature layout (same columns, same order); a category
# that never occurs in the test file becomes an all-zero column.
feature_columns = df_preprocessed.columns.drop('CreditScore')
df_test_preprocessed = df_test_preprocessed.reindex(columns=feature_columns, fill_value=0)

# After alignment the training frame has exactly one column more than the
# test frame: the CreditScore target.
print('len of df_preprocessed:', len(df_preprocessed.columns.tolist()))
print('len of df_test_preprocessed:', len(df_test_preprocessed.columns.tolist()))

len of df_preprocessed: 47
len of df_test_preprocessed: 46


**Split Data And Deal with data imbalance**

In [7]:
# Separate the target column from the predictor columns.
y = df_preprocessed['CreditScore']
X = df_preprocessed.drop(columns='CreditScore')

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

print('training data size:', X_train.shape[0])
print('testing data set:', X_test.shape[0])

# Oversample the training portion only; the held-out rows are left untouched.
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)


training data size: 68216
testing data set: 17054


In [8]:
# Class counts of the SMOTE-resampled training labels, followed by the class
# counts of the held-out labels (which were not resampled).
print(y_train_resampled.value_counts())
print('y_test:', y_test.value_counts())

2    37355
0    37355
1    37355
Name: CreditScore, dtype: int64
y_test: 0    9383
2    4939
1    2732
Name: CreditScore, dtype: int64


**Standardization**

In [9]:
# Fit the scaler on the original (pre-SMOTE) training rows, then apply that
# same fitted transform to the resampled training set, the held-out split and
# the competition test frame.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std, df_test_std = (
    sc.transform(frame)
    for frame in (X_train_resampled, X_test, df_test_preprocessed)
)

**Random Forest(RF)**

In [67]:
# Baseline random forest trained on the standardized, SMOTE-resampled data.
# random_state fixes the forest's internal randomness so re-running the cell
# reproduces the same model.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train_std, y_train_resampled)

# Predict the held-out split.
y_pred_rfc = rfc.predict(X_test_std)

# Evaluation
print('RFC result:', y_pred_rfc)
print('Misclassified samples: %d' %(y_test != y_pred_rfc).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_pred_rfc))

# Per-class (one-vs-rest) F1 score. The label list is the union of the true
# and the predicted classes, so a class the model never predicts still enters
# the average (with F1 = 0) instead of being silently left out.
class_labels = sorted(set(y_test) | set(y_pred_rfc))
per_class_f1 = f1_score(y_test, y_pred_rfc, labels=class_labels, average=None)
f1_scores = dict(zip(class_labels, per_class_f1))

for class_label, f in f1_scores.items():
    print("F1-score for class {}:{:.4f}".format(class_label, f))

# Unweighted mean of the per-class scores (macro F1).
avg_f1_score = sum(f1_scores.values()) / len(f1_scores)
print("Average F1 score across all classes: {:.4f}".format(avg_f1_score))


RFC result: [0 0 0 ... 0 1 0]
Misclassified samples: 4181
Accuracy: 0.7548
F1-score for class 0:0.7859
F1-score for class 1:0.6586
F1-score for class 2:0.7572
Average F1 score across all classes: 0.7339


**Select Important Features and Re-predict**

In [27]:
from sklearn.feature_selection import SelectFromModel

In [72]:
# Keep only the features whose importance in the already-fitted forest `rfc`
# is at least 0.9 times the mean importance, and apply the same column
# selection to the training, held-out and competition test matrices.
selector = SelectFromModel(rfc, prefit=True, threshold = '0.9*mean')
X_train_selected = selector.transform(X_train_std)
X_test_selected = selector.transform(X_test_std)
df_test_selected = selector.transform(df_test_std)

# Retrain a random forest on the reduced feature set. random_state fixes the
# forest's internal randomness so re-running the cell reproduces the same model.
rfc_filtered = RandomForestClassifier(n_estimators=150, random_state=42)
rfc_filtered.fit(X_train_selected, y_train_resampled)

# Predict the held-out split.
y_filtered_pred_rfc = rfc_filtered.predict(X_test_selected)

# Evaluation
print('RFC result:', y_filtered_pred_rfc)
print('Misclassified samples: %d' %(y_test != y_filtered_pred_rfc).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_filtered_pred_rfc))

# Per-class (one-vs-rest) F1 score. The label list is the union of the true
# and the predicted classes, so a class the model never predicts still enters
# the average (with F1 = 0) instead of being silently left out.
class_labels = sorted(set(y_test) | set(y_filtered_pred_rfc))
per_class_f1 = f1_score(y_test, y_filtered_pred_rfc, labels=class_labels, average=None)
f1_scores = dict(zip(class_labels, per_class_f1))

for class_label, f in f1_scores.items():
    print("F1-score for class {}:{:.4f}".format(class_label, f))

# Unweighted mean of the per-class scores (macro F1).
avg_f1_score = sum(f1_scores.values()) / len(f1_scores)
print("Average F1 score across all classes: {:.4f}".format(avg_f1_score))


RFC result: [0 0 1 ... 0 1 0]
Misclassified samples: 3744
Accuracy: 0.7805
F1-score for class 0:0.7984
F1-score for class 1:0.7096
F1-score for class 2:0.7912
Average F1 score across all classes: 0.7664


**LightGBM**

In [13]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [14]:
# LightGBM through its scikit-learn wrapper. All hyperparameters are passed to
# the constructor; the separate native-API parameter dict and lgb.Dataset
# objects that used to be built here were never consumed and have been dropped.
LGBM = LGBMClassifier(objective = 'multiclass',
                      num_class = 3,
                      learning_rate = 0.1,
                      n_estimators = 100,
                      num_leaves = 31,  # tunable
                      subsample = 0.8,
                      colsample_bytree = 0.8,
                      reg_lambda = 1,
                      reg_alpha = 0.1,
                      random_state = 42)

# The held-out split is passed as eval_set with the multi_logloss metric; no
# early-stopping option is given in this call.
LGBM.fit(X_train_std,
         y_train_resampled,
         eval_set=[(X_test_std, y_test)],
         eval_metric='multi_logloss')

# Predict the held-out split.
y_pred_LGBM = LGBM.predict(X_test_std)

# Evaluation
print('GBM result:', y_pred_LGBM)
print('Misclassified samples: %d' %(y_test != y_pred_LGBM).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_pred_LGBM))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4232
[LightGBM] [Info] Number of data points in the train set: 112065, number of used features: 46
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
GBM result: [0 0 0 ... 0 1 0]
Misclassified samples: 4737
Accuracy: 0.7222


**XGBoost**

In [65]:
# XGBoost multi-class classifier on the standardized, SMOTE-resampled data.
# NOTE(review): an earlier note in this cell said max_depth = 20 works better,
# but the model below is trained with max_depth = 6 — confirm which is intended.
XGB = xgb.XGBClassifier(objective = 'multi:softmax',
                        num_class = 3,
                        learning_rate = 0.15,
                        n_estimators = 300,
                        max_depth = 6,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        reg_lambda = 1,
                        reg_alpha = 0.1,
                        use_label_encoder = False,
                        eval_metric = 'mlogloss')

XGB.fit(X_train_std, y_train_resampled)

# Predict the held-out split.
y_pred_xgb = XGB.predict(X_test_std)

# Evaluation
print('XGB result:', y_pred_xgb)
print('Misclassified samples: %d' %(y_test != y_pred_xgb).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_pred_xgb))

# Per-class (one-vs-rest) F1 score. The label list is the union of the true
# and the predicted classes, so a class the model never predicts still enters
# the average (with F1 = 0) instead of being silently left out.
class_labels = sorted(set(y_test) | set(y_pred_xgb))
per_class_f1 = f1_score(y_test, y_pred_xgb, labels=class_labels, average=None)
f1_scores = dict(zip(class_labels, per_class_f1))

for class_label, f in f1_scores.items():
    print("F1-score for class {}:{:.4f}".format(class_label, f))

# Unweighted mean of the per-class scores (macro F1).
avg_f1_score = sum(f1_scores.values()) / len(f1_scores)
print("Average F1 score across all classes: {:.4f}".format(avg_f1_score))

XGB result: [0 0 0 ... 0 1 0]
Misclassified samples: 4051
Accuracy: 0.7625
F1-score for class 0:0.7906
F1-score for class 1:0.6824
F1-score for class 2:0.7544
Average F1 score across all classes: 0.7425


**Predict test.csv using XGBoost**

In [64]:
# Label the competition test matrix with the XGBoost model and write the
# predictions to CSV; the frame's index is written as the 'Id' column.
fin_y_pred = XGB.predict(df_test_std)
fin_df = pd.DataFrame({'label': fin_y_pred})
fin_df.to_csv('xgb_output_0416.csv', index_label='Id', index=True)

**Select Important Features and Re-predict (XGBoost)**

In [17]:
# Feature importances reported by the fitted XGBoost model.
XGB_feature_importance = XGB.feature_importances_

# Positions of the features whose importance reaches the cut-off; the same
# column positions are taken from the training and held-out matrices.
threshold = 0.006
XGB_feature_importance_col = [i for i, v in enumerate(XGB_feature_importance) if v >= threshold]
X_train_std_selected = X_train_std[:, XGB_feature_importance_col]
X_test_std_selected = X_test_std[:, XGB_feature_importance_col]

# Retrain XGBoost on the reduced feature set (deeper trees, fewer rounds and a
# lower learning rate than the full-feature model).
XGB_selected = xgb.XGBClassifier(objective = 'multi:softmax',
                                 num_class = 3,
                                 learning_rate = 0.1,
                                 n_estimators = 100,
                                 max_depth = 20,
                                 subsample = 0.8,
                                 colsample_bytree = 0.8,
                                 reg_lambda = 1,
                                 reg_alpha = 0.1,
                                 use_label_encoder = False,
                                 eval_metric = 'mlogloss')

XGB_selected.fit(X_train_std_selected, y_train_resampled)
y_pred_xgb_sel = XGB_selected.predict(X_test_std_selected)

# Evaluation
print('XGB_selected result:', y_pred_xgb_sel)
print('Misclassified samples: %d' %(y_test != y_pred_xgb_sel).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_pred_xgb_sel))

XGB_selected result: [0 0 2 ... 0 1 0]
Misclassified samples: 4304
Accuracy: 0.7476


**RGF**

In [34]:
# Regularized Greedy Forest classifier trained on the full standardized
# feature set (no feature selection is applied in this cell).
# verbose = True makes the library print its training/prediction log.
rgf = RGFClassifier(
    max_leaf = 1000,
    algorithm = "RGF_Opt",
    test_interval = 100,
    verbose = True)

# Train
rgf.fit(X_train_std, y_train_resampled)

# Predict the held-out split.
y_pred_rgf = rgf.predict(X_test_std)

# Evaluation. The label reads 'RGF result' because this model uses all
# features; the former 'RGF_selected' wording was misleading.
print('RGF result:', y_pred_rgf)
print('Misclassified samples: %d' %(y_test != y_pred_rgf).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_pred_rgf))



"predict": 
   model_fn=/var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/6a3e0b2e-fb7a-44bd-9478-7bd9532a0e2a1.model-10
   test_x_fn=/var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/6a3e0b2e-fb7a-44bd-9478-7bd9532a0e2a1.test.data.x
   prediction_fn=/var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/6a3e0b2e-fb7a-44bd-9478-7bd9532a0e2a1.predictions.txt
   Log:ON
--------------------
Tue Apr 16 20:42:34 2024: Reading test data ... 
Tue Apr 16 20:42:34 2024: Predicting ... 
elapsed: 0.070895
/var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/6a3e0b2e-fb7a-44bd-9478-7bd9532a0e2a1.predictions.txt: /var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/6a3e0b2e-fb7a-44bd-9478-7bd9532a0e2a1.model-10,#leaf=1000,#tree=79
Tue Apr 16 20:42:34 2024: Done ... 

"predict": 
   model_fn=/var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/665bceba-78e2-46c5-9cec-653d08426c8f2.model-10
   test_x_fn=/var/folders/fy/4rjyh9dd1xndnwk2_jfvf7lw0000gn/T/rgf/665bceba-78e2-46c5-9cec-653d08426c8f2.tes

**NN**

In [32]:
from sklearn.neural_network import MLPClassifier

In [33]:
# Multi-layer perceptron with a single hidden layer of 100 units.
# random_state fixes the weight initialisation and batch shuffling so that
# re-running the cell reproduces the same network.
mlp = MLPClassifier(hidden_layer_sizes=(100,),
                    activation='relu',
                    solver='adam',
                    alpha=0.0001,
                    batch_size='auto',
                    learning_rate='constant',
                    learning_rate_init=0.001,
                    max_iter=200,
                    random_state=42)

# Fit the model to the standardized, SMOTE-resampled training data.
mlp.fit(X_train_std, y_train_resampled)

# Predict the held-out split.
y_pred_nn = mlp.predict(X_test_std)

# Evaluation
print('NN result:', y_pred_nn)
print('Misclassified samples: %d' %(y_test != y_pred_nn).sum())
print('Accuracy: %.4f' %accuracy_score(y_test, y_pred_nn))

NN result: [0 0 1 ... 0 0 0]
Misclassified samples: 5563
Accuracy: 0.6738




**Predict test.csv**

In [71]:
# Label the feature-selected competition test matrix with the filtered random
# forest and write the predictions to CSV; the index is written as 'Id'.
fin_y_pred = rfc_filtered.predict(df_test_selected)
fin_df = pd.DataFrame({'label': fin_y_pred})
fin_df.to_csv('rfc_output_0416.csv', index_label='Id', index=True)
