In [181]:
# Install optuna
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import KFold,cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
#read creditcaed.csv file chunk by chunk since this data is pretty huge up to 280 thousands.
def chunck_read_csv(file_path,chunk_size):
    '''
    read big data chunk by chunk.
    read_columns:a list containing the column names in the file you want to read

    '''
    df=pd.read_csv(file_path,header=0,iterator=True)
    chunks=[]
    loop=True
    while loop:
        try:
            chunk=df.get_chunk(chunk_size)
            chunks.append(chunk)
        except StopIteration:
            loop=False
            print('reading ends')
        df2=pd.concat(chunks,ignore_index=True)
    return df2 
    
df=chunck_read_csv(r'E:\Python datasets\creditcard.csv',10000)

# Before analysis, preprocess the data.
# First ,delete the column 'Time' since it is not useful.
del df['Time']
#Second ,check the existence of NA value.
na_check=df.isnull()
# Count the number of NA value in the whole dataframe
na_check.apply(lambda x: (x == True).sum()).sum()
# Since the above output 0, there is no NA value.
# Also,since the column 'Amount' obviously greater than any other features,it is easier to be overvalued, 
# therefore it need to be rescaled.
df['Amount']=pd.Series(StandardScaler().fit_transform(df['Amount'].values[:,np.newaxis]).ravel())

# From the code below, the class label is extremely unbalanced, therefore, oversampling or udnersampling should be employed,
# since undersampling decrease sample size too much, oversampoling is used.
df['Class'].value_counts()
# Use SMOTE to conduct oversampling
smote=SMOTE(random_state=1412)
x,y=smote.fit_resample(df.iloc[:,:-1],df.iloc[:,-1])
df=pd.concat([x,y],axis=1)
# After oversampling, check the class label, the numbers of each catagory is balanced.
df['Class'].value_counts()

# Since the data has about 30 features, to decrease calculation and avoid overfitting, dimensionality reduction should be employed
pca=PCA(n_components=4,random_state=1412)
x=pca.fit_transform(df.iloc[:,:-1])
# Check the cumulative contribution rate with code below, it is approximately 89.68% thought to be high enough
pca.explained_variance_ratio_.sum()
# Generate dataframe after pca
y=df['Class'].values
data=np.concatenate((x,y[:,np.newaxis]),axis=1)

# Since the sample size is huge, here I use randomforest
# Define the objective function of Bayesian Optimization
def optuna_obj(trial):
# Set parameter range
    n_estimators=trial.suggest_int('n_estimators',10,50)
    max_depth=trial.suggest_int('max_depth',3,100)
# Instancize classifier
    clf=RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=1412)
# Set reviewer
    cv=KFold(n_splits=10,shuffle=True,random_state=1412)
    val_accuracy=cross_validate(clf,x,y,scoring='accuracy',cv=cv,verbose=True,n_jobs=-1)
    return np.mean(val_accuracy['test_score'])
#Caculate best_patams and best_score
def optuna_optimizer(n_trials):
    study=optuna.create_study(sampler=optuna.samplers.TPESampler(n_startup_trials=20,n_ei_candidates=30),direction='maximize')
    study.optimize(optuna_obj,n_trials=n_trials,show_progress_bar=True)
    return study.best_trial.params,study.best_trial.values
# Since the dataset is pretty huge, even if the Bayesian Optimization still costs pretty much time,
# therefore, for simplicity, 'n_trials' here is set to be 3.
best_params,best_score=optuna_optimizer(3)
print(f'The best parameter combination is: {best_params}\n The best score is: {best_score}')

reading ends


[I 2024-12-21 20:22:15,789] A new study created in memory with name: no-name-c19f8726-2c4b-422b-80b0-eab3c577de9f


  0%|          | 0/3 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  3.3min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[I 2024-12-21 20:27:03,938] Trial 0 finished with value: 0.9941209573888117 and parameters: {'n_estimators': 40, 'max_depth': 96}. Best is trial 0 with value: 0.9941209573888117.


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   47.1s remaining:   31.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[I 2024-12-21 20:28:14,814] Trial 1 finished with value: 0.9563441956984331 and parameters: {'n_estimators': 17, 'max_depth': 7}. Best is trial 0 with value: 0.9941209573888117.


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.2min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.3min finished


[I 2024-12-21 20:31:31,680] Trial 2 finished with value: 0.9940664403918189 and parameters: {'n_estimators': 28, 'max_depth': 90}. Best is trial 0 with value: 0.9941209573888117.
The best parameter combination is: {'n_estimators': 40, 'max_depth': 96}
 The best score is: [0.9941209573888117]


In [None]:
# From the above result,We find the best combination of best parameter and its corresponding best score.
# Then we just need to substitute the result into model and get a good classifier.
final_clf=RandomForestClassifier(n_estimators=best_params['n_estimators'],max_depth=best_params['max_depth'],random_state=1412)