In [29]:
import warnings

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [30]:

# Load the data: the first CSV column is used as the row index, and only the
# five feature columns plus the 'verified' target are kept.
data = pd.read_csv('./raw_data/ml_data_new.csv', index_col=0)
data = data.loc[:,['last_tx_in_days','date_creation_diff','gas','gas_price','is_published','verified']]

# Separate features and target
X = data.drop(columns=['verified'])
y = data['verified']


# Standardize the features.
# The original row index is passed through explicitly: without it the rebuilt
# frame gets a fresh RangeIndex while `y` keeps the CSV index, so any
# label-aligned pandas operation between X and y would silently misalign.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split, so held-out rows contribute to the mean/std used here.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

Creating train and test data sets (validation is done by cross-validation on the training set)

In [31]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Standardize using statistics of the training rows only, so no information
# from the test rows reaches the models. Standardization is invariant to a
# previous per-feature shift/scale, so this yields the same values as fitting
# the scaler on the raw training features, even if X was already scaled on
# the full dataset upstream.
train_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(train_scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(train_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

Hyperparameter tuning for Random Forest

In [40]:
# Define parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [2,3,5,7],
    # NOTE(review): an integer max_samples makes every tree train on a
    # bootstrap sample of exactly 30 rows - confirm this is intentional.
    'max_samples':[30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV with RandomForestClassifier.
# The forest is seeded so that re-running the search selects the same
# parameters; an unseeded forest makes the chosen grid point vary run to run.
# Each candidate is scored by 5-fold cross-validated F1.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=1),
                       param_grid_rf,
                       cv=5,
                       scoring='f1')

# Fit GridSearchCV for Random Forest (training split only)
grid_rf.fit(X_train, y_train)

# Get the best parameters for Random Forest; displayed as the cell output
best_params_rf = grid_rf.best_params_
best_params_rf

{'max_depth': 7,
 'max_samples': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

Hyperparameter tuning for XGBoost

In [38]:
# Candidate XGBoost settings: 3 x 4 x 3 = 36 combinations
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.1, 0.2],
)

# Base estimator shared by every grid point
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Exhaustive search, each candidate scored by 5-fold cross-validated F1
grid_xgb = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid_xgb,
    scoring='f1',
    cv=5,
)

# Run the search on the training split only
grid_xgb.fit(X_train, y_train)

# Winning parameter combination; shown as the cell output
best_params_xgb = grid_xgb.best_params_
best_params_xgb

{'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}

Evaluating different models

In [35]:
# Initialize models.
# The stochastic estimators (Random Forest, Decision Tree) are seeded so the
# comparison table is reproducible across runs.
models = [
    ('Random Forest', RandomForestClassifier(random_state=1, **best_params_rf)),
    ('XGBoost',XGBClassifier(use_label_encoder=False, eval_metric='logloss', **best_params_xgb)),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=1))
]

# Initialize scoring metrics; the keys become the result column names
scoring = {
    'F1-score': make_scorer(f1_score),
    'Accuracy': make_scorer(accuracy_score),
    'Recall': make_scorer(recall_score),
    'Precision': make_scorer(precision_score)
}

# Initialize stratified 5-fold cross-validator (unshuffled, so deterministic)
kfold = StratifiedKFold(n_splits=5)

results = []

# For each model
for name, model in models:
    # One multi-metric cross-validation run per model: every metric is computed
    # from the same five fitted models instead of refitting once per metric.
    cv_scores = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
    # Initialize metric storage for current model
    current_results = {'Model': name}
    # cross_validate reports each scorer under the key 'test_<scorer name>'
    for score_name in scoring:
        # Store the mean score over the 5 folds
        current_results[score_name] = cv_scores[f'test_{score_name}'].mean()
    # Add current model's results to overall results
    results.append(current_results)

# Convert results to DataFrame (one row per model, one column per metric)
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Model,F1-score,Accuracy,Recall,Precision
0,Random Forest,0.87202,0.82311,0.894098,0.852765
1,XGBoost,0.87082,0.818616,0.894153,0.849214
2,Logistic Regression,0.81384,0.712079,0.917377,0.731724
3,SVM,0.84499,0.766522,0.927432,0.776233
4,Decision Tree,0.84554,0.784525,0.847869,0.864331


Evaluating the best model with the test data

In [36]:
# Row of the comparison table with the top cross-validated F1-score
top_row = results_df['F1-score'].idxmax()
best_model_name = results_df.loc[top_row, 'Model']

# Look up the estimator registered under that name (None if no entry matches)
best_model = None
for candidate_name, candidate in models:
    if candidate_name == best_model_name:
        best_model = candidate
        break

# Train the winner on the full training split, then predict the held-out rows
best_model.fit(X_train, y_train)
test_predictions = best_model.predict(X_test)

# Held-out metrics for the selected model
test_f1 = f1_score(y_test, test_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)

# Report: a leading blank line, then one line per value
report_lines = [
    f'\nBest model: {best_model_name}',
    f'Test F1-score: {test_f1}',
    f'Test Accuracy: {test_accuracy}',
    f'Test Recall: {test_recall}',
    f'Test Precision: {test_precision}',
]
print('\n'.join(report_lines))


Best model: Random Forest
Test F1-score: 0.8792270531400966
Test Accuracy: 0.831081081081081
Test Recall: 0.91
Test Precision: 0.8504672897196262
