In [1]:
import pandas as pd
import os

# Where the CSV export(s) live and which files to read
file_path = 'D:\\1 code AI'
file_names = ['new_data_v1.csv']

# Read each listed CSV, then stack them into one frame with a fresh 0..n-1 index
datasets = [
    pd.read_csv(os.path.join(file_path, csv_name))
    for csv_name in file_names
]
dataset = pd.concat(datasets, ignore_index=True)

# Tally how many rows carry each value of the 'Label' column
label_counts = dataset['Label'].value_counts()
print("Label counts:", label_counts, sep="\n")

# Column names, non-null counts and dtypes of the combined frame
dataset.info()


Label counts:
Label
0    204368
1     34767
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239135 entries, 0 to 239134
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   row     239135 non-null  int64  
 1   col     239135 non-null  int64  
 2   year    239135 non-null  int64  
 3   month   239135 non-null  int64  
 4   day     239135 non-null  int64  
 5   hour    239135 non-null  int64  
 6   B04B    239135 non-null  float64
 7   B05B    239135 non-null  float64
 8   B06B    239135 non-null  float64
 9   B09B    239135 non-null  float64
 10  B10B    239135 non-null  float64
 11  B11B    239135 non-null  float64
 12  B12B    239135 non-null  float64
 13  B14B    239135 non-null  float64
 14  B16B    239135 non-null  float64
 15  I2B     239135 non-null  float64
 16  I4B     239135 non-null  float64
 17  IRB     239135 non-null  float64
 18  VSB     239135 non-null  float64
 19  WVB     239135 n

In [2]:
from sklearn.model_selection import train_test_split

# Columns excluded from the feature matrix: the target ("Label") plus the
# row/col and year/month/day/hour columns.
columns_drop = ["Label", "row", "col", "year", "month", "day", "hour"]
# Keep only the names actually present so drop() cannot raise KeyError
# for a column that is missing from this particular file.
existing_columns_to_drop = [col for col in columns_drop if col in dataset.columns]

# X holds every remaining column; y is the 'Label' column.
X = dataset.drop(columns=existing_columns_to_drop)
y = dataset["Label"]

print(X.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
X.info()
print(y.head())
print(y.value_counts())

       B04B      B05B      B06B       B09B       B10B       B11B       B12B  \
0  0.084788  0.053601  0.030115  253.87752  261.63812  281.48710  260.33334   
1  0.129801  0.103687  0.064551  253.48486  261.10180  279.65836  259.01260   
2  0.088053  0.074731  0.043408  253.62572  261.10180  281.48782  260.33664   
3  0.085429  0.070825  0.040283  253.06673  261.10593  280.73180  259.73105   
4  0.099121  0.091174  0.057520  252.52501  260.70062  280.29822  259.61038   

        B14B       B16B        I2B        I4B        IRB       VSB        WVB  
0  285.00070  269.99298  281.76694  286.36456  285.24005  0.070424  244.27534  
1  282.17892  268.44498  279.95640  286.18780  282.48993  0.099792  244.05240  
2  284.34457  269.33520  281.68080  286.60724  284.50903  0.070424  243.55762  
3  283.88354  269.33356  280.94617  285.77060  284.15130  0.074331  243.33647  
4  283.85870  268.99704  280.34543  287.78340  284.39117  0.082174  242.84953  
<class 'pandas.core.frame.DataFrame'>
RangeIn

In [3]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Number of rows in each partition
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Per-class row counts in each partition
print("Training set label distribution:", y_train.value_counts(), sep="\n")
print("Test set label distribution:", y_test.value_counts(), sep="\n")


Training set size: 191308
Test set size: 47827
Training set label distribution:
Label
0    163509
1     27799
Name: count, dtype: int64
Test set label distribution:
Label
0    40859
1     6968
Name: count, dtype: int64


In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated F1 of a random forest.

    Samples one RandomForestClassifier configuration from ``trial`` and scores
    it with stratified 5-fold cross-validation on the training split
    (``X_train`` / ``y_train``) only, so the held-out test rows never
    influence which hyperparameters are selected.

    Args:
        trial: The Optuna trial object used to sample hyperparameters.

    Returns:
        The mean of the per-fold ``scoring='f1'`` scores.
    """
    # Hyperparameter suggestions
    n_estimators = trial.suggest_int('n_estimators', 50, 600)
    max_depth = trial.suggest_categorical('max_depth', [None, 10, 100, 50])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    class_weight = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample'])

    # Initialize the RandomForestClassifier with the suggested hyperparameters.
    # NOTE(review): both the forest and cross_val_score below request all cores
    # (n_jobs=-1); if CPU or memory pressure becomes a problem, set one of them to 1.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight=class_weight,
        random_state=42,
        n_jobs=-1
    )

    # Stratified 5-fold CV; shuffling with a fixed seed gives every trial the same folds
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Cross-validate on the training split only: scoring on the full X, y would
    # let the held-out test rows leak into hyperparameter selection.
    score = cross_val_score(model, X_train, y_train, scoring='f1', cv=cv, n_jobs=-1).mean()

    return score

# Create the study with a seeded sampler so the sequence of suggested
# hyperparameters is repeatable across Restart & Run All (same seed as the
# rest of the notebook), then run the search.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Output the best hyperparameters and best score
print("Best hyperparameters:", study.best_params)
print("Best F1 score:", study.best_value)


SyntaxError: positional argument follows keyword argument (846578726.py, line 28)