In [11]:
import pandas 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import sklearn.metrics

## 1st LoG (logistic regression, default settings)

- **Training**

In [12]:
# Read the processed training CSV and index the rows by the 'id' column.
data_train = pandas.read_csv('./data/data_train_processed.csv')
data_train = data_train.set_index('id')

In [13]:
# Baseline classifier: only the iteration cap and the seed are set explicitly;
# every other hyper-parameter is left at the library default.
first_dc_model = LogisticRegression(random_state=42, max_iter=1000)

In [14]:
# Train on every column except 'label', which is the target.
first_dc_model.fit(
    X=data_train.drop(columns=['label']),
    y=data_train['label'],
)

- **Evaluate**

In [15]:
# Read the processed test CSV and index the rows by the 'id' column.
data_test_first = pandas.read_csv('./data/data_test_processed.csv')
data_test_first = data_test_first.set_index('id')

In [16]:
# Run inference once and store the predicted class next to the true label.
# 'prediction' is excluded from the features (ignored when absent) so that
# re-running this cell does not feed an earlier prediction back into the model.
data_test_first['prediction'] = first_dc_model.predict(
    X=data_test_first.drop(columns=['label', 'prediction'], errors='ignore'),
)

In [17]:
# Confusion matrix of the baseline model: true labels first, predictions second.
sklearn.metrics.confusion_matrix(
    data_test_first['label'],
    data_test_first['prediction'],
)

array([[2537,  496],
       [ 471, 1721]], dtype=int64)

In [18]:
# Build the classification report as a dict, then display it as a table with
# one row per report entry and one column per metric.
report_scores = sklearn.metrics.classification_report(
    data_test_first['label'],
    data_test_first['prediction'],
    output_dict=True,
    digits=6,
)
df_score = pandas.DataFrame(report_scores).T
df_score

Unnamed: 0,precision,recall,f1-score,support
0.0,0.843418,0.836466,0.839927,3033.0
1.0,0.776274,0.785128,0.780676,2192.0
accuracy,0.814928,0.814928,0.814928,0.814928
macro avg,0.809846,0.810797,0.810302,5225.0
weighted avg,0.815249,0.814928,0.81507,5225.0


## Find new parameters

In [19]:
# Separate copy of the processed test CSV for the hyper-parameter search section.
data_test_findT = pandas.read_csv('./data/data_test_processed.csv')
data_test_findT = data_test_findT.set_index('id')

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Hyper-parameter search space:
#   - the L1 penalty is only paired with the liblinear solver
#   - the L2 penalty is tried with both lbfgs and liblinear
param_grid = [
    {'penalty': ['l1'], 'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']},
    {'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']},
]
target = 'label'
X_train = data_train.drop(columns=[target])
y_train = data_train[target]
# Also drop a 'prediction' column if one is present, so a previously stored
# prediction can never be used as an input feature.
X_test = data_test_findT.drop(columns=[target, 'prediction'], errors='ignore')
y_test = data_test_findT[target]

# Cross-validated grid search on the training data only.
# random_state is fixed so the search is reproducible and consistent with the
# other models in this notebook, which all use seed 42.
model = LogisticRegression(max_iter=1000, random_state=42)
# error_score='raise' surfaces invalid parameter combinations immediately
# instead of silently scoring them as NaN (useful while debugging the grid).
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=2, error_score='raise')
grid_search.fit(X_train, y_train)

# Results of the search (mean cross-validated F1 of the best candidate).
print("Best params:", grid_search.best_params_)
print("Best F1 Score from GridSearch:", grid_search.best_score_)

# Evaluate the refitted best estimator on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
f1_best = f1_score(y_test, y_pred_best)
print("\nF1 Score  Logistic Regression new:", f1_best)

Best params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best F1 Score from GridSearch: 0.7871772782522889

F1 Score  Logistic Regression new: 0.7811295078249036


## 2nd LoG

- **Training**

In [21]:
# Second model: same seed and iteration cap as the baseline, but with the
# penalty / C / solver combination hard-coded below.
second_dc_model = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
# Train on every column except 'label', which is the target.
second_dc_model.fit(
    X=data_train.drop(columns=['label']),
    y=data_train['label'],
)

In [22]:
# Load a fresh copy of the test CSV so this cell starts from the file contents.
data_test_second = pandas.read_csv('./data/data_test_processed.csv').set_index('id')
# Run inference once and store the predicted class next to the true label.
data_test_second['prediction'] = second_dc_model.predict(
    X=data_test_second.drop(columns=['label']),
)

In [23]:
# Confusion matrix of the second model: true labels first, predictions second.
sklearn.metrics.confusion_matrix(
    data_test_second['label'],
    data_test_second['prediction'],
)

array([[2538,  495],
       [ 470, 1722]], dtype=int64)

In [24]:
# Build the classification report as a dict, then display it as a table with
# one row per report entry and one column per metric.
report_scores2 = sklearn.metrics.classification_report(
    data_test_second['label'],
    data_test_second['prediction'],
    output_dict=True,
    digits=6,
)
df_score2 = pandas.DataFrame(report_scores2).T
df_score2

Unnamed: 0,precision,recall,f1-score,support
0.0,0.84375,0.836795,0.840258,3033.0
1.0,0.776725,0.785584,0.78113,2192.0
accuracy,0.815311,0.815311,0.815311,0.815311
macro avg,0.810238,0.81119,0.810694,5225.0
weighted avg,0.815632,0.815311,0.815452,5225.0


### Class weight

In [25]:
# Copy of the processed test CSV for the class-weight experiment.
data_test_classWeight = pandas.read_csv('./data/data_test_processed.csv')
data_test_classWeight = data_test_classWeight.set_index('id')

In [26]:

# Same configuration as the second model, plus class_weight='balanced'.
wClassWeight_dc_model = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
# Train on every column except 'label', which is the target.
wClassWeight_dc_model.fit(
    X=data_train.drop(columns=['label']),
    y=data_train['label'],
)

In [27]:
# Load a fresh copy of the test CSV so this cell starts from the file contents.
data_test_classWeight = pandas.read_csv('./data/data_test_processed.csv').set_index('id')
# Run inference once and store the predicted class next to the true label.
data_test_classWeight['prediction'] = wClassWeight_dc_model.predict(
    X=data_test_classWeight.drop(columns=['label']),
)

In [28]:
# Confusion matrix of the class-weighted model: true labels first, predictions second.
sklearn.metrics.confusion_matrix(
    data_test_classWeight['label'],
    data_test_classWeight['prediction'],
)

array([[2404,  629],
       [ 345, 1847]], dtype=int64)

In [29]:
# Build the classification report as a dict, then display it as a table with
# one row per report entry and one column per metric.
report_scores3 = sklearn.metrics.classification_report(
    data_test_classWeight['label'],
    data_test_classWeight['prediction'],
    output_dict=True,
    digits=6,
)
df_score3 = pandas.DataFrame(report_scores3).T
df_score3

Unnamed: 0,precision,recall,f1-score,support
0.0,0.8745,0.792615,0.831546,3033.0
1.0,0.745961,0.842609,0.791345,2192.0
accuracy,0.813589,0.813589,0.813589,0.813589
macro avg,0.810231,0.817612,0.811446,5225.0
weighted avg,0.820575,0.813589,0.814681,5225.0


### SMOTE

In [30]:
from imblearn.over_sampling import SMOTE

In [31]:
# Re-read the processed training CSV and resample it with a seeded SMOTE.
data_train_smote = pandas.read_csv('./data/data_train_processed.csv')
data_train_smote = data_train_smote.set_index('id')
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(
    data_train_smote.drop(columns=['label']),
    data_train_smote['label'],
)
# Classifier for the resampled data; it is only constructed here, not fitted.
smote_model = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

[WinError 2] The system cannot find the file specified
  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [32]:
# Fit the classifier on the SMOTE-resampled features and labels.
smote_model.fit(
    X=X_train_balanced,
    y=y_train_balanced,
)

In [33]:
# Load a fresh copy of the test CSV so this cell starts from the file contents.
data_test_smote = pandas.read_csv('./data/data_test_processed.csv').set_index('id')
# Run inference once and store the predicted class next to the true label.
data_test_smote['prediction'] = smote_model.predict(
    X=data_test_smote.drop(columns=['label']),
)

In [34]:
# Confusion matrix of the SMOTE-trained model: true labels first, predictions second.
sklearn.metrics.confusion_matrix(
    data_test_smote['label'],
    data_test_smote['prediction'],
)

array([[2403,  630],
       [ 355, 1837]], dtype=int64)

In [35]:
# Build the classification report as a dict, then display it as a table with
# one row per report entry and one column per metric.
report_scores4 = sklearn.metrics.classification_report(
    data_test_smote['label'],
    data_test_smote['prediction'],
    output_dict=True,
    digits=6,
)
df_score4 = pandas.DataFrame(report_scores4).T
df_score4

Unnamed: 0,precision,recall,f1-score,support
0.0,0.871284,0.792285,0.829908,3033.0
1.0,0.744629,0.838047,0.788581,2192.0
accuracy,0.811483,0.811483,0.811483,0.811483
macro avg,0.807956,0.815166,0.809245,5225.0
weighted avg,0.818149,0.811483,0.812571,5225.0


### Summary
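Best test-set F1-score for the positive class (label `1.0`): LoG with class weight : `0.791345` (baseline LoG `0.780676`, tuned LoG `0.78113`, SMOTE `0.788581`).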