# Basic ML code

In [1]:
import os
import pickle
import pandas as pd
import numpy as np

import missingno
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr

import warnings
warnings.filterwarnings('ignore')

##### 상관관계 분석 _Correlation

<📊 상관관계 해석 기준표 (Correlation Strength)>

| 상관계수 절댓값(x) 범위 | 해석 내용         |
|---------------------|------------------|
| x ≤ 0.1             | 매우 약한 상관관계 |
| 0.1 < x ≤ 0.3       | 약한 상관관계     |
| 0.3 < x ≤ 0.5       | 중간 정도 상관관계 |
| 0.5 < x ≤ 0.7       | 강한 상관관계     |
| 0.7 < x             | 매우 강한 상관관계 |

In [None]:
## 0. correlation code
from scipy.stats import pearsonr

# Placeholder column names: replace with real numeric columns of `df`.
column1 = 'feature1'
column2 = 'feature2'

# ver1. Correlation of every numeric column against one column, sorted ascending.
corr_feature1_all = df.corr(numeric_only=True)[column1].sort_values()

# ver2. Pearson correlation between two specific columns.
corr_feature1_by1 = df[column1].corr(df[column2])

# ver3. pearsonr additionally returns the p-value of the correlation.
# pearsonr does not skip missing values by default, so incomplete rows are
# dropped first to keep the result from becoming NaN.
paired = df[[column1, column2]].dropna()
correlation, p_value = pearsonr(paired[column1], paired[column2])
print(correlation, p_value)

##### Classification ML model _ fit & predict

>step1. encoding : one-hot encoding & Label Encoding  
step2. ML modeling  
step3. Hyper Parameter Tuning  

In [None]:
## 0. one-hot encoding
# mapping: encode a two-valued category by hand.
# NOTE: this overwrites df['gender'] in place, so re-running the cell on data
# that is already mapped turns every value into NaN (0/1 are not mapping keys).
gender = {'M':0, 'F':1}
df['gender'] = df['gender'].map(gender)

# pd.get_dummies: expand the listed categorical columns into 0/1 indicator
# columns. The result is built from `df` and stored under a new name so the
# source frame keeps its original columns.
df_encoding = pd.get_dummies(df, columns=['feature1', 'feature2'], dtype=int)

# 0. label encoding
# LabelEncoder assigns integer codes following the sorted order of the classes.
from sklearn.preprocessing import LabelEncoder
data = ['Red', 'Orange', 'Blue']

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data)

In [None]:
## 1. Linear & Tree model code
# 0. split data
from sklearn.model_selection import train_test_split

# 1. Linear model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 2. Tree model
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 3. score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Placeholder names: replace with the real feature columns and target column.
selected_features = ['feature1', 'feature2']
target = 'target'

# Supervised learning inputs: X holds the features, y holds the label.
X = df[selected_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

# Candidate models, keyed by the display name used in the results.
models = {
    "LogisticRegression" : LogisticRegression(),
    "SVC" : SVC(),
    "RandomForest" : RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting" : GradientBoostingClassifier(n_estimators=100, random_state=42),
    "XGBoost" : XGBClassifier(n_estimators=100, random_state=42),
    "LightGBM" : LGBMClassifier(n_estimators=100, random_state=42),
}

# Fit each model on the training split, predict on the test split, and score it.
# recall/precision/f1 use their default binary averaging, so the target is
# expected to be a binary label.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append((name, acc, recall, precision, f1))

# One row per model so the scores can be compared side by side.
results_df = pd.DataFrame(
    results, columns=['model', 'accuracy', 'recall', 'precision', 'f1']
)
results_df


NameError: name 'df' is not defined

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

# NOTE: the search below is fitted on the full data set (X, y), not only on
# the training split.
X = df[selected_features]
y = df[target]

# Evaluation metric used to rank parameter candidates.
f1_scorer = make_scorer(f1_score)

# Stratified K-fold keeps the class ratio of y in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Pre-tuning: randomized search over a coarse parameter space.
random_params = {
    'n_estimators' : [100,200,300],
    'max_depth' : [3,5,7],
    # 10 evenly spaced candidates; with num=1 linspace returns only [0.01],
    # which would leave the learning rate untuned.
    'learning_rate' : np.linspace(0.01, 0.3, 10),
    'min_child_samples' : [10, 20, 30],
    'subsample' : [0.6, 0.8, 1.0],
    'colsample_bytree' : [0.6, 0.8, 1.0],
}

# Sample n_iter parameter combinations and cross-validate each one.
random_search = RandomizedSearchCV(
    estimator=LGBMClassifier(random_state=42),
    param_distributions=random_params,
    n_iter=10,
    scoring=f1_scorer,
    cv=cv,
    random_state=42,
    verbose=1,
    n_jobs=-1,  # -1 runs the fits in parallel on all available CPU cores
)

random_search.fit(X,y)

print("Randomized Search CV :", random_search.best_params_)

In [None]:
# final _ ML
# Re-fit and cross-validate the winning randomized-search configuration.
# Every parameter is wrapped in a one-element list, so the grid holds exactly
# one candidate.
best_params = random_search.best_params_
tuned_keys = (
    'n_estimators',
    'max_depth',
    'learning_rate',
    'min_child_samples',
    'subsample',
    'colsample_bytree',
)
grid_params = {key: [best_params[key]] for key in tuned_keys}

grid_search = GridSearchCV(
    estimator=LGBMClassifier(random_state=42),
    param_grid=grid_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

print("Grid Search Best Params:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)

In [None]:
# Saving
# The tuned estimator lives on the fitted search object; `model` is only the
# last plain estimator from the comparison loop and has no best_estimator_.
champion_model = grid_search.best_estimator_

# Serialize the fitted model to disk with pickle.
with open("Champion_model_,,,.pkl", 'wb') as model_file:
    pickle.dump(champion_model, model_file)

### 부가 기능 : feature_importances_ & Confusion Matrix

In [None]:
### 1. feature_importances_, available on tree-based models

# Placeholder estimator: swap in whichever tree-based classifier is being
# inspected. RandomForestClassifier is the one imported in the model cell.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

feature_importance = model.feature_importances_
feature_name = X_train.columns

# One row per feature, indexed by feature name, most important first.
importance_df = pd.DataFrame({
                'feature' : feature_name,
                'importance' : feature_importance.round(2),
                }).set_index('feature').sort_values(by='importance', ascending=False)

### visualisation
# reset_index turns 'feature' back into a column so it can be used as the y axis.
top_features = importance_df.head(10).reset_index()

plt.figure(figsize=(10,6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title("TOP 10 importance fatures in ___Model")
plt.show()

In [None]:
### 2. Confusion Matrix

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix: for binary classification, compares predictions with the
# actual values and splits the outcomes into four categories (TN, FP, FN, TP).
# NOTE(review): y_pred is whatever the most recent predict call produced
# (presumably the last model of the comparison loop) - confirm it belongs to
# the model named in the title.

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix of Final Model")
plt.show()

##### Confusion Matrix
- 이진 분류 문제에서 모델의 예측 결과를 실제값과 비교해서 네가지 범주로 나눔

|실제 값 / 예측값 |   0(예측)   |    1(예측)   |
|----------------|------------|--------------|
|    0 (실제)   | TN(true negative) | FP(false Positive) |
|    1 (실제)   | FN(False negative) | TP(True positive) |

0 = Negative(False), 1 = positive(True)

>TP : 진짜 1인데 1로 예측 (정확도 높음) : 1247  
TN : 진짜 0인데 0으로 예측 (정확도 높음) : 1013  
FP : 실제 0 (지연) 인데 1 (정시도착)로 예측함 (예측은 틀렸으나 고객 입장에서는 좋음_서비스에 좋지도 나쁘지도 않음) : 975  
FN : 실제 1 (정시도착) 인데 0 (지연)으로 예측 (예측이 틀렸고 고객도 부정적인 경험을 함_서비스에 좋지 않음) : 65  

+ 정확도는 FP로 인해서 조금 떨어졌으나 서비스에 부정적인 영향을 미치는 False negative 예측이 적었으므로 활용할 가치가 있는 모델로 보임