# Library

In [21]:
# Classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from matplotlib import rc
import statsmodels.api as sm
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold, cross_val_score, train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import optuna
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import f1_score
from optuna.integration import OptunaSearchCV


In [22]:
!pip install optuna



# Data

In [23]:
path = '/content/drive/MyDrive/Colab Notebooks/data'

train = pd.read_csv(path +'/T_train.csv',encoding='EUC-KR')
test = pd.read_csv(path +'/T_test.csv',encoding='EUC-KR')
ss = pd.read_csv(path +'/T_sample_submission.csv',encoding='EUC-KR')

train.drop(['user_id'], axis=1, inplace=True)
test.drop(['user_id'], axis=1, inplace=True)

# Preprocessing

In [24]:
#단계별로 인코딩
train['preferred_difficulty_level'] = train['preferred_difficulty_level'].map({'Low':0, 'Medium':1,'High':2})
test['preferred_difficulty_level'] = test['preferred_difficulty_level'].map({'Low':0, 'Medium':1,'High':2})

#일반은 0, 프리미엄은 1로 인코딩
train['subscription_type'] = train['subscription_type'].map({'Basic':0, 'Premium':1})
test['subscription_type'] = test['subscription_type'].map({'Basic':0, 'Premium':1})

# Feature Engineering

In [25]:
def fe(df):
    # 평균 로그인 시간 / 가입 기간
    df['login_duration_ratio'] = df['average_login_time'] / (df['subscription_duration'] * 30)

    # 완료한 코스 수 대비 가입 기간
    df['course_duration_ratio'] = df['total_completed_courses'] / (df['subscription_duration'] * 30)

    # 평균 학습 시간 대비 활동적인 학습 일수
    df['learning_time_active_days_ratio'] = df['average_time_per_learning_session'] / df['monthly_active_learning_days']

    # 고객 문의 이력 대비 가입 기간
    df['inquiry_to_duration_ratio'] = df['customer_inquiry_history'] / (df['subscription_duration'] * 30)

    # 가입 기간 대비 중단된 학습 세션 수
    df['abandoned_sessions_to_duration_ratio'] = df['abandoned_learning_sessions'] / (df['subscription_duration']* 30)

    # 월별 이용 시간
    df['monthly_login_time'] = df['average_login_time'] * df['monthly_active_learning_days']

    # 평균 로그인 시간 대비 커뮤니티 레벨
    df['community_login_ratio'] = df['community_engagement_level'] / df['average_login_time']

    # 평균 로그인 시간 대비 완료한 코스 수
    df['login_to_course_ratio'] = df['total_completed_courses'] / df['average_login_time']

    return df


train = fe(train)
test = fe(test)

# Modeling


In [26]:
X = train.drop(['target', 'customer_inquiry_history', 'preferred_difficulty_level'], axis=1)
# -> 0.523
y = train['target']

In [27]:
X.columns

Index(['subscription_duration', 'recent_login_time', 'average_login_time',
       'average_time_per_learning_session', 'monthly_active_learning_days',
       'total_completed_courses', 'recent_learning_achievement',
       'abandoned_learning_sessions', 'community_engagement_level',
       'subscription_type', 'payment_pattern', 'login_duration_ratio',
       'course_duration_ratio', 'learning_time_active_days_ratio',
       'inquiry_to_duration_ratio', 'abandoned_sessions_to_duration_ratio',
       'monthly_login_time', 'community_login_ratio', 'login_to_course_ratio'],
      dtype='object')

In [34]:
# KNN 학습 모델
score_list = []
def knn_objective(trial):

    param_grid = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric',['euclidean', 'manhattan']),
        'p': trial.suggest_int('p', 1, 5)
    }

    model = KNeighborsClassifier(**param_grid)

    fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = []
    for num, (train_idx, valid_idx) in enumerate(fold.split(X, y)):
      X_train, X_test = X.iloc[train_idx], X.iloc[valid_idx]
      y_train, y_test = y.iloc[train_idx], y.iloc[valid_idx]

      model.fit(X_train, y_train)

      preds = model.predict(X_test)

      score = f1_score(y_test, preds, average='macro')

      scores.append(score)

    return np.mean(scores)

In [29]:
study = optuna.create_study(direction='maximize')
study.optimize(knn_objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best trial:', study.best_value)

[I 2023-12-18 07:34:36,130] A new study created in memory with name: no-name-d4469cb9-154b-4cb0-9d89-ae6975c80d71
[W 2023-12-18 07:34:36,165] Trial 0 failed with parameters: {'n_neighbors': 44, 'weights': 'uniform', 'metric': 'euclidean', 'p': 1} because of the following error: ValueError('Input X contains NaN.\nKNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values').
Traceback (most recent call last):
  File "/usr/local/lib/pyth

ValueError: ignored

In [30]:
# shuffle = True로 했기 때문에 하이퍼파라미터 값은 계속해서 변경 됨 --> 최상의 값 : 0.5267570543989903

In [31]:
# 해당 코드의 파라미터 값이 위 코드 최상의 파라미터값과 다른 이유는 같은 점수값의 모델이 다수이기 때문입니다.
knn_params = {
 'n_neighbors': 1, 'weights': 'distance', 'metric': 'manhattan', 'p': 5
}
knn = KNeighborsClassifier(
  **knn_params
)
knn.fit(X, y)

ValueError: ignored

# Post Precessing

In [32]:
final_test = knn.predict(test[X.columns])
test['target'] = final_test
test['target'].value_counts()

ValueError: ignored

In [33]:
ss['target']= final_test
print(ss['target'].value_counts())
path = "./20231208_knn1.csv"
ss.to_csv(path,index=False)

NameError: ignored