In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier

from sklearn.semi_supervised import SelfTrainingClassifier

from sklearn.metrics import classification_report

from tqdm import tqdm

In [None]:
class TrainingMLModels:
    """Fit a fixed set of classifiers and print their train/test reports."""

    def __init__(self, random_state_num=0):
        """Store the seed shared by every estimator that accepts one.

        Args:
            random_state_num: Value passed as ``random_state`` to the
                estimators built in ``trainingSupvisedMLModel``. Defaults to
                0, the value that used to be hard-coded.
        """
        self.random_state_num = random_state_num

    def trainingSupvisedMLModel(self, x_train, x_test, y_train, y_test):
        """Train each classifier and print classification reports.

        Every model is fitted on ``(x_train, y_train)``; predictions are then
        made for both ``x_train`` and ``x_test`` and compared against
        ``y_train`` / ``y_test`` with ``classification_report``.

        Args:
            x_train: Training features handed to ``fit`` and ``predict``.
            x_test: Test features handed to ``predict``.
            y_train: Training labels.
            y_test: Test labels.

        Returns:
            None. The reports are printed; fitted models are not persisted
            or returned.
        """
        seed = self.random_state_num

        # Individual estimators (each one is constructed exactly once).
        clf_decision = DecisionTreeClassifier(random_state=seed, max_depth=7)
        clf_randomF = RandomForestClassifier(random_state=seed, max_depth=7)
        clf_kneighbors = KNeighborsClassifier(n_neighbors=2, weights='distance', leaf_size=50)
        clf_logistic = LogisticRegression(max_iter=5000, random_state=seed)
        clf_mlp = MLPClassifier(solver='lbfgs', alpha=0.5, max_iter=100, random_state=seed)
        clf_xgboost = XGBClassifier(random_state=seed, max_depth=7)

        # Soft-voting ensemble over three of the estimators above.
        # NOTE: the 'xgoboost' label spelling is kept unchanged because it is
        # the runtime name of that sub-estimator inside the ensemble.
        clf_ensemble = VotingClassifier(
            estimators=[
                ('xgoboost', clf_xgboost),
                ('randomforest', clf_randomF),
                ('logistic', clf_logistic),
            ],
            voting='soft',
        )

        # Insertion order is the training order; the ensemble runs last.
        models = {
            'decision': clf_decision,
            'kneighbors': clf_kneighbors,
            'randomF': clf_randomF,
            'logistic': clf_logistic,
            'mlp': clf_mlp,
            'xgboost': clf_xgboost,
            'ensemble_soft': clf_ensemble,
        }

        for models_nm, model_obj in tqdm(models.items(), desc="training ml classification model"):
            print(f"----------{models_nm}을 진행합니다")
            # Train the model.
            model_obj.fit(x_train, y_train)
            # Predict on both splits.
            y_train_pred = model_obj.predict(x_train)
            y_test_pred = model_obj.predict(x_test)
            # Report performance on the training data, then on the test data.
            print("-----모델 훈련 결과 ----")
            print(classification_report(y_train, y_train_pred))
            print("-----테스트 결과 ----")
            print(classification_report(y_test, y_test_pred))

In [None]:
# Instantiate the trainer with its default seed.
# NOTE(review): this instance is not referenced in the cells visible here — confirm whether it is still needed.
tmm_csl = TrainingMLModels()

In [130]:
# Load the train and test CSVs and drop every row that contains a missing value.
train_df = pd.read_csv('ratings_train.csv').dropna()
test_df = pd.read_csv('ratings_test.csv').dropna()

In [131]:
# Row caps: how many rows of the training data and of the test data to use.
train_df_num, test_df_num = 1000, 500

In [132]:
# 1. 데이터 분리
x_train, y_train = train_df['document'][:train_df_num], train_df['label'][:train_df_num]
x_test, y_test = test_df['document'][:test_df_num], test_df['label'][:test_df_num]

In [133]:
from nlp_preprocessing import TrainTransfromVect

In [134]:
# Create the vectorizer wrapper imported from the project's nlp_preprocessing module.
ttv = TrainTransfromVect()

In [135]:
# Fit the vectorizer on the training documents only.
# NOTE(review): 'wp' presumably selects the tokenizer/vectorizer type — confirm against nlp_preprocessing.
ttv.fit_run('wp', x_train)



In [136]:
# Vectorize each split with the already-fitted transformer. The test features
# must be built from x_test (not x_train) so that their rows line up with
# y_test when the models are evaluated.
wp_vec_train = ttv.transform_run(x_train, chunk_size=100)
wp_vec_test = ttv.transform_run(x_test, chunk_size=100)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 170.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 172.40it/s]


## 모델 훈련부분

In [137]:
# Trainer instance used for the model-training run below.
t_ml_model_cls = TrainingMLModels()

In [None]:
# Train every classifier on the vectorized training set and print train/test reports.
t_ml_model_cls.trainingSupvisedMLModel(
    x_train=wp_vec_train,
    x_test=wp_vec_test,
    y_train=y_train,
    y_test=y_test,
)