In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives under it.
# Requires the Colab runtime (google.colab is only importable there).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 라이브러리

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')

import missingno as msno
import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder , StandardScaler

In [None]:
# Dataset directory on the mounted Google Drive.
path = '/content/drive/MyDrive/dacon/2023.11.23/dataset'

# Read each CSV from Drive exactly once. `orig` is an independent deep copy of
# the training data as loaded, so it stays untouched while later cells modify
# `train` in place.
train = pd.read_csv(path + '/train.csv')
orig = train.copy()
test = pd.read_csv(path + '/test.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the training data.
train.info()

In [None]:
# Data validation

def val_table(df):
    """Build a per-column summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``. After the index reset, the first column
        holds the original column names, followed by ``dtype`` (column dtype),
        ``nunique`` (number of distinct non-null values) and ``nan`` (number
        of missing values).
    """
    # `keys` names the three concatenated Series, giving the output columns directly.
    summary = pd.concat(
        [df.dtypes, df.nunique(), df.isna().sum()],
        axis=1,
        keys=['dtype', 'nunique', 'nan'],
    )
    return summary.reset_index()

In [None]:
# Training data: dtype, distinct-value count and NaN count per column

val_table(train)

In [None]:
# Test data: dtype, distinct-value count and NaN count per column

val_table(test)

In [None]:
# Simple preprocessing
# Start with summary statistics of the training data, shaded with the 'summer_r' colormap.

train.describe().style.background_gradient(cmap = 'summer_r')

In [None]:
# Ordinal encoding for the difficulty level: Low < Medium < High -> 0 < 1 < 2.
# Any value that is not a key of `mapping` becomes NaN (behaviour of Series.map).
mapping = {
    "Low" : 0 ,
    "Medium" : 1 ,
    "High" : 2
}

train['preferred_difficulty_level'] = train['preferred_difficulty_level'].map(mapping)
test['preferred_difficulty_level'] = test['preferred_difficulty_level'].map(mapping)

# Integer-encode the subscription type. LabelEncoder expects 1-D input, so the
# column is passed as a Series (single brackets); a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning before flattening it.
# The encoder is fitted on train only and reused for test, so a label that
# appears only in test raises ValueError in `transform`.
encoder = LabelEncoder()
train['subscription_type'] = encoder.fit_transform(train['subscription_type'])
test['subscription_type'] = encoder.transform(test['subscription_type'])

In [None]:
# Separate the label from the features: `pop` returns the 'target' column and
# removes it from `train` in place.
y = train.pop('target')

In [None]:
# Remove the 'user_id' column from both frames (in place).
train.drop(columns=['user_id'], inplace=True)
test.drop(columns=['user_id'], inplace=True)

In [None]:
# Scaling
# Standardise every column currently in `train`; the result keeps the same
# column names, and the fitted `scaler` remains available afterwards.

scaler = StandardScaler()
scaled_train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)

In [None]:
# Clustered heatmap of the pairwise correlations between the scaled features,
# annotated with each coefficient to two decimals.
sns.clustermap(scaled_train.corr() , annot = True , fmt = '.2f' , cmap = 'summer_r')

In [None]:
# Remove two features from both frames in place (presumably selected from the
# correlation clustermap — confirm).
# NOTE: `scaled_train` was built before this cell and is not modified here, so
# it still contains both columns.
train.drop(columns=['total_completed_courses', 'community_engagement_level'], inplace=True)
test.drop(columns=['total_completed_courses', 'community_engagement_level'], inplace=True)

In [None]:
# Baseline model
#
# Split the unscaled feature frame `train`, not `scaled_train`. `scaled_train`
# was built before the last column drop, so it still holds features that `test`
# no longer has, and `test` itself is never passed through `scaler`. Fitting on
# `scaled_train` would therefore make `model.predict(test)` see a different
# feature set and scale than the model was trained on.
trainX , validX , trainY , validY = train_test_split(train , y , test_size = 0.2 , random_state = 42)

# Scaling lives inside the model: the pipeline fits StandardScaler on the
# training fold only and applies the same transform on every `predict` call,
# for both the validation split and the raw `test` frame.
# `random_state` pins the forest's bootstrap and feature sampling so the
# reported metrics are reproducible across runs.
model = make_pipeline(StandardScaler() , RandomForestClassifier(random_state = 42))
model.fit(trainX , trainY)

In [None]:
# Predict labels for the held-out validation split

prediction = model.predict(validX)

In [None]:
# Macro-averaged F1, recall and precision on the validation split, printed in
# that order as "<label> : <score>".
for label, score_fn in (
    ('macro f1', metrics.f1_score),
    ('recall', metrics.recall_score),
    ('precision', metrics.precision_score),
):
    print(f"{label} : {score_fn(validY , prediction , average = 'macro')}")

In [None]:
# Submission: load the sample submission file; its 'target' column is filled
# with predictions and the frame is saved in the cells below.

submission = pd.read_csv(path + '/sample_submission.csv')

In [None]:
# Predict labels for the test set. `test` is passed as-is (this notebook never
# transforms it with `scaler`).
# NOTE(review): the columns of `test` must match the features `model` was fitted on.
submission_prediction = model.predict(test)

In [None]:
# Store the test-set predictions in the 'target' column of the submission frame.
submission['target'] = submission_prediction

In [None]:
# Save as a CSV file in the dataset directory, without writing the row index

submission.to_csv(path + '/submission.csv' , index = False)