In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on Drive; later cells read and write paths relative to it.
%cd /content/drive/MyDrive/SW-AI/예선/

/content/drive/.shortcut-targets-by-id/1u2gcBrJ7BYQsonAiqsFzWDmRnzNwjQ4T/SW-AI/예선


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
# Apply seaborn's default plot theme for any figures drawn in this notebook.
sns.set()
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [None]:
# Load the competition tables; the first CSV column becomes the row index.
train = pd.read_csv('./competition_data/train.csv', index_col=0)
test = pd.read_csv('./competition_data/test.csv', index_col=0)

In [None]:
# Display the column labels of the training frame.
train.columns

Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'country', 'introelapse',
       'testelapse', 'surveyelapse', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness'],
      dtype='object')



---



In [None]:
def Mach(data):
  """Reverse-score the negatively keyed Q items and add a mean 'Mach' column.

  Mutates ``data`` in place and returns None:
    * missing answers in Q1..Q26 are replaced with 3,
    * every item in ``Q_nn`` is replaced with ``6 - value``
      (presumably a 1-5 answer scale -- confirm against the survey codebook),
    * a new 'Mach' column holds the row-wise mean of all 26 items.

  Not idempotent: a second call on the same frame flips the ``Q_nn`` items
  back to their original orientation, so call it once per frame.
  """
  Q_cols = [f'Q{i}' for i in range(1, 27)]
  # Items that are reverse-scored; the other 14 Q items are used as answered.
  Q_nn = ['Q3', 'Q4', 'Q7', 'Q9', 'Q10', 'Q11', 'Q14', 'Q16', 'Q17', 'Q18', 'Q22', 'Q23']
  data[Q_cols] = data[Q_cols].fillna(3)
  data[Q_nn] = 6 - data[Q_nn]
  # Compute the mean once, after every reversal has been applied.
  data['Mach'] = data[Q_cols].mean(axis=1)

Q1~Q26 마키아벨리즘 전처리

In [None]:
def TIPI(data):
  """Fill missing TIPI answers and derive the five paired scores T1..T5.

  Mutates ``data`` in place and returns None. Missing values in
  TIPI1..TIPI10 become 3. Each T column is the average of one item taken as
  answered and the reversal (``8 - value``) of its partner item.
  """
  item_names = [f'TIPI{k}' for k in range(1, 11)]
  data[item_names] = data[item_names].fillna(3)
  # (output column, item used as answered, item reversed via 8 - value)
  pairings = (
    ('T1', 'TIPI3', 'TIPI8'),
    ('T2', 'TIPI7', 'TIPI2'),
    ('T3', 'TIPI9', 'TIPI4'),
    ('T4', 'TIPI5', 'TIPI10'),
    ('T5', 'TIPI1', 'TIPI6'),
  )
  for target, straight, flipped in pairings:
    data[target] = (data[straight] + (8 - data[flipped])) / 2

TIPI1~TIPI10 전처리

In [None]:
def VCL(data):
  """Fill missing VCL answers with 0.5 and add their row-wise sum as 'VCL'.

  Mutates ``data`` in place and returns None. Columns VCL1..VCL16 have their
  missing values replaced with 0.5, then a new 'VCL' column stores the sum of
  those 16 columns for each row.
  """
  VCL_cols = [f'VCL{i}' for i in range(1, 17)]
  data[VCL_cols] = data[VCL_cols].fillna(0.5)
  # Add a column holding the summed VCL answers.
  data['VCL'] = data[VCL_cols].sum(axis=1)

VCL1~VCL16 전처리

In [None]:
def Elapse(data, medians=None):
  """Binarise the three elapsed-time columns and add their sum as 'elapse'.

  Mutates ``data`` in place and returns None. For each of 'introelapse',
  'testelapse' and 'surveyelapse' a flag column named without the 'elapse'
  suffix ('intro', 'test', 'survey') is written: 1 when the value is strictly
  greater than the threshold, otherwise 0. Missing values compare as False and
  therefore become 0. 'elapse' is the sum of the three flags (0..3).

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing the three elapsed-time columns.
  medians : mapping or None, optional
      Optional thresholds keyed by source column name, e.g. medians taken from
      the training frame so another frame is binarised against the same
      cut-offs. Any column not present in the mapping (or all columns when
      None, the default) uses the median of ``data`` itself.
  """
  elapse_cols = ['introelapse', 'testelapse', 'surveyelapse']
  for col in elapse_cols:
    if medians is not None and col in medians:
      threshold = medians[col]
    else:
      threshold = data[col].median()
    flag_name = col[:-len('elapse')]
    # fast -> 0, slow -> 1
    data[flag_name] = (data[col] > threshold).astype(int)
  data['elapse'] = data['intro'] + data['test'] + data['survey']

응답시간 전처리

In [None]:
def ASD(data):
  """Replace missing values in the 'ASD' column with 1, in place; returns None."""
  asd_filled = data['ASD'].fillna(value=1)
  data['ASD'] = asd_filled

ASD null값 처리



---



In [None]:
# Apply every in-place preprocessing step to the training frame, in order.
# Mach() flips its reverse-scored items on each call, so run this cell only
# once per freshly loaded frame.
for preprocess in (Mach, TIPI, VCL, Elapse, ASD):
  preprocess(train)

In [None]:
# Apply every in-place preprocessing step to the test frame, in order.
# Mach() flips its reverse-scored items on each call, so run this cell only
# once per freshly loaded frame.
for preprocess in (Mach, TIPI, VCL, Elapse, ASD):
  preprocess(test)

In [None]:
# Columns that receive median imputation for any values still missing.
cols = (
    [f'Q{i}' for i in range(1, 27)]
    + ['introelapse', 'testelapse', 'surveyelapse']
    + [f'TIPI{i}' for i in range(1, 11)]
    + [f'VCL{i}' for i in range(1, 17)]
    + ['education', 'urban', 'gender', 'engnat', 'age', 'hand', 'religion',
       'orientation', 'voted', 'married', 'familysize', 'ASD']
)

In [None]:
# Impute remaining gaps with the TRAINING median of each column and apply that
# same value to the test frame, so both frames are preprocessed with identical
# statistics instead of the test set being filled from its own distribution.
for col in cols:
  fill_value = train[col].median()
  train[col] = train[col].fillna(fill_value)
  test[col] = test[col].fillna(fill_value)

나머지 null값 처리



---



In [None]:
# Single shared feature list so the training and test matrices always contain
# the same columns in the same order.
feature_cols = (
    [f'Q{i}' for i in range(1, 27)]
    + [f'TIPI{i}' for i in range(1, 11)]
    + [f'VCL{i}' for i in range(1, 17)]
    + ['education', 'urban', 'gender', 'engnat', 'age', 'hand', 'religion',
       'orientation', 'voted', 'married', 'familysize', 'ASD', 'elapse']
)
train_x = train[feature_cols]
train_y = train['nerdiness']  # prediction target
# Rebinds `test` to the feature-only view that is passed to the model.
test = test[feature_cols]

In [None]:
# numpy, pandas and warnings are re-imported here (already imported in an earlier cell);
# only the LightGBM classifier is new.
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# LightGBM classifier with 1500 estimators; every other hyperparameter is left
# at the library default.
lgbm_clf = LGBMClassifier(n_estimators=1500)
lgbm_clf.fit(train_x, train_y)

# Predicted class labels for the test feature matrix.
lgbm_pred = lgbm_clf.predict(test)

LGBM 사용

In [None]:
# Put the predictions into the sample submission's 'nerdiness' column and save
# the result without the row index.
submission = pd.read_csv('competition_data/sample_submission.csv').assign(nerdiness=lgbm_pred)
submission.to_csv('baseline.csv', index=False)

제출