## 파일불러오기

In [159]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 전처리

In [165]:
train  = pd.read_csv('/content/gdrive/MyDrive/competition_data/train.csv')
test  = pd.read_csv('/content/gdrive/MyDrive/competition_data/test.csv')

# country컬럼 Mapping
country_map = {}

for i, c in enumerate(train['country'].unique()):
    country_map[c] = i

train['country'] = train['country'].map(country_map)
test['country'] = test['country'].map(country_map) 


# 이상치는 0으로(age, introelapse, testelapse, surveryelapse, familysize 컬럼)
train.loc[train.age > 80, 'age'] = 0
train.loc[train.familysize > 50, 'familysize'] = 0
train.loc[train.introelapse <= train.introelapse.quantile(0.025), 'introelapse'] = 0
train.loc[train.testelapse <= train.testelapse.quantile(0.025), 'testelapse'] = 0
train.loc[train.surveyelapse <= train.surveyelapse.quantile(0.025), 'surveyelapse'] = 0
train.loc[train.introelapse >= train.introelapse.quantile(0.975), 'introelapse'] = 0
train.loc[train.testelapse >= train.testelapse.quantile(0.975), 'testelapse'] = 0
train.loc[train.surveyelapse >= train.surveyelapse.quantile(0.975), 'surveyelapse'] = 0


test.loc[test.age > 80, 'age'] = 0
test.loc[test.familysize > 50, 'familysize'] = 0
test.loc[test.introelapse <= test.introelapse.quantile(0.025), 'introelapse'] = 0
test.loc[test.testelapse <= test.testelapse.quantile(0.025), 'testelapse'] = 0
test.loc[test.surveyelapse <= test.surveyelapse.quantile(0.025), 'surveyelapse'] = 0
test.loc[test.introelapse >= test.introelapse.quantile(0.975), 'introelapse'] = 0
test.loc[test.testelapse >= test.testelapse.quantile(0.975), 'testelapse'] = 0
test.loc[test.surveyelapse >= test.surveyelapse.quantile(0.975), 'surveyelapse'] = 0


# TIPI 점수
flipping_columns2 = ['TIPI2', 'TIPI4', 'TIPI6', 'TIPI8', 'TIPI10']
for flip in flipping_columns2: 
    train[flip] = 8 - train[flip]
    test[flip] = 8 - test[flip]

train['Extraversion'] = (train['TIPI1'] + train['TIPI6'])/2 
train['Agreeableness'] = (train['TIPI2'] + train['TIPI7'])/2
train['Conscientiousness'] = (train['TIPI3'] + train['TIPI8'])/2
train['EmotionalStability'] = (train['TIPI4'] + train['TIPI9'] )/2
train['OpennesstoExperiences'] = (train['TIPI5'] + train['TIPI10'] )/2 

test['Extraversion'] = (test['TIPI1'] + test['TIPI6'])/2 
test['Agreeableness'] = (test['TIPI2'] + test['TIPI7'])/2
test['Conscientiousness'] = (test['TIPI3'] + test['TIPI8'])/2
test['EmotionalStability'] = (test['TIPI4'] + test['TIPI9'] )/2
test['OpennesstoExperiences'] = (test['TIPI5'] + test['TIPI10'] )/2 

In [166]:
# index컬럼 제거
train = train.drop(['index'],axis = 1)
test =test.drop(['index'],axis = 1)

# 결측지 처리 (비어있는 건 0으로 채움)
train = train.fillna(0)
test = test.fillna(0)

# train을 target과 feature로 나눠줍니다.
train_x=train.drop(['nerdiness'], axis=1)
train_y=train['nerdiness']

# 정규화(StandardScaler사용).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(train_x)

scaled_train_x = scaler.transform(train_x)
train_x = pd.DataFrame(scaled_train_x,columns=train_x.columns)

scaled_test_x = scaler.transform(test)
test = pd.DataFrame(scaled_test_x,columns=train_x.columns)

count    15000.000000
mean         2.163867
std          0.728274
min          0.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: urban, dtype: float64


## 교차검증

In [167]:
from sklearn.model_selection import StratifiedKFold #교차검증
from sklearn.ensemble import RandomForestClassifier #모델
from sklearn.model_selection import GridSearchCV #하이퍼파라미터 튜닝을 위한 불러오기

skf = StratifiedKFold(n_splits=5,shuffle=True, random_state=11) # 5-fold, 성능이 높아지는 것을 확인하기위해 random_state값 고정

forest_accuracy=[]

for train_index, test_index in skf.split(train_x, train_y):
    label_train= train_y.iloc[train_index]
    label_test= train_y.iloc[test_index]

    forest_model = RandomForestClassifier(
      n_estimators=1000, 
      random_state=42, 
      n_jobs=-1,
      max_depth=25,
    )

    # 그리드 서치 알고리즘(최적의 파라미터 값 찾는데 이용)
    # parameters = {'n_estimators':[900,1000,1100],'max_depth':[24,25,26]}
    # grid = GridSearchCV(forest_model, parameters,scoring='accuracy')
    # grid.fit(train_x.iloc[train_index], train_y.iloc[train_index])        # 학습
    # print(grid.best_params_)                                              # -> 가장 최적의 파라미터값 출력
    # pred = grid.score(train_x.iloc[test_index],label_test)                # 평가

    forest_model.fit(train_x.iloc[train_index], train_y.iloc[train_index])  # 학습
    pred = forest_model.score(train_x.iloc[test_index],label_test)          # 평가
    forest_accuracy.append(pred)
    print(pred)

print("\n")
print("mean:",np.mean(forest_accuracy)) # 평균성능

0.78
0.7986666666666666
0.797
0.7973333333333333
0.7996666666666666


mean: 0.7945333333333333


## 모델돌리기

In [168]:
from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier(
  n_estimators=1000, 
  random_state=42, 
  n_jobs=-1,
  max_depth=25,
)

forest_model.fit(train_x, train_y)            # 학습
pred = forest_model.predict_proba(test)[:,1]  # 평가 - 확률값으로 반환

print(pred)
print(len(pred))

[0.256      0.83205164 0.85276279 ... 0.91203688 0.234      0.63279572]
35452


In [169]:
submission = pd.read_csv('/content/gdrive/MyDrive/competition_data/sample_submission.csv')

submission

Unnamed: 0,index,nerdiness
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1
...,...,...
35447,35447,-1
35448,35448,-1
35449,35449,-1
35450,35450,-1


In [170]:
submission["nerdiness"] = pred
submission

Unnamed: 0,index,nerdiness
0,0,0.256000
1,1,0.832052
2,2,0.852763
3,3,0.607929
4,4,0.837600
...,...,...
35447,35447,0.850946
35448,35448,0.718724
35449,35449,0.912037
35450,35450,0.234000


In [171]:
submission.to_csv("baseline.csv", index = False)