# Project : Instagram Data Clustering

# 1. Data Introduce
### 1.1 Purpose : To find out popular hashtags are being used and analyze topic using Latent Dirichlet Allocation.

### 1.2 Data set:
   > - Train Data : 43264 rows, 20 columns

In [1]:
from IPython.display import display, Markdown
import private_function as pf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle as pkl
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Pretty display for notebooks
%matplotlib inline

# Ignore the warnings
import warnings
warnings.filterwarnings('ignore')

def get_x_train_and_features_name(df):
    cnt_vectorizer = CountVectorizer(lowercase=False)
    X_train = cnt_vectorizer.fit_transform(df)
#     cnt_feature_names = cnt_vectorizer.get_feature_names()
    return X_train

def run_lda(df, n_topic, max_iter = 100):
    cnt_vectorizer = CountVectorizer(lowercase=False)
    X_train = cnt_vectorizer.fit_transform(df)
    cnt_feature_names = cnt_vectorizer.get_feature_names()

    # hyper parameter
    alpha = 1
    beta = 1

    # train the model
    lda = LatentDirichletAllocation(n_components=n_topic, doc_topic_prior=alpha,\
                                    topic_word_prior=beta, learning_method='online', max_iter=max_iter)

    %time lda.fit_transform(X_train)
    
    return lda, cnt_vectorizer, cnt_feature_names

def model_test(test_list, lda_model, cnt_vectorizer):
    test_ = cnt_vectorizer.transform(test_list)
    doc_topic_dist_unnormalized = np.matrix(lda_model.transform(test_))
    doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
    print(doc_topic_dist.argmax(axis=1))
    return doc_topic_dist.argmax(axis=1)

def get_topic_using_lda(lda_model, x_train):
    doc_topic_dist_unnormalized = np.matrix(lda_model.transform(x_train))
    # normalize the distribution (only needed if you want to work with the probabilities)
    doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
    return doc_topic_dist.argmax(axis=1)

df_train = pkl.load(open("asset/df_train.pkl", "rb"))

In [327]:
df_li = pf.getAllDataFrame()
train = pf.makeOneTrainDf(df_li)
train = pf.make_df_i_want(train)
counter = pf.get_counter(train)
counter.most_common()
tag_count = 0
for tup in counter.most_common():
    tag_count += tup[1]
tag_count, len(counter)

['직장인', '럽스타그램', '서울맛집', '먹스타그램', '친스타그램', '여행', '일상', '셀스타그램']

['seoul']

job.csv : 2213

insta_new_5.csv : 2136

insta_new_4.csv : 2148

insta_new_3.csv : 2172

insta_new_2.csv : 465

insta_new_1.csv : 2029

insta_new.csv : 2203

insta_train.csv : 5

insta_food_1.csv : 2217

insta_food.csv : 1911

food.csv : 1986

beer.csv : 1970

food_1.csv : 2068

food_2.csv : 2089

food_3.csv : 2046

friend.csv : 1888

trip_1.csv : 1145

trip.csv : 2533

daily.csv : 2195

daily_2.csv : 785

daily_1.csv : 2312

selfie.csv : 2156

seongnam.csv : 960

seoul_2.csv : 19

seoul_1.csv : 960

incheon.csv : 960

seoul.csv : 880

yongin.csv : 960

gyeonggido.csv : 960

29
Df list length : 29
Df length changes after concat (if 0 means all datas are unique) : 0
Df length changes after concat (if 0 means all datas are unique) : 0
Df length changes after concat (if 0 means all datas are unique) : 1095
Df length changes after concat (if 0 means all datas are unique) : 411
Df length changes after concat (if 0 means all datas are unique) : 54
Df length changes after concat (if 0 means all datas are unique) : 131
Df length changes after concat (if 0 means all datas are unique) : 0
Df length changes after concat (if 0 means all datas are unique) : 35
Df length changes after concat (if 0 means all datas are unique) : 16
Df length changes after concat (if 0 means all datas are unique) : 75
Df length changes after concat (if 0 means all datas are unique) : 11
Df length changes after concat (if 0 means all datas are unique) : 42
Df length changes after concat (if 0 means all datas are unique) : 358
Df length changes after concat (if 0 means all datas are unique) 

(46899, 14241)

In [328]:
X_test0 = train[train["tags_str"] == ""]["tags_str"].values
X_train = train[train["tags_str"] != ""]["tags_str"].values
len(X_train), len(X_test)

(38791, 2076)

In [8]:
lda_model12 = joblib.load("lda_model12.pkl")
feature_names_12 = joblib.load("lda_model12_feature_name.pkl")
lda_model13 = joblib.load("lda_model13.pkl")
feature_names_13 = joblib.load("lda_model13_feature_name.pkl")
lda_model14 = joblib.load("lda_model14.pkl")
feature_names_14 = joblib.load("lda_model14_feature_name.pkl")

## Topic별 Top 20 키워드

### Topic 0: 여행스타그램
여행에미치다 여행스타그램 여행 travel 휴가 korea photography 일본 photo seoul 바다 감성사진 한국 trip 풍경 웨딩 럽스타그램 스냅 예신 결혼

### Topic 1: 대출 광고
신용카드현금화 카드깡 상품권현금화 소액결제 소액결제현금화 모바일문화상품권 굿핀 해피머니 컬쳐랜드 신용카드대출 일수대출 모바 홍대맛집 휴대폰소액결제 차스타그램 핸드폰소액결제 빈티지 골프 상품권 비트코인

### Topic 2: 운동스타그램
다이어트 운동하는여자 다이어터 다이어트식단 용인 diet 다이어트그램 꽃다발 꽃스타그램 식단 헬스 flower 운동 운스타그램 fitness 플로리스트 유지어터 workout 운동하는남자 식단일기

### Topic 3: 신발 광고
축구 헬스 커플신발 푸마 레플 해외직구 아웃도어 커플운동화 나이키에어맥스 신발쇼핑몰 신상신발 등산화 신발도매 명품등산화 유행신발 아디다스신발 GGDB 아식스 아디다스울트라부스트 NEWBALANCE

### Topic 4: Kpop
kpop bts exo 귀여운 blackpink 여자 korea kawaii 아름다운 自撮り boy asmr korean 속초맛집 Asia jisoo kpopl4l tomboy селфи 自分撮り

### Topic 5: 서울 맛집
서울맛집 부산맛집 강남맛집 대구맛집 곱창 홍대맛집 강남역 강남역맛집 대전맛집 강남 맛스타그램 역삼맛집 이태원맛집 역삼동맛집 신논현맛집 대치동맛집 서면맛집 곱창맛집 역삼동 서초맛집

### Topic 6: 럽스타그램
남친이랑 데이트 커플 사랑해 럽스타 커플스타그램 남자친구 영화 연애중 연애 행복 남친 고마워 사랑꾼 ㅋㅋㅋ 행복해 데이트그램 사랑 여자친구 화이팅

### Topic 7: 육아스타그램
럽스타그램 육아 육아스타그램 육아소통 육아맘 맘스타그램 도치맘 일상 젊줌마 딸스타그램 인스타베이비 애스타그램 직장인 아들스타그램 사랑해 아들맘 세젤귀 반려견 워킹맘 멍스타그램

### Topic 8: 토토 등 사이트 및 명품 광고
토토사이트추천 겐조 샤넬 서울맛집 운동하는남자 럽스타 핫플레이스 커플 구찌 에르메스 술스타그램 토토사이트 발렌티노 팔찌 스포츠가족방 사설사이트추천 파워볼가족방 프로토가족방 픽스터가족방 사설사이트

### Topic 9: 폰케이스 광고
선물 직장인 아이폰케이스 냥스타그램 마카롱 취미 고양이 폰케이스 아이폰8 커플케이스 공감 그림 글스타그램 기념일 디저트 일러스트 아이폰7케이스 아이폰x케이스 글귀 귀걸이

### Topic 10: 먹스타그램
먹스타그램 먹방 맛스타그램 맛집 맥주스타그램 food 술스타그램 점심 먹스타 맥주 instafood foodstagram 먹방스타그램 음식 존맛 맛스타 먹부림 푸드스타그램 맛있다 디저트

### Topic 11: 일상적인 해시태그
일상 맞팔 데일리 소통 선팔 셀스타그램 좋아요 셀카 셀피 팔로우 먹스타그램 daily 얼스타그램 일상스타그램 선팔하면맞팔 오오티디 럽스타그램 인친 좋아요반사 ootd

### Topic 12: 제주도 맛집 광고
제주도 제주도맛집 제주맛집 제주 서귀포맛집 제주여행 f4follow 선팔맞팔 제주맛집추천 소통해요 소통하자 제주도그램 제주서귀포맛집 맥주스타그램 제주도여행 제주도흑돼지맛집 제주흑돼지 제주도흑돼지 서귀포흑돼지맛집 서귀포흑돼지

### Topic 13: 거제도 맛집?
거제도맛집 거제맛집 포천여행 짖어야개다 평일 포천카페 햄스터 포천이동갈비 삼척맛집 성남애견미용 근무시간 스킨케어 포천맛집 거제대명리조트맛집 성남애견호텔 골든햄스터 살롱순라 햄스타그램 2018년 삼척여행

### Topic 14: 반려견스타그램 (lda topic number = 12를 통해서 찾음, 해당 모델에선 topic 8)
반려견 멍스타그램 강아지 개스타그램 냥스타그램 아이폰케이스 dog 댕댕이 고양이 펫스타그램 폰케이스 견스타그램 아이폰8 커플케이스 cat dogstagram 아이폰7케이스 아이폰x케이스 puppy 독스타그램

### Topic 15: 맥주스타그램 및 치맥 (lda topic number = 13을 통해서 찾음, 해당 모델에선 topic 6)
맥주스타그램 술스타그램 맥주 beer 혼술 치맥 수제맥주 소주 소맥 치킨 맥주한잔 술집 beerstagram 크래프트비어 맥주그램 존맛탱 낮술 생맥주 술스타 craftbeer

### 1, 3, 8, 9, 12는 다 광고성 글이다

In [22]:
tmp_li = [1, 3, 8, 9, 12]
count = 0
for topic in tmp_li:
    length = len(df_train[df_train.target == topic])
    count += length
    print(topic, length)
count

1 570
3 847
8 1451
9 1114
12 529


4511

In [26]:
df_train.columns

Index(['caption', 'comment_cnt', 'first_comment', 'id', 'is_video', 'likes',
       'loc_id', 'loc_lat', 'loc_lon', 'loc_name', 'owner_id', 'owner_name',
       'shortcode', 'taken_at_timestamp', 'video_view_count', 'tags',
       'tags_cnt', 'caption_only', 'tags_str', 'duplicated_tag', 'topic_type',
       'topic_type_13', 'topic_type_12', 'target'],
      dtype='object')

In [27]:
columns = ['caption_only', 'tags_str', 'duplicated_tag', 'topic_type', 'topic_type_13', 'topic_type_12', 'target']

In [367]:
train = df_train[columns]

In [368]:
train["caption_only"] = train["caption_only"].apply(lambda a: "" if type(a) == float else a)

In [369]:
train["caption_only"] = train["caption_only"].apply(lambda a: a.strip())

In [371]:
train = train[train["caption_only"] != ""]

In [357]:
X_test0 = train[train["tags_str"] == ""]

In [347]:
test = X_test0.caption

In [226]:
train_ = train[train.target != 11]

In [377]:
clf = Pipeline([ 
    ('vect', TfidfVectorizer(min_df=2)), 
    ('clf', MultinomialNB(alpha=0.01)) 
])

In [378]:
X_train, X_test, y_train, y_test = train_test_split(train.caption_only, train.topic_type, test_size=0.1, random_state=1)
len(X_train), len(X_test), len(y_train), len(y_test)

(32796, 3645, 32796, 3645)

In [379]:
%%time
model = clf.fit(X_train, y_train)

CPU times: user 1.22 s, sys: 173 ms, total: 1.4 s
Wall time: 1.49 s


In [348]:
test = test.dropna()

In [374]:
y_pred_test = model.predict(test)

In [355]:
X_test0 = X_test0[X_test0.caption != ""]

In [353]:
len(y_pred_test)

3861

In [362]:
X_test0 = X_test0[~X_test0.caption.isnull()]

In [375]:
X_test0["target"] = y_pred_test

In [376]:
X_test0[["caption", "target"]]

Unnamed: 0,caption,target
6,안늉?,11
10,금요일 ㅠ ㅠ !,11
31,♡ B͞͞l͞͞u͞͞e͞͞ O͞͞c͞͞e͞͞a͞͞n͞͞ ♡\n.\n여름이라 그런지 ...,9
34,"👉🏻옆으로 넘겨서 봐주세요!\n.\n✨8월 31일까지 피어싱,주얼리 10% 할인\n...",11
74,♡ S͞͞a͞͞t͞͞e͞͞l͞͞l͞͞i͞͞t͞͞e͞͞ ♡ ❤️ 신상 ❤️\n.\n미...,11
76,♡ K͞͞e͞͞y͞͞ o͞͞f͞͞ H͞͞e͞͞a͞͞r͞͞t͞͞ ♡\n.\n키 오브 ...,9
77,♡ M͞͞e͞͞l͞͞o͞͞d͞͞y͞͞ ♡ 💕 신상 💕\n.\n사랑스러운 자개하트와 ...,11
79,내새꾸💚,7
81,즐퇴~,5
87,휴무전날이라 기분이가 쌍코매💗,10


In [267]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.53      0.38      0.44       198
          1       0.92      0.72      0.81        67
          2       0.67      0.22      0.33        55
          3       0.26      0.08      0.12        75
          4       0.62      0.36      0.45        28
          5       0.77      0.62      0.68       143
          6       0.29      0.16      0.21       174
          7       0.51      0.48      0.49       343
          8       0.92      0.93      0.92       131
          9       0.68      0.42      0.52       153
         10       0.49      0.57      0.52       573
         11       0.66      0.78      0.71      1632
         12       0.90      0.60      0.72        58
         13       0.88      0.47      0.61        15

avg / total       0.61      0.62      0.60      3645



In [268]:
accuracy_score(y_test, y_pred) 

0.6170096021947874

In [128]:
def get_lgbm_dataset(X_train, X_test, y_train, y_test, is_predict = False, is_countvec = False):
    if is_countvec:
        vectorizer = CountVectorizer(lowercase=False)            
    else:
        vectorizer = TfidfVectorizer(lowercase=False)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    X_train = X_train.astype(float)
    X_test = X_test.astype(float)
    if is_predict:
        return X_train, X_test
    dtrain = lightgbm.Dataset(X_train, label=y_train)
    dtest = lightgbm.Dataset(X_test, label=y_test)
    return dtrain, dtest

In [129]:
def run_light_gbm(X_train, X_test, y_train, y_test, is_countvec=False):
    dtrain, dtest = get_lgbm_dataset(X_train, X_test, y_train, y_test, is_predict=False, is_countvec=is_countvec)
    num_boost_round = 4000
    learning_rate=0.02

    params = {'objective':'multiclass',
              'boosting_type': 'gbdt',
              'max_depth' : -1,
              'nthread': 4,
              'metric': 'multi_logloss',
              'num_class':16,
              'learning_rate':learning_rate,
              }
    
    lightgbm_model = lightgbm.train(params = params,
                                    train_set = dtrain, 
                                    valid_sets = [dtrain, dtest],
                                    num_boost_round = num_boost_round,
                                    early_stopping_rounds=10)
    
    return lightgbm_model

In [130]:
lgbm = run_light_gbm(X_train, X_test, y_train, y_test)

[1]	training's multi_logloss: 2.71779	valid_1's multi_logloss: 2.71701
Training until validation scores don't improve for 10 rounds.
[2]	training's multi_logloss: 2.66859	valid_1's multi_logloss: 2.66723
[3]	training's multi_logloss: 2.62377	valid_1's multi_logloss: 2.62193
[4]	training's multi_logloss: 2.58234	valid_1's multi_logloss: 2.58007
[5]	training's multi_logloss: 2.54408	valid_1's multi_logloss: 2.54155
[6]	training's multi_logloss: 2.50833	valid_1's multi_logloss: 2.50574
[7]	training's multi_logloss: 2.4748	valid_1's multi_logloss: 2.47221
[8]	training's multi_logloss: 2.44325	valid_1's multi_logloss: 2.44047


KeyboardInterrupt: 

In [131]:
lgbm_cnt = run_light_gbm(X_train, X_test, y_train, y_test, is_countvec=True)

[1]	training's multi_logloss: 2.71932	valid_1's multi_logloss: 2.71802
Training until validation scores don't improve for 10 rounds.
[2]	training's multi_logloss: 2.67141	valid_1's multi_logloss: 2.66908
[3]	training's multi_logloss: 2.62776	valid_1's multi_logloss: 2.62461
[4]	training's multi_logloss: 2.58755	valid_1's multi_logloss: 2.5838
[5]	training's multi_logloss: 2.5502	valid_1's multi_logloss: 2.54609
[6]	training's multi_logloss: 2.51536	valid_1's multi_logloss: 2.51091
[7]	training's multi_logloss: 2.48262	valid_1's multi_logloss: 2.47771
[8]	training's multi_logloss: 2.45185	valid_1's multi_logloss: 2.44656
[9]	training's multi_logloss: 2.42275	valid_1's multi_logloss: 2.41704
[10]	training's multi_logloss: 2.3951	valid_1's multi_logloss: 2.38913
[11]	training's multi_logloss: 2.3689	valid_1's multi_logloss: 2.36308
[12]	training's multi_logloss: 2.344	valid_1's multi_logloss: 2.33812
[13]	training's multi_logloss: 2.32014	valid_1's multi_logloss: 2.31394
[14]	training's m

[115]	training's multi_logloss: 1.52757	valid_1's multi_logloss: 1.54461
[116]	training's multi_logloss: 1.52491	valid_1's multi_logloss: 1.54232
[117]	training's multi_logloss: 1.5223	valid_1's multi_logloss: 1.54006
[118]	training's multi_logloss: 1.51973	valid_1's multi_logloss: 1.53784
[119]	training's multi_logloss: 1.51721	valid_1's multi_logloss: 1.53562
[120]	training's multi_logloss: 1.51468	valid_1's multi_logloss: 1.53348
[121]	training's multi_logloss: 1.51223	valid_1's multi_logloss: 1.53143
[122]	training's multi_logloss: 1.50981	valid_1's multi_logloss: 1.52935
[123]	training's multi_logloss: 1.50739	valid_1's multi_logloss: 1.52729
[124]	training's multi_logloss: 1.505	valid_1's multi_logloss: 1.52522
[125]	training's multi_logloss: 1.50265	valid_1's multi_logloss: 1.52329
[126]	training's multi_logloss: 1.50034	valid_1's multi_logloss: 1.52131
[127]	training's multi_logloss: 1.49804	valid_1's multi_logloss: 1.51939
[128]	training's multi_logloss: 1.49578	valid_1's mult

[228]	training's multi_logloss: 1.35505	valid_1's multi_logloss: 1.41296
[229]	training's multi_logloss: 1.35416	valid_1's multi_logloss: 1.41242
[230]	training's multi_logloss: 1.35329	valid_1's multi_logloss: 1.41192
[231]	training's multi_logloss: 1.35238	valid_1's multi_logloss: 1.41128
[232]	training's multi_logloss: 1.35151	valid_1's multi_logloss: 1.41083
[233]	training's multi_logloss: 1.35064	valid_1's multi_logloss: 1.41026
[234]	training's multi_logloss: 1.34977	valid_1's multi_logloss: 1.40978
[235]	training's multi_logloss: 1.3489	valid_1's multi_logloss: 1.40927
[236]	training's multi_logloss: 1.34806	valid_1's multi_logloss: 1.40876
[237]	training's multi_logloss: 1.34718	valid_1's multi_logloss: 1.40825
[238]	training's multi_logloss: 1.34632	valid_1's multi_logloss: 1.40771
[239]	training's multi_logloss: 1.34549	valid_1's multi_logloss: 1.40726
[240]	training's multi_logloss: 1.34464	valid_1's multi_logloss: 1.4068
[241]	training's multi_logloss: 1.34381	valid_1's mul

[341]	training's multi_logloss: 1.27791	valid_1's multi_logloss: 1.37643
[342]	training's multi_logloss: 1.27737	valid_1's multi_logloss: 1.3762
[343]	training's multi_logloss: 1.27686	valid_1's multi_logloss: 1.376
[344]	training's multi_logloss: 1.27635	valid_1's multi_logloss: 1.3759
[345]	training's multi_logloss: 1.27582	valid_1's multi_logloss: 1.37571
[346]	training's multi_logloss: 1.27532	valid_1's multi_logloss: 1.37551
[347]	training's multi_logloss: 1.27478	valid_1's multi_logloss: 1.37535
[348]	training's multi_logloss: 1.27427	valid_1's multi_logloss: 1.37514
[349]	training's multi_logloss: 1.27375	valid_1's multi_logloss: 1.37498
[350]	training's multi_logloss: 1.27323	valid_1's multi_logloss: 1.3748
[351]	training's multi_logloss: 1.27273	valid_1's multi_logloss: 1.37468
[352]	training's multi_logloss: 1.27223	valid_1's multi_logloss: 1.37444
[353]	training's multi_logloss: 1.27173	valid_1's multi_logloss: 1.37428
[354]	training's multi_logloss: 1.27121	valid_1's multi_

[454]	training's multi_logloss: 1.22789	valid_1's multi_logloss: 1.36181
[455]	training's multi_logloss: 1.22751	valid_1's multi_logloss: 1.36172
[456]	training's multi_logloss: 1.22713	valid_1's multi_logloss: 1.36154
[457]	training's multi_logloss: 1.22677	valid_1's multi_logloss: 1.36146
[458]	training's multi_logloss: 1.22639	valid_1's multi_logloss: 1.36136
[459]	training's multi_logloss: 1.22603	valid_1's multi_logloss: 1.36125
[460]	training's multi_logloss: 1.22566	valid_1's multi_logloss: 1.36117
[461]	training's multi_logloss: 1.22529	valid_1's multi_logloss: 1.36106
[462]	training's multi_logloss: 1.22491	valid_1's multi_logloss: 1.36099
[463]	training's multi_logloss: 1.22454	valid_1's multi_logloss: 1.36093
[464]	training's multi_logloss: 1.22417	valid_1's multi_logloss: 1.36086
[465]	training's multi_logloss: 1.2238	valid_1's multi_logloss: 1.36074
[466]	training's multi_logloss: 1.22343	valid_1's multi_logloss: 1.3607
[467]	training's multi_logloss: 1.22307	valid_1's mul

[567]	training's multi_logloss: 1.19045	valid_1's multi_logloss: 1.35504
[568]	training's multi_logloss: 1.19017	valid_1's multi_logloss: 1.35501
[569]	training's multi_logloss: 1.18988	valid_1's multi_logloss: 1.35495
[570]	training's multi_logloss: 1.18959	valid_1's multi_logloss: 1.35492
[571]	training's multi_logloss: 1.1893	valid_1's multi_logloss: 1.35486
[572]	training's multi_logloss: 1.18901	valid_1's multi_logloss: 1.35482
[573]	training's multi_logloss: 1.18872	valid_1's multi_logloss: 1.35477
[574]	training's multi_logloss: 1.18842	valid_1's multi_logloss: 1.35471
[575]	training's multi_logloss: 1.18813	valid_1's multi_logloss: 1.35473
[576]	training's multi_logloss: 1.18785	valid_1's multi_logloss: 1.35472
[577]	training's multi_logloss: 1.18755	valid_1's multi_logloss: 1.35472
[578]	training's multi_logloss: 1.18726	valid_1's multi_logloss: 1.35473
[579]	training's multi_logloss: 1.18697	valid_1's multi_logloss: 1.35467
[580]	training's multi_logloss: 1.18669	valid_1's mu

[680]	training's multi_logloss: 1.16038	valid_1's multi_logloss: 1.35237
[681]	training's multi_logloss: 1.16013	valid_1's multi_logloss: 1.35237
[682]	training's multi_logloss: 1.15989	valid_1's multi_logloss: 1.35233
[683]	training's multi_logloss: 1.15965	valid_1's multi_logloss: 1.3523
[684]	training's multi_logloss: 1.15941	valid_1's multi_logloss: 1.35232
[685]	training's multi_logloss: 1.15916	valid_1's multi_logloss: 1.35229
[686]	training's multi_logloss: 1.15893	valid_1's multi_logloss: 1.35228
[687]	training's multi_logloss: 1.15869	valid_1's multi_logloss: 1.35225
[688]	training's multi_logloss: 1.15846	valid_1's multi_logloss: 1.35228
[689]	training's multi_logloss: 1.15821	valid_1's multi_logloss: 1.35224
[690]	training's multi_logloss: 1.15798	valid_1's multi_logloss: 1.35222
[691]	training's multi_logloss: 1.15774	valid_1's multi_logloss: 1.35222
[692]	training's multi_logloss: 1.1575	valid_1's multi_logloss: 1.35223
[693]	training's multi_logloss: 1.15726	valid_1's mul

[793]	training's multi_logloss: 1.13509	valid_1's multi_logloss: 1.35096
Early stopping, best iteration is:
[783]	training's multi_logloss: 1.13719	valid_1's multi_logloss: 1.35089


In [115]:
dtrain, dtest = get_lgbm_dataset(X_train, X_test, y_train, y_test, True)

In [203]:
y_pred_lgbm =lgbm.predict(dtest)

In [204]:
y_pred_lgbm = y_pred_lgbm.argmax(axis=1)

In [205]:
accuracy_score(y_test, y_pred_lgbm)

0.6131687242798354

In [206]:
print(classification_report(y_test, y_pred_lgbm))

             precision    recall  f1-score   support

          0       0.65      0.33      0.44       198
          1       0.88      0.65      0.75        66
          2       0.48      0.18      0.26        55
          3       0.50      0.07      0.12        74
          4       1.00      0.12      0.21        26
          5       0.81      0.61      0.69       143
          6       0.50      0.12      0.20       163
          7       0.61      0.32      0.42       291
          8       0.96      0.93      0.95       131
          9       0.68      0.38      0.49       118
         10       0.52      0.31      0.39       449
         11       0.58      0.92      0.71      1630
         12       0.89      0.59      0.71        58
         13       1.00      0.50      0.67        10
         14       0.80      0.25      0.38        96
         15       0.63      0.31      0.42       137

avg / total       0.62      0.61      0.57      3645



In [132]:
y_pred_lgbm_cnt =lgbm_cnt.predict(dtest)

In [133]:
y_pred_lgbm_cnt = y_pred_lgbm_cnt.argmax(axis=1)

In [134]:
accuracy_score(y_test, y_pred_lgbm_cnt)

0.6008230452674898

In [135]:
print(classification_report(y_test, y_pred_lgbm_cnt))

             precision    recall  f1-score   support

          0       0.62      0.32      0.42       198
          1       0.86      0.65      0.74        66
          2       0.50      0.18      0.27        55
          3       0.56      0.07      0.12        74
          4       0.75      0.12      0.20        26
          5       0.84      0.55      0.67       143
          6       0.53      0.12      0.19       163
          7       0.58      0.32      0.41       291
          8       0.85      0.93      0.89       131
          9       0.69      0.31      0.42       118
         10       0.52      0.30      0.38       449
         11       0.58      0.92      0.71      1630
         12       0.97      0.57      0.72        58
         13       1.00      0.50      0.67        10
         14       0.65      0.18      0.28        96
         15       0.56      0.26      0.36       137

avg / total       0.61      0.60      0.55      3645



In [183]:
from sklearn.ensemble import ExtraTreesClassifier
n_estimators = 100
random_tree_model = ExtraTreesClassifier(max_depth=5, n_estimators=n_estimators)

In [184]:
random_tree_model.fit(dtrain, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=5, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [185]:
y_pred = random_tree_model.predict(dtest)

In [186]:
accuracy_score(y_test, y_pred)

0.4474622770919067

In [187]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       198
          1       0.00      0.00      0.00        66
          2       0.00      0.00      0.00        55
          3       0.00      0.00      0.00        74
          4       0.00      0.00      0.00        26
          5       1.00      0.01      0.01       143
          6       0.00      0.00      0.00       163
          7       0.00      0.00      0.00       291
          8       0.00      0.00      0.00       131
          9       0.00      0.00      0.00       118
         10       0.00      0.00      0.00       449
         11       0.45      1.00      0.62      1630
         12       0.00      0.00      0.00        58
         13       0.00      0.00      0.00        10
         14       0.00      0.00      0.00        96
         15       0.00      0.00      0.00       137

avg / total       0.24      0.45      0.28      3645



In [None]:
pd.read_excel()