In [1]:
from pycaret.classification import *
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
import collections

# Install a global "ignore" filter so no warning is printed for the rest of the session.
warnings.filterwarnings('ignore')

# Relative path prefix that is prepended to every CSV file name read below.
PATH = "../data/"
def make_days(x):
    """Return the signed number of days between a timestamp and 2020-10-31.

    ``x`` is converted with ``str()`` and its first eight characters are read
    as ``YYYYMMDD``; anything after them is ignored. Dates before 2020-10-31
    give a negative result.
    """
    stamp = str(x)
    y, m, d = map(int, (stamp[:4], stamp[4:6], stamp[6:8]))
    reference = dt.date(2020, 10, 31)
    return (dt.date(y, m, d) - reference).days

def make_hours(x):
    """Return characters 8-9 of ``str(x)`` as an int.

    Presumably ``x`` is a ``YYYYMMDDHHMMSS`` timestamp, which makes this the
    hour of day -- TODO confirm the format against the data.
    """
    return int(str(x)[8:10])

def string2num(x):
    """Strip formatting noise from a value and return it as an int.

    Every character of ``str(x)`` other than digits, ``-`` and ``.`` is
    removed (thousands separators, spaces, letters, ...). If nothing is left
    the sentinel ``-1`` is returned; otherwise the remainder is parsed as a
    float and truncated toward zero.

    Raises:
        ValueError: if the remainder is not a valid float literal
            (e.g. ``'-'`` or ``'1.2.3'``).
    """
    cleaned = re.sub(r"[^\-0-9\.]+", '', str(x))
    return int(float(cleaned)) if cleaned else -1

In [2]:
def _read_err_log(file_name):
    """Read one error-log CSV from PATH and add the derived columns."""
    log = pd.read_csv(PATH + file_name)
    log['days'] = log['time'].apply(make_days)    # day offset computed by make_days
    log['errcode'] = log['errcode'].astype(str)   # force errcode to str for every row
    return log


train_err = _read_err_log('train_err_data.csv')
test_err = _read_err_log('test_err_data.csv')

In [3]:
# Binary label vector: problem[user_id - 10000] is 1 for every user_id that
# appears in train_problem_data.csv and 0 otherwise.
# NOTE(review): assumes training user_ids lie in 10000..24999 -- TODO confirm;
# ids below 10000 would index from the end of the array.
train_prob = pd.read_csv(PATH + 'train_problem_data.csv')
problem = np.zeros(15000)
reported_ids = train_prob['user_id'].unique()
problem[reported_ids - 10000] = 1

# train_err에 있는 전체 fwver

In [4]:
# Sorted array of the distinct fwver values found in train_err.
a = np.sort(train_err['fwver'].unique())
a

array(['03.11.1141', '03.11.1149', '03.11.1167', '04.16.2641',
       '04.16.3345', '04.16.3439', '04.16.3553', '04.16.3569',
       '04.16.3571', '04.22.1442', '04.22.1656', '04.22.1666',
       '04.22.1684', '04.22.1750', '04.22.1778', '04.33.1095',
       '04.33.1125', '04.33.1149', '04.33.1171', '04.33.1185',
       '04.33.1261', '04.73.2237', '04.73.2571', '04.82.1684',
       '04.82.1730', '04.82.1778', '05.15.2090', '05.15.2092',
       '05.15.2114', '05.15.2120', '05.15.2122', '05.15.2138',
       '05.15.3104', '05.66.3237', '05.66.3571', '10', '8.5.3'],
      dtype=object)

# test_err에 있는 전체 fwver

In [5]:
# Sorted array of the distinct fwver values found in test_err.
b = np.sort(test_err['fwver'].unique())
b

array(['03.11.1141', '03.11.1149', '03.11.1167', '04.16.3439',
       '04.16.3553', '04.16.3569', '04.16.3571', '04.22.1170',
       '04.22.1448', '04.22.1478', '04.22.1608', '04.22.1656',
       '04.22.1666', '04.22.1684', '04.22.1750', '04.22.1772',
       '04.22.1778', '04.33.1125', '04.33.1149', '04.33.1171',
       '04.33.1185', '04.33.1261', '04.73.2237', '04.73.2569',
       '04.73.2571', '04.73.2577', '04.82.1684', '04.82.1730',
       '04.82.1778', '05.15.2092', '05.15.2114', '05.15.2120',
       '05.15.2138', '05.15.3104', '05.66.3237', '05.66.3571', '10',
       '10.22.1770', '10.22.1780', '8.5.3'], dtype=object)

# 공통 fwver

In [12]:
# fwver values present in both a and b, as a sorted list.
c = sorted(set(a).intersection(b))
np.array(c)

array(['03.11.1141', '03.11.1149', '03.11.1167', '04.16.3439',
       '04.16.3553', '04.16.3569', '04.16.3571', '04.22.1656',
       '04.22.1666', '04.22.1684', '04.22.1750', '04.22.1778',
       '04.33.1125', '04.33.1149', '04.33.1171', '04.33.1185',
       '04.33.1261', '04.73.2237', '04.73.2571', '04.82.1684',
       '04.82.1730', '04.82.1778', '05.15.2092', '05.15.2114',
       '05.15.2120', '05.15.2138', '05.15.3104', '05.66.3237',
       '05.66.3571', '10', '8.5.3'], dtype='<U10')

# 펌웨어에 따라서 가지고 있는 에러 타입 종류

In [13]:
# One summary row per fwver found in train_err:
#   count      - distinct train users that logged this fwver
#   test count - distinct test users that logged the same fwver
#   p          - share of those train users whose entry in `problem` is set
#   errtypes   - sorted distinct errtype values logged under this fwver
fw_rows = []
for fw, fw_group in train_err.groupby('fwver'):
    fw_users = fw_group['user_id'].unique()
    # `problem` is looked up at user_id - 10000.
    n_problem = sum(1 for uid in fw_users if problem[uid - 10000])
    n_clean = len(fw_users) - n_problem
    test_users = test_err.loc[test_err['fwver'] == fw, 'user_id'].unique()
    fw_rows.append([
        fw,
        len(fw_users),
        len(test_users),
        n_problem / (n_problem + n_clean),
        sorted(fw_group['errtype'].unique()),
    ])

q = pd.DataFrame(fw_rows, columns=['fwver', 'count','test count','p','errtypes'])
q.sort_values('count', ascending=False)

Unnamed: 0,fwver,count,test count,p,errtypes
20,04.33.1261,4447,4323,0.39105,"[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
13,04.22.1750,4331,4384,0.376356,"[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
14,04.22.1778,4174,4263,0.379971,"[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
19,04.33.1185,3436,3396,0.328289,"[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
31,05.15.2138,3099,3077,0.232333,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
6,04.16.3553,2837,2884,0.463165,"[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
2,03.11.1167,683,687,0.206442,"[1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
8,04.16.3571,502,559,0.587649,"[2, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 1..."
17,04.33.1149,160,147,0.7,"[1, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 1..."
33,05.66.3237,61,56,0.52459,"[3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16,..."


In [10]:
# Firmware history per user, plus a tally of how many users share each history.
#   fwver_used_id: user_id -> list of that user's distinct fwver values in order
#                  of first appearance, right-padded with 'not updated' up to
#                  five entries (longer histories are left as they are).
#   table:         tuple(history) -> [train users flagged in `problem`,
#                                     train users not flagged,
#                                     test users]
def _fwver_history(user_rows):
    """Return one user's distinct fwver values padded to five slots."""
    history = list(user_rows['fwver'].drop_duplicates().values)
    history.extend(['not updated'] * (5 - len(history)))
    return history


fwver_used_id = {}
table = {}

for uid, user_rows in train_err.groupby('user_id'):
    history = _fwver_history(user_rows)
    fwver_used_id[uid] = history
    counts = table.setdefault(tuple(history), [0, 0, 0])
    # `problem` is looked up at user_id - 10000; slot 0 = flagged, slot 1 = not flagged.
    counts[0 if problem[uid - 10000] else 1] += 1

for uid, user_rows in test_err.groupby('user_id'):
    history = _fwver_history(user_rows)
    fwver_used_id[uid] = history
    # Test users have no label, so they are only counted in slot 2.
    table.setdefault(tuple(history), [0, 0, 0])[2] += 1

In [11]:
# Flatten `table` into one row per firmware history.
#   '0'..'4'  - the five history slots (assumes every key of `table` has exactly
#               five entries -- TODO confirm no user has more than five fwvers)
#   'p'       - train users flagged as having a problem
#   'np'      - train users not flagged
#   'trainc'  - total train users with this history (p + np)
#   'testc'   - test users with this history
#   'problem' - p / trainc, or 0 when the history occurs only in the test set
#
# The train total used to be labelled 'testc' as well, giving the frame two
# columns with the same name; it is now 'trainc' so every column is unique.
history_rows = []
for history, (n_problem, n_clean, n_test) in table.items():
    n_train = n_problem + n_clean
    problem_rate = n_problem / n_train if n_train else 0
    history_rows.append(
        list(history) + [n_problem, n_clean, n_train, n_test, problem_rate]
    )

result = pd.DataFrame(
    history_rows,
    columns=[str(slot) for slot in range(5)] + ['p', 'np', 'trainc', 'testc', 'problem'],
)
result.sort_values('problem', ascending=False)

Unnamed: 0,0,1,2,3,4,p,np,testc,testc.1,problem
76,04.16.2641,04.33.1261,not updated,not updated,not updated,1,0,1,0,1.0
31,04.22.1750,04.22.1684,04.22.1778,not updated,not updated,5,0,5,11,1.0
89,04.73.2237,04.33.1149,04.33.1185,04.33.1261,not updated,1,0,1,0,1.0
22,04.33.1261,04.33.1149,not updated,not updated,not updated,1,0,1,0,1.0
85,04.16.3553,04.33.1125,04.33.1261,not updated,not updated,1,0,1,1,1.0
50,05.15.2138,04.33.1125,04.33.1261,not updated,not updated,1,0,1,0,1.0
27,04.82.1730,04.82.1778,not updated,not updated,not updated,1,0,1,3,1.0
47,05.66.3237,05.66.3571,04.33.1261,not updated,not updated,3,0,3,1,1.0
29,04.33.1185,04.33.1261,04.33.1149,not updated,not updated,1,0,1,3,1.0
32,04.16.3553,04.16.3571,04.33.1149,04.33.1261,not updated,21,0,21,17,1.0
