In [1]:
import pandas as pd

from scipy.stats import entropy
from collections import Counter

# Generating features for the train dataset

In [2]:
# Load the training frame and replace every missing value with an empty
# string, so that each cell handed to the feature functions supports len().
train = pd.read_csv('../Dataset/Train/Temp/lx_train.csv').fillna('')

In [3]:
################################################################

def get_val_1(s):
    """Cumulative count of the most frequent characters covering over half of ``s``.

    Characters are visited from most to least frequent and their occurrence
    counts are accumulated; as soon as the running total exceeds 50% of
    ``len(s)`` that running total is returned.

    Parameters
    ----------
    s : str
        Input string (any sized iterable of hashable items also works).

    Returns
    -------
    int
        The accumulated occurrence count, or 0 for empty input.
    """
    total = len(s)
    if total == 0:
        return 0

    # Keep the frequency table separate from the input: rebinding ``s`` to the
    # list of distinct characters would make every per-character count equal 1.
    frequencies = Counter(s)

    count = 0
    for _, occurrences in frequencies.most_common():
        count += occurrences
        if count / total > 0.5:
            return count
    return count

################################################################

def get_val_2(s, top_n=5):
    """Fraction of ``s`` made up of its ``top_n`` most frequent characters.

    Parameters
    ----------
    s : str
        Input string (any sized iterable of hashable items also works).
    top_n : int, optional
        How many of the most frequent distinct characters to include.
        Defaults to 5.

    Returns
    -------
    float or int
        Sum of the occurrence counts of the ``top_n`` most frequent characters
        divided by ``len(s)``; 0 for empty input.
    """
    total = len(s)
    if total == 0:
        return 0

    # Count on the original input; counting inside the list of distinct
    # characters would contribute exactly 1 per character.
    top_counts = Counter(s).most_common(top_n)
    covered = sum(occurrences for _, occurrences in top_counts)
    return covered / total

################################################################

def get_val_3(s):
    """Return the most frequent character of ``s``.

    When several characters share the highest count, the one that appears
    first in ``s`` is returned. Empty input yields the empty string.
    """
    if not len(s):
        return ''

    frequencies = Counter(s)
    # Counter iterates in first-seen order and max() keeps the first maximal
    # key, so ties resolve to the earliest character.
    return max(frequencies, key=frequencies.get)

################################################################
################################################################


from itertools import groupby

def dig_seq_l(s):
    """Length of the longest run of consecutive digit characters in ``s``.

    Digits are detected with ``str.isdigit``. Returns 0 when ``s`` is empty
    or contains no digits.
    """
    if not len(s):
        return 0

    longest = 0
    # Split the input into alternating digit / non-digit runs.
    for _, run in groupby(s, str.isdigit):
        chunk = ''.join(run).strip()
        if chunk.isdigit() and len(chunk) > longest:
            longest = len(chunk)
    return longest


################################################################

def chr_seq_l(s):
    """Length of the longest run of one repeated character in ``s``.

    Returns 0 for empty input.
    """
    if not len(s):
        return 0

    # groupby() without a key groups consecutive equal characters.
    return max(len(''.join(run)) for _, run in groupby(s))

################################################################

def chr_seq_c(s):
    """Return the character that forms the longest repeated run in ``s``.

    If several runs share the maximum length, the character of the earliest
    such run is returned. Empty input yields the empty string.
    """
    if not len(s):
        return ''

    longest_run = ''
    for _, run in groupby(s):
        text = ''.join(run)
        # Strict comparison keeps the first run among equally long ones.
        if len(text) > len(longest_run):
            longest_run = text
    return longest_run[0]

################################################################

In [4]:
# Derive one feature column per (feature, source column) pair. The feature
# loop is the outer one so that columns are created in the order
# <col>_val_1, <col>_val_2, <col>_val_3, <col>_dig_seq_l, <col>_chr_seq_l,
# <col>_chr_seq_c, each for SSD, SUB and SLD.
train_feature_functions = (
    ('val_1', get_val_1),
    ('val_2', get_val_2),
    ('val_3', get_val_3),
    ('dig_seq_l', dig_seq_l),
    ('chr_seq_l', chr_seq_l),
    ('chr_seq_c', chr_seq_c),
)

for feature_name, feature_function in train_feature_functions:
    for source_column in ('SSD', 'SUB', 'SLD'):
        train[source_column + '_' + feature_name] = train[source_column].apply(feature_function)

In [5]:
# Save the augmented training frame back to the CSV it was loaded from
# (the input file is overwritten); the DataFrame index is not written.

train.to_csv('../Dataset/Train/Temp/lx_train.csv', index=False)

# Generating features for the test dataset

In [7]:
# Load the test frame and replace every missing value with an empty string,
# so that each cell handed to the feature functions supports len().
test = pd.read_csv('../Dataset/Test/Temp/lx_test.csv').fillna('')

In [8]:
# Derive one feature column per (feature, source column) pair. The feature
# loop is the outer one so that columns are created in the order
# <col>_val_1, <col>_val_2, <col>_val_3, <col>_dig_seq_l, <col>_chr_seq_l,
# <col>_chr_seq_c, each for SSD, SUB and SLD.
test_feature_functions = (
    ('val_1', get_val_1),
    ('val_2', get_val_2),
    ('val_3', get_val_3),
    ('dig_seq_l', dig_seq_l),
    ('chr_seq_l', chr_seq_l),
    ('chr_seq_c', chr_seq_c),
)

for feature_name, feature_function in test_feature_functions:
    for source_column in ('SSD', 'SUB', 'SLD'):
        test[source_column + '_' + feature_name] = test[source_column].apply(feature_function)

In [9]:
# Save the augmented test frame back to the CSV it was loaded from
# (the input file is overwritten); the DataFrame index is not written.

test.to_csv('../Dataset/Test/Temp/lx_test.csv', index=False)