In [1]:
import re
import linecache

RE_DELIMITER = r'\t'
RE_DATE_TIME = r'(?=\d{2}(?:\d{2})?-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})'
RE_DATE_TIME_AFTER = r'\"(?=\d{2}(?:\d{2})?-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})\"'

In [2]:
def split_line(line: str, re_expression = RE_DELIMITER) -> []:
    pattern = re.compile(re_expression)
    return pattern.split(line.strip('\n'))


def get_header(filename, split = True) -> []:
    return get_line(filename, 0, split)


def column_names(filename):
    pass


def _quote_date_time(line) -> str:
    return re.sub(RE_DATE_TIME, RE_DATE_TIME_AFTER, line)
    
    
def get_line(filename, line_index:int = 0, split = True):
    if line_index < 0:
        return [] if split else ""
    f = open(filename, "r")
    cur_line_index = 0
    while cur_line_index < line_index:
        cur_line_index += 1
        f.readline()
    line = f.readline()
    f.close()
    #print("line content: \n{}".format(line))
    #line = linecache.getline(filename, line_index+1)
    return split_line(line) if split else line
 
    
def get_column_index(filename, col_name) -> int:
    header = get_header(filename)
    index = -1
    try:
        index = header.index(col_name)
    except:
        index = -1
    return index
    
    
def _check_col_index_or_name(col_index_or_name = None) -> int:
    if col_index_or_name is None:
        raise Error("should provide either index or name of a column")
    if type(col_index_or_name) not in [int, str]:
        raise Error("column info should be type of either integer or string")
    if type(col_index_or_name) in [str]:
        col_index = get_column_index(filename, col_index_or_name)
    else:
        col_index = col_index_or_name
    return col_index
    
def get_column(filename, col_index_or_name = None, include_header = False) -> []:
    
    col_index = _check_col_index_or_name(col_index_or_name)

    max_col_index = get_col_count(filename) - 1
    if col_index < 0 or col_index > max_col_index:
        raise Error("can not find that column")
    
    f = open(filename, "r")
    cols = []
    
    if not include_header:
        f.readline()
        
    while True:
        line = f.readline()
        if line == "": # EOF
            break
        else:
            contents = split_line(line)
            cols.append(contents[col_index])
    f.close()
    return cols


def get_column_fast(filename, col_index_or_name = None, include_header = False) -> []:
    col_index = _check_col_index_or_name(col_index_or_name)
    # TODO: add more data
    pass

def get_freq(contents: []):
    freq_dict = {}
    for c in contents:
        if c in freq_dict:
            freq_dict[c] += 1
        else:
            freq_dict[c] = 1
            
    return freq_dict
    
    
def get_info(content):
    if type(content) in [str]:
        columns = split_line(content, RE_DELIMITER)
    else:
        columns = content
    info = []
    info.append(("total count", len(columns)))
    min_len_column, max_len_column = "", ""
    min_len_index, max_len_index = None, None
    min_len, max_len = 999, 0
    for i, column in enumerate(columns):
        l = len(column)
        if l < min_len:
            min_len = l
            min_len_index = i
            min_len_column = column
        elif l > max_len:
            max_len = l
            max_len_column = column
            max_len_index = i            
                
    freq = get_freq(columns)        
    info.append(("different values", len(freq.keys())))
    info.append(("min len index", min_len_index))
    info.append(("min len", min_len))
    info.append(("min len value", min_len_column))
    info.append(("max len index", max_len_index))
    info.append(("max len", max_len))
    info.append(("max len value", max_len_column))   
    show(info)
    return

        
def header_info(filename, info = None):
    header = get_header(filename)
    columns = line_info(header, info)
    return columns

def get_line_count(filename) -> int:
    f = open(filename, "r")
    count = 0
    while True:
        s = f.readline() 
        if s != "":
            count += 1
        else:
            break
        
    f.close()
    return count


def get_col_count(filename) -> int:
    return len(get_line(filename, 0))


def check_columns(filename, target_col_num):
    f = open(filename, "r")
    while True:
        s = f.readline()
        if s == "":
            return True
        elif len(split_line(s)) not in [target_col_num]:
            return False
    
def show(info):
    for line in info:
        print("{}: {}".format(line[0], line[1]))

In [4]:
file = "data\\ukb44409_100.tab"
header = get_header(file)
print(len(header))
get_info(header)
count = get_line_count(file)
print("lines: ", count)
data_field = "f.21000.0.0"
col_index = get_column_index(file, data_field)
print("col index of [{}] is {}".format(data_field, col_index))
if(col_index > -1):
    cols = get_column(file, col_index)
    print(cols)
#print(len(cols))

18152
total count: 18152
different values: 18152
min len index: 0
min len: 5
min len value: f.eid
max len index: 9158
max len: 13
max len value: f.20198.0.100
lines:  101
col index of [f.21000.0.0] is 9784
['1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '4002', '1001', '1001', '1001', '4002', '1001', '1001', '1001', '1001', '1001', '1001', '1002', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1003', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '3004', '4001', '1001', '1001', '1001', '1001', '1001', '1001', '4002', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '3004', '1001', '2003', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '1001', '

In [23]:
file = "data\\sex_pca_kinship.txt"
header = get_header(file)
print(header)
get_info(header)
count = get_line_count(file)
print("lines: ", count)
data_field = "f.22021.0.0"
col_index = get_column_index(file, data_field)
print("col index of [{}] is {}".format(data_field, col_index))
if(col_index > -1):
    cols = get_column(file, col_index)
    print(cols)
#print(len(cols))

['f.eid f.31.0.0 f.22001.0.0 f.22009.0.1 f.22009.0.2 f.22009.0.3 f.22021.0.0']
total count: 1
different values: 1
min len index: 0
min len: 74
min len value: f.eid f.31.0.0 f.22001.0.0 f.22009.0.1 f.22009.0.2 f.22009.0.3 f.22021.0.0
max len index: None
max len: 0
max len value: 
lines:  502494
col index of [f.22021.0.0] is -1


In [11]:
data_fields = ['f.22009.0.1', 'f.22009.0.2', 'f.22009.0.3', 'f.22009.0.4', 'f.22009.0.5', 'f.22009.0.6', 'f.22009.0.7', 'f.22009.0.8', 'f.22009.0.9', 'f.22009.0.10', 'f.22009.0.11', 'f.22009.0.12', 'f.22009.0.13', 'f.22009.0.14', 'f.22009.0.15', 'f.22009.0.16', 'f.22009.0.17', 'f.22009.0.18', 'f.22009.0.19', 'f.22009.0.20', 'f.22009.0.21', 'f.22009.0.22', 'f.22009.0.23', 'f.22009.0.24', 'f.22009.0.25', 'f.22009.0.26', 'f.22009.0.27', 'f.22009.0.28', 'f.22009.0.29', 'f.22009.0.30', 'f.22009.0.31', 'f.22009.0.32', 'f.22009.0.33', 'f.22009.0.34', 'f.22009.0.35', 'f.22009.0.36', 'f.22009.0.37', 'f.22009.0.38', 'f.22009.0.39', 'f.22009.0.40']


for data_field in data_fields:
    col_index = get_column_index(file, data_field)
    #print("col index of [{}] is {}".format(data_field, col_index))
    cols = get_column(file, col_index)
    print(type(cols), len(cols))

<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100
<class 'list'> 100


In [14]:
pca_snp_file = "snp_pca_map.txt"
header = get_header(pca_snp_file)
print(header)
count = get_line_count(pca_snp_file)
print(count)

['1', 'rs116390263', '0', '772927', 'C', 'T', '0.96968', '0.000336452', '0.000893053', '-0.000262368', '-0.00191726', '0.000552108', '-0.00318146', '0.0013126', '-0.00130153', '-0.0016725', '-0.00303943', '-0.00281615', '0.00322987', '-0.00288765', '-0.00443558', '-0.00278874']
101284


In [4]:
file = "ukb44409_100.tab"
header = get_header(file)
#print(header)
#get_info(header)
count = get_line_count(file)
print(count)
col_index = get_column_index(file, 'f.22001.0.0')
print(col_index)
cols = get_column(file, col_index)
print(cols)
print(len(cols))

101
9995
['1', '0', '1', 'NA', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '1', '0', '1', '1', '1', '0', '0', '1', '1', '0', '0', '1', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', 'NA', '1', '0', '0', 'NA', '0', '1', '1', 'NA', '0', '0', '0', '0', '0', '0', 'NA', '1', '1', '0', '1', '1', '0', '1', '1', '0', '1', '1', '1', '0', '0', '1', '1', '0', '1', '0', '0', '1', '1', '1', '1', '0', '0', '1', '1', '0', '0', '0', 'NA', '0', '0', '1']
100


In [9]:
file = "ukb44409_100.tab"
header = get_header(file)
#print(header)
#get_info(header)
count = get_line_count(file)
print(count)
#col_index = get_column_index(file, 0)
#print(col_index)
cols = get_column(file, 1)
print(cols)
print(len(cols))

101
['458', '821', '719', '694', '1323', '753', '457', '407', '297', '332', '290', '437', '493', '357', '583', '260', '454', '453', '391', '363', '445', '545', '431', '198', '632', '357', '502', '239', '403', '381', '458', '912', '572', '390', '486', '609', '558', '599', '406', '300', '670', '326', '754', '451', '410', '425', '534', '410', '756', '552', '401', '330', '619', '482', '415', '255', '652', 'NA', '1000', '659', '332', '455', '242', '237', '653', '441', '466', '540', '481', '384', '299', '388', '374', '661', '588', '841', '774', '493', '521', '406', '1070', '666', '978', '321', '484', '320', '621', '315', '1050', '321', '354', '605', '362', '553', '676', '341', '382', '659', '412', '445']
100


In [14]:
line1 = get_header(file, True)
print(header[0:10])
get_info(line1)

['f.eid', 'f.3.0.0', 'f.3.1.0', 'f.3.2.0', 'f.3.3.0', 'f.4.0.0', 'f.4.1.0', 'f.4.2.0', 'f.4.3.0', 'f.5.0.0']
total count: 18152
different values: 18152
min len index: 0
min len: 5
min len value: f.eid
max len index: 9158
max len: 13
max len value: f.20198.0.100


In [22]:
sex_file = "sex31_all.txt"
header = get_header(sex_file)
print(header)
get_info(header)
count = get_line_count(sex_file)
print(count)
col_index = 0
print(col_index)
cols = get_column(sex_file, col_index)
#print(cols)
print(len(cols))
sex_stat = get_freq(cols)
print(sex_stat)

['f.31.0.0']
total count: 1
different values: 1
min len index: 0
min len: 8
min len value: f.31.0.0
max len index: None
max len: 0
max len value: 
502494
0
502493
{'NA': 1, '0': 273377, '1': 229115}


In [79]:
type("asd") in [int, str]

True

In [15]:
sex_file = "eid_two_sexes_all.txt"
import pandas as pd

ImportError: No module named 'pandas'

In [13]:
int("12")

12