First, import the necessary packages.

In [49]:
import pandas as pd
import numpy as np

In [12]:
def splice_from_file(filename):
    """
    Read the genome splice dataset from a CSV file.

    filename -- path (or any other input accepted by pandas.read_csv)
                of the CSV file to load
    returns  -- pandas DataFrame holding the parsed file contents
    """
    return pd.read_csv(filename)

In [200]:
def split_features(data, n_positions=60):
    """
    Expand the 'dna' string column into one column per nucleotide position.

    NOTE: data is modified in place and the very same object is returned;
    pass data.copy() if the original frame must be preserved.

    data        -- DataFrame containing at least the columns 'id' and 'dna'
                   (the notebook passes 'class', 'id' and 'dna')
    n_positions -- number of leading characters of every sequence to turn
                   into columns; defaults to 60, the previously hard-coded
                   sequence length
    returns     -- data, with new columns dna_1 .. dna_<n_positions>
                   (1-based position) appended and the 'dna' and 'id'
                   columns removed

    Raises KeyError if 'dna' or 'id' is missing and IndexError if a
    stripped sequence is shorter than n_positions.
    """
    #split every nucleotide string into a list of single characters,
    #ignoring surrounding whitespace
    nucleotides = data['dna'].map(lambda seq: list(str(seq).strip()))
    #create one attribute (column) per position, named dna_<1-based index>
    for pos in range(n_positions):
        #bind pos as a default argument so the lambda does not depend on the loop variable
        data['dna_%d' % (pos + 1)] = nucleotides.map(lambda chars, pos=pos: chars[pos])
    #remove the old dna column (redundant information)
    del data['dna']
    #remove descriptor
    del data['id']

    return data

In [201]:
def get_splice_data():
    """
    Load the genome splice dataset from 'splice.csv' and split it into X and y.

    The file is expected to provide the columns 'class', 'id' and 'dna'
    (split_features needs 'id' and 'dna', the label comes from 'class').

    Side effect: the per-position feature table, including the 'class'
    column, is written to 'splice_indexed.csv'.

    returns -- tuple (X, y):
               X is the DataFrame of per-position nucleotide columns
               (no 'class' column);
               y is a 1-D integer numpy array in which every class label is
               replaced by its rank among the sorted unique labels, i.e. the
               same codes a label encoder fitted on the column would produce
    """
    df = pd.read_csv('splice.csv')
    #expand the dna string into one column per position
    features = split_features(df)
    features.to_csv("splice_indexed.csv", index=False)
    #exclude class column to create X dataframe; use the frame returned by
    #split_features instead of relying on df having been modified in place
    X = features.drop(columns='class')

    #encode labels as integers 0..k-1 following the sorted order of the
    #unique class values (numpy only, no extra dependency required)
    labels = features['class'].values.reshape(-1,)
    _, y = np.unique(labels, return_inverse=True)

    return X, y

In [202]:
def separate_feature_class(data):
    """
    Split a DataFrame into its feature columns and its 'class' column.

    data    -- DataFrame that contains a 'class' column
    returns -- tuple (X, y): X is data without the 'class' column,
               y is a single-column DataFrame holding only 'class'
    """
    label_frame = data.reindex(columns=['class'])
    feature_frame = data.drop(columns='class')
    return feature_frame, label_frame

In [203]:
def count_unique_percent(df, normalize=False):
    """
    Count how often each distinct value occurs anywhere in the DataFrame.

    df        -- DataFrame whose cells are tallied together, regardless of column
    normalize -- False (default) returns absolute occurrence counts;
                 True returns fractions of the total number of cells
    returns   -- Series indexed by the distinct values, ordered from the
                 most to the least frequent
    """
    #flatten DataFrame to one dimensional array and convert it to Series object
    #(.values is used because DataFrame.as_matrix() no longer exists in pandas >= 1.0)
    series = pd.Series(df.values.reshape(-1))
    #count occurrences (or fractions when normalize=True) of every unique value
    unique_counts_pct = series.value_counts(normalize=normalize)
    return unique_counts_pct

def get_odd_nucleotide_rows(df, odd_nucleotides=('N', 'R', 'S', 'D')):
    """creates dictionary of rows containing odd (non ATCG nucleotide)
       key - non ATCG nucleotide
       value - 1-D array of index labels of the rows containing that nucleotide

       df - DataFrame of single-character nucleotide cells
       odd_nucleotides - symbols to look for; defaults to the four symbols
                         N, R, S and D that were previously hard-coded.
                         Keys of the result follow the order given here.
    """

    odd_dict = {}
    for symbol in odd_nucleotides:
        #a row qualifies as soon as any of its cells equals the symbol
        row_has_symbol = df.eq(symbol).any(axis=1)
        odd_dict[symbol] = df.index[row_has_symbol.values].values.reshape(-1)
    return odd_dict

def remove_odd_nucleotide_rows(df, odd_rows_dict):
    """
    Drop every row whose index label is listed in odd_rows_dict.

    df            -- DataFrame to filter; it is not modified
    odd_rows_dict -- dictionary whose values are array-likes of index labels
                     (the keys are ignored); a label may appear under
                     several keys
    returns       -- new DataFrame without the listed rows; when the
                     dictionary is empty all rows are kept
    """
    row_groups = [np.asarray(rows).reshape(-1) for rows in odd_rows_dict.values()]
    #np.concatenate raises ValueError on an empty sequence, so an empty
    #dictionary is treated as "nothing to remove"
    if row_groups:
        to_remove_idx = np.concatenate(row_groups).tolist()
    else:
        to_remove_idx = []
    return df.drop(index=to_remove_idx)

In [204]:
# Load the raw splice dataset and give its three columns explicit names.
df = splice_from_file("data/splice_orig.csv")
df.columns = ['class', 'id', 'dna']

# Summarise the raw frame; this must run before split_features changes it.
df.info()
# NOTE: split_features modifies df in place and returns that same object,
# so after this line df and df_split refer to the same (already split) frame.
df_split = split_features(df)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3189 entries, 0 to 3188
Data columns (total 3 columns):
class    3189 non-null object
id       3189 non-null object
dna      3189 non-null object
dtypes: object(3)
memory usage: 74.8+ KB


In [205]:
# Separate the per-position nucleotide columns (X) from the 'class' column (y).
df_split_X, df_split_y = separate_feature_class(df_split)
# Tally every symbol that occurs in the feature columns; by default the
# result holds absolute counts, which exposes the few non-ACGT symbols.
count_unique_percent(df_split_X)



C    50281
G    50226
T    46298
A    44475
N       56
D        2
S        1
R        1
dtype: int64

In [206]:
# Collect the index labels of all rows that contain an N, R, S or D symbol.
odd_rows_dict = get_odd_nucleotide_rows(df_split_X)
# Drop those rows from the features and, using the same labels, from the
# class column, so that X and y keep describing the same samples.
df_filt_X = remove_odd_nucleotide_rows(df_split_X, odd_rows_dict)
df_filt_y = remove_odd_nucleotide_rows(df_split_y, odd_rows_dict)