# Selecting data with feature selection and main effects removal

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
import random as rnd
from os import fstat
from sys import exit
from pyplink import PyPlink

In [3]:
# SNP identifiers taken from the second column of filtered_SNPs.csv.
names = list(pd.read_csv("filtered_SNPs.csv").iloc[:, 1])
names

['rs1739855',
 'rs2296716',
 'rs4074196',
 'rs2377037',
 'rs7512482',
 'rs12562937',
 'rs2494626',
 'rs78504402',
 'rs11588930',
 'rs12753298',
 'rs12409875',
 'rs2477702',
 'rs3122923',
 'rs12406506',
 'rs11584295',
 'rs72848552',
 'rs28734787',
 'rs4310388',
 'rs114996135',
 'rs11590198',
 'rs11590361',
 'rs57572926',
 'rs10909851',
 'rs76031446',
 'rs897623',
 'rs56406535',
 'rs17387572',
 'rs61766528',
 'rs4648482',
 'rs897619',
 'rs12119163',
 'rs2925499',
 'rs2485944',
 'rs6691208',
 'rs1569419',
 'rs2993493',
 'rs12094678',
 'rs3002685',
 'rs6669407',
 'rs10492937',
 'rs2244013',
 'rs10909948',
 'rs1896',
 'rs2794327',
 'rs3765753',
 'rs7551237',
 'rs10909830',
 'rs12080516',
 'rs16840810',
 'rs7418714',
 'rs6426417',
 'rs17348839',
 'rs4654556',
 'rs1157688',
 'rs241224',
 'rs12122754',
 'rs4654454',
 'rs10915577',
 'rs16839450',
 'rs449154',
 'rs1120120',
 'rs9439532',
 'rs12045223',
 'rs499416',
 'rs11801594',
 'rs12724170',
 'rs12049337',
 'rs1081454',
 'rs2227911',
 'rs3789

In [4]:
# Build the genotype frame for every SNP listed in `names`.
#
# The matching genotype vectors are gathered in a dict and the DataFrame is
# built once at the end: inserting columns one by one fragments the frame
# (pandas raises a PerformanceWarning), and a set turns the per-locus
# membership test into a hash lookup instead of a scan of the whole list.
wanted_snps = set(names)
genotype_columns = {}
with PyPlink("CD_UC_CON_QCed_rel1_without_relatives_maf0.05_hwe0.001_Liu2015_232SNPs_LD0.75_noFilter_binary") as bed:
    # Marker (BIM) and sample (FAM) tables of the PLINK fileset.
    bim = bed.get_bim()
    fam = bed.get_fam()

    # Iterate over all loci, keeping only the requested ones (file order).
    for loci_name, genotypes in bed:
        if loci_name in wanted_snps:
            # Copy so the stored column owns its data independently of the reader.
            genotype_columns[loci_name] = genotypes.copy()

df = pd.DataFrame(genotype_columns)
# Phenotype column, taken from the FAM table's "status" field.
df["Y"] = fam["status"]

In [19]:
# Display the genotype frame built above (pandas truncates the rendering).
df

Unnamed: 0,rs1739855,rs2296716,rs4074196,rs2377037,rs7512482,rs12562937,rs2494626,rs78504402,rs11588930,rs12753298,...,rs28681372,rs11101958,rs882753,rs74798979,rs470119,rs4040041,rs9616810,rs9628185,rs9628187,Y
0,0,0,2,0,0,1,1,0,1,0,...,1,1,0,1,0,0,0,2,0,1
1,0,0,0,2,1,1,0,0,0,0,...,1,1,1,1,1,0,0,1,1,1
2,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,2,1,1
3,1,0,1,0,0,1,1,0,0,0,...,1,0,0,1,1,1,1,1,1,1
4,0,0,0,1,0,1,1,0,0,0,...,0,1,0,0,0,0,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66275,0,0,1,1,0,0,0,0,0,0,...,1,1,0,0,0,2,1,0,1,1
66276,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,2,0,1
66277,0,0,2,0,0,0,0,0,0,1,...,0,0,0,2,0,0,0,2,0,1
66278,0,2,0,0,0,0,1,0,1,0,...,1,0,0,0,0,1,0,1,1,1


# SELECTING DATA FROM ReliefF

In [42]:
# Load the ReliefF score table.
# - The path uses a forward slash so it resolves on Windows and POSIX alike
#   (the previous "\\" separator only worked on Windows).
# - The separator is a raw-string regex: '\s' in a plain string literal is an
#   invalid escape sequence and triggers a warning on recent Python versions.
# - The first two lines of the file are skipped before the header is parsed.
ReliefF = pd.read_csv(
    "Relieff_data/relieff-scores-10-100-Complete_withANSWER_corrected_for_confounding_corrected_on_1st_column.csv",
    sep=r"\s+",
    skiprows=2,
)
# Keep only the first two columns (displayed as `===` and `SCORES`).
ReliefF = ReliefF.iloc[:, 0:2]
ReliefF

Unnamed: 0,===,SCORES
0,rs11805303,0.004511
1,rs7517847,0.004352
2,rs2066844,0.004236
3,rs11580078,0.003983
4,rs6669582,0.003608
...,...,...
38220,rs11783418,-0.002613
38221,rs2244894,-0.002733
38222,rs11991118,-0.002766
38223,rs11250076,-0.002876


Keeping only the SNPs listed in `names`, which removes rows with a strong main effect (those excluded by the p-value filter), and then taking the first 400 rows.

In [43]:
# Restrict the ReliefF table to SNPs present in `names`, then keep the first
# 400 rows of what remains (taking top-400 in the file's row order).
Rel_red = ReliefF[ReliefF['==='].isin(names)].head(400)
Rel_red

Unnamed: 0,===,SCORES
12,rs11634560,0.002707
14,rs4982766,0.002651
23,rs10491640,0.002453
30,rs199529,0.002348
36,rs76200426,0.002296
...,...,...
809,rs6832480,0.001350
813,rs2404233,0.001347
815,rs9332575,0.001345
816,rs79309130,0.001344


In [44]:
# Alphabetically sorted SNP identifiers from the first column of Rel_red.
names_rel = sorted(Rel_red.iloc[:, 0])
names_rel

['imm_2_234147184',
 'rs10028107',
 'rs10103667',
 'rs10104380',
 'rs1012281',
 'rs10146912',
 'rs10158092',
 'rs10159477',
 'rs10179521',
 'rs10195337',
 'rs1034610',
 'rs10485641',
 'rs10491020',
 'rs10491111',
 'rs10491548',
 'rs10491640',
 'rs10492937',
 'rs10497576',
 'rs10500471',
 'rs10503716',
 'rs10516988',
 'rs10519046',
 'rs1061768',
 'rs1065489',
 'rs10734255',
 'rs10778338',
 'rs1078109',
 'rs10795158',
 'rs10799916',
 'rs10808469',
 'rs10817960',
 'rs10834401',
 'rs10896012',
 'rs10912694',
 'rs10917700',
 'rs10933973',
 'rs10937387',
 'rs11038190',
 'rs11084402',
 'rs11103233',
 'rs11104967',
 'rs11117717',
 'rs11121035',
 'rs111245745',
 'rs11142947',
 'rs11160186',
 'rs11164623',
 'rs11235706',
 'rs11242895',
 'rs11246288',
 'rs112743107',
 'rs11524508',
 'rs11542462',
 'rs11583823',
 'rs11590361',
 'rs11619043',
 'rs11631432',
 'rs11634560',
 'rs1163787',
 'rs11644277',
 'rs11651246',
 'rs11667601',
 'rs11679061',
 'rs11730442',
 'rs11784358',
 'rs11784679',
 'rs11802

In [45]:
# Build the genotype frame for the SNPs selected in `names_rel`.
#
# The matching genotype vectors are gathered in a dict and the DataFrame is
# built once at the end: inserting columns one by one fragments the frame
# (pandas raises a PerformanceWarning), and a set turns the per-locus
# membership test into a hash lookup instead of a scan of the whole list.
wanted_snps = set(names_rel)
genotype_columns = {}
with PyPlink("CD_UC_CON_QCed_rel1_without_relatives_maf0.05_hwe0.001_Liu2015_232SNPs_LD0.75_noFilter_binary") as bed:
    # Marker (BIM) and sample (FAM) tables of the PLINK fileset.
    bim = bed.get_bim()
    fam = bed.get_fam()

    # Iterate over all loci, keeping only the requested ones (file order).
    for loci_name, genotypes in bed:
        if loci_name in wanted_snps:
            # Copy so the stored column owns its data independently of the reader.
            genotype_columns[loci_name] = genotypes.copy()

df = pd.DataFrame(genotype_columns)
# Phenotype column, taken from the FAM table's "status" field.
df["Y"] = fam["status"]

In [46]:
# Display the frame restricted to the SNPs in names_rel (rendering is truncated).
df

Unnamed: 0,rs11590361,rs10492937,rs241224,rs79479445,rs11121035,rs2273289,rs10158092,rs12049377,rs72649467,rs12725096,...,rs9612574,rs79858848,rs738791,rs5761256,rs2857644,rs2071747,rs3218339,rs763036,rs16993718,Y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,1,0,1
1,0,0,0,1,1,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,1
2,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,2,0,0,1,0,1
3,0,0,0,0,1,0,1,0,0,2,...,0,0,0,2,1,1,0,1,2,1
4,0,0,1,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66275,1,1,0,0,0,1,1,0,0,1,...,1,0,2,0,0,0,0,0,0,1
66276,0,1,0,0,0,0,0,1,1,0,...,1,0,0,1,0,0,0,0,1,1
66277,0,1,1,0,0,0,0,0,0,0,...,0,0,2,1,0,0,1,1,0,1
66278,0,1,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,1,1


In [47]:
# Cast every column of the frame to float64 in a single call, then write the
# table to CSV without the index.
df = df.astype(np.float64)

df.to_csv("ReliefF_output_top400_corrected.csv", index=False)

# SELECTING DATA FROM EPISCAN

In [6]:
# Load the Episcan SNP table (comma-separated, which is read_csv's default).
episcan = pd.read_csv("snp_episcan_functional_corrected.csv")
episcan

Unnamed: 0.1,Unnamed: 0,x
0,1,rs1990623
1,2,rs5743293
2,3,rs2066845
3,4,rs1420871
4,5,rs7205760
...,...,...
5537,5538,rs10922573
5538,5539,rs12211012
5539,5540,rs17022433
5540,5541,rs10875071


Keeping only the SNPs listed in `names`, which removes rows with a strong main effect (those excluded by the p-value filter), and then taking the first 400 rows.

In [7]:
# Restrict the Episcan table to SNPs present in `names`, keep the first 400
# rows (taking top-400 in the file's row order), and pull the identifiers
# out of the second column.
Rel_red = episcan[episcan['x'].isin(names)].head(400)
names_rel = list(Rel_red.iloc[:, 1])
names_rel

['rs6500336',
 'rs16948451',
 'rs17496307',
 'rs12919875',
 'rs9391858',
 'rs2294883',
 'rs241407',
 'rs61815628',
 'rs61814969',
 'rs17427599',
 'rs17834885',
 'rs397081',
 'rs10947262',
 'rs2284190',
 'rs28394820',
 'rs6687015',
 'rs1044043',
 'rs7711099',
 'rs4959028',
 'rs4148872',
 'rs2233976',
 'rs241424',
 'rs12524487',
 'rs9268541',
 'rs2857210',
 'rs17552052',
 'rs9268365',
 'rs2523656',
 'rs2239701',
 'rs12528584',
 'rs6089949',
 'rs62358202',
 'rs11756897',
 'rs79086701',
 'rs3763366',
 'rs241425',
 'rs73999973',
 'rs3767218',
 'rs2516470',
 'rs12555131',
 'rs3132453',
 'rs6667202',
 'rs9262615',
 'rs241410',
 'rs719654',
 'rs76140443',
 'rs113592495',
 'rs2256974',
 'rs2239707',
 'rs395832',
 'rs9257799',
 'rs3094576',
 'rs241437',
 'rs6936863',
 'rs2075789',
 'rs9263966',
 'rs78395062',
 'rs112970572',
 'rs471942',
 'rs1324580',
 'rs7764819',
 'rs9784876',
 'rs28894977',
 'rs12742115',
 'rs12755215',
 'rs61738759',
 'rs111720341',
 'rs2621377',
 'rs2394882',
 'rs12488468',

In [8]:
# Build the genotype frame for the SNPs selected in `names_rel`.
#
# The matching genotype vectors are gathered in a dict and the DataFrame is
# built once at the end: inserting columns one by one fragments the frame
# (pandas raises a PerformanceWarning), and a set turns the per-locus
# membership test into a hash lookup instead of a scan of the whole list.
wanted_snps = set(names_rel)
genotype_columns = {}
with PyPlink("CD_UC_CON_QCed_rel1_without_relatives_maf0.05_hwe0.001_Liu2015_232SNPs_LD0.75_noFilter_binary") as bed:
    # Marker (BIM) and sample (FAM) tables of the PLINK fileset.
    bim = bed.get_bim()
    fam = bed.get_fam()

    # Iterate over all loci, keeping only the requested ones (file order).
    for loci_name, genotypes in bed:
        if loci_name in wanted_snps:
            # Copy so the stored column owns its data independently of the reader.
            genotype_columns[loci_name] = genotypes.copy()

df = pd.DataFrame(genotype_columns)
# Phenotype column, taken from the FAM table's "status" field.
df["Y"] = fam["status"]

In [9]:
# Display the frame restricted to the SNPs in names_rel (rendering is truncated).
df

Unnamed: 0,rs17401735,rs3767218,rs1796918,rs12742115,rs12755215,rs1885276,rs11209002,rs950031,rs11161574,rs6662887,...,rs8127483,rs7499,rs35883266,rs137585,rs738331,rs1569498,rs12627970,rs137698,rs2069235,Y
0,0,0,0,0,0,0,0,1,0,1,...,1,1,0,0,1,0,0,1,0,1
1,0,0,0,0,1,1,1,0,1,1,...,1,0,0,2,1,1,1,2,0,1
2,1,0,1,0,0,0,0,1,1,1,...,0,2,0,0,0,0,0,0,0,1
3,1,0,1,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,1,1
4,0,0,0,0,0,0,0,1,0,0,...,0,2,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66275,0,0,0,1,0,1,1,1,0,1,...,1,0,1,1,0,0,0,1,0,1
66276,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1
66277,0,0,0,0,0,0,0,1,1,0,...,1,1,0,1,1,1,1,2,2,1
66278,0,0,0,0,0,1,1,0,1,1,...,0,0,0,1,1,1,1,1,1,1


In [10]:
# Cast every column of the frame to float64 in a single call, then write the
# table to CSV without the index.
df = df.astype(np.float64)

df.to_csv("Episcan_episcan_functional_corrected_top400.csv", index=False)