In [None]:
!pip install GEOparse #needed to get the relevant data from GEO (Gene Expression Omnibus)
import GEOparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [None]:
# GDS2771: analysis of large airway epithelial cells from cigarette smokers without
# cancer, with cancer, and with suspect lung cancer. The results provide insight into
# the feasibility of using gene expression to detect early-stage lung cancer in smokers.
# The dataset holds the gene expression profile of the different groups
# (cancer vs. non-cancer). The goal here is to use the gene expression matrix
# (one numerical value per gene) to classify patients with and without cancer.
#
# The download is stored in the current working directory.
raw_data = GEOparse.get_GEO(geo="GDS2771", destdir=os.getcwd())

In [None]:
# Re-index the expression table by its 'ID_REF' column.
data = raw_data.table.set_index(keys='ID_REF')
# Original note: 22283 rows (gene expression values) and 193 columns (patients).
# NOTE(review): the 'IDENTIFIER' column is still part of the table here, so it is
# counted among the columns — confirm the actual number of patients.
data.shape

In [None]:
# Discard every row that contains at least one missing value, then re-check the size.
data = data.dropna(axis=0, how='any')
data.shape

In [None]:
# Prediction target: the 'disease state' annotation from the dataset's column metadata.
cancer_stat = raw_data.columns['disease state']

# Keep the 'IDENTIFIER' column aside, then remove it from the table so that only
# expression values remain as features.
identifier = data['IDENTIFIER']
data = data.drop(columns='IDENTIFIER')

In [None]:
# Drop the 'suspect cancer' samples: only cancer vs. non-cancer is predicted.
# The boolean mask is applied positionally, so the transposed table (one row per
# sample) and the labels are filtered with the same array.
not_suspect = (cancer_stat != "suspect cancer").values
data = data.T[not_suspect]
cancer_stat = cancer_stat[not_suspect]

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    cancer_stat,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a support vector classifier with default settings on the training split
# (`fit` returns the estimator itself), then predict the held-out samples.
model = SVC().fit(X_train, y_train)
preds = model.predict(X_test)

In [None]:
# Confusion matrix on the held-out test split (`preds` are the predictions for X_test).
# Scoring the training data instead would only show how well the model fits the
# samples it has already seen.
# `labels=model.classes_` pins the row/column order to the classifier's own class
# order and keeps the matrix square even if a class is absent from y_test.
# NOTE(review): the display names below assume the classes sort as
# "cancer", "no cancer" — confirm against the dataset's label values.
con = pd.DataFrame(
    confusion_matrix(y_test, preds, labels=model.classes_),
    index=["cancer", "no cancer"],
    columns=["cancer", "no cancer"],
)
con.index.name = "class"    # rows: true class
con.columns.name = "preds"  # columns: predicted class
con

In [None]:
# Read the four cells of the confusion matrix by label (row = true class,
# column = predicted class); "cancer" is treated as the positive class.
# Label-based `.loc` replaces integer keys on a string-labelled Series, a positional
# fallback that is deprecated in recent pandas versions.
n_tp = con.loc['cancer', 'cancer']
n_fn = con.loc['cancer', 'no cancer']
n_fp = con.loc['no cancer', 'cancer']
n_tn = con.loc['no cancer', 'no cancer']

# Rates are normalised by the number of samples in the corresponding true class.
tp = n_tp / (n_tp + n_fn)
tn = n_tn / (n_tn + n_fp)
fp = n_fp / (n_tn + n_fp)
fn = n_fn / (n_tp + n_fn)
# Accuracy: correctly classified samples over all samples in the matrix.
accuracy = (n_tp + n_tn) / con.to_numpy().sum()
print(
    'True Positive Rate: {} \nTrue Negative Rate: {} \nFalse Positive Rate: {} \n'
    'False Negative Rate: {} \nAccuracy: {}'.format(tp, tn, fp, fn, accuracy)
)