# Machine Learning (SVM & Random Forest)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
#pd.set_option('display.max_rows', 500)

#### Import dataset

In [3]:
ngs_reads = pd.read_json('exported_datasets/sau.json')

In [4]:
ngs_reads.reset_index(drop=True, inplace=True)

In [5]:
# convert dict type columns to dataframes
for category in ngs_reads:
    if type(ngs_reads[category][0]) is dict:
        ngs_reads[category] = ngs_reads[category].apply(pd.DataFrame)

#### extract target variable

In [6]:
target = ngs_reads.evaluation.values
ngs_reads.drop(columns='evaluation', inplace=True)
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#### remove unfinished columns from dataset

In [7]:
cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
ngs_reads.drop(ngs_reads.columns[cols], axis=1, inplace=True)

In [8]:
ngs_reads.drop(columns=['organism', 'technology', 'read_number'], inplace=True)
ngs_reads

Unnamed: 0,total_sequences,percent_gc,min_sequence_length,max_sequence_length,status_module_0,status_module_1,status_module_2,status_module_3,status_module_4,status_module_5,status_module_6,status_module_7,status_module_8,status_module_9,status_module_10
0,511669,34,35,301,2,0,2,2,0,2,2,1,2,2,2
1,220494,34,35,301,2,2,1,2,0,2,2,1,2,2,2
2,6015314,65,35,301,2,0,1,2,0,1,2,1,1,2,2
3,1916272,35,35,301,2,2,2,2,0,2,2,1,1,2,2
4,334,35,69,301,2,0,0,2,0,0,2,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,1210750,34,35,251,2,2,2,2,0,2,2,1,1,2,2
74,1323948,34,35,251,2,2,2,2,0,2,2,1,1,2,2
75,1269962,34,35,251,2,2,2,2,0,2,2,1,1,2,2
76,1210750,34,35,251,2,2,2,2,0,2,2,1,1,2,2


#### train-test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(ngs_reads, target, test_size=0.2,random_state=109)

#### Support Vector Machine Training

In [12]:
clf_svc = svm.SVC(kernel='linear')

In [13]:
clf_svc.fit(X_train, y_train)

SVC(kernel='linear')

In [14]:
y_pred=clf_svc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.625


#### Random Forest Training

In [15]:
clf_rf=RandomForestClassifier(n_estimators=100, verbose=True)

In [16]:
clf_rf.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


RandomForestClassifier(verbose=True)

In [17]:
y_pred=clf_rf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
