# SVM

In machine learning, support vector machines (SVM) are supervised learning models with associated learning algorithms that analyze data used for classification and regression analysis. Given a set of training examples, each marked as belonging to one or the other of two categories, an SVM training algorithm builds a model that assigns new examples to one category or the other, making it a non-probabilistic binary linear classifier. 

<b> Imports

In [1]:
from sklearn import preprocessing, cross_validation, neighbors, svm
from os import listdir
from os.path import isfile, join
import numpy as np
import geopandas as gpd
import pandas as pd
import requests
import io



<b> Set Paths

In [2]:
falsospath = "/home/sansigolo/Documents/git/CAP-240-394/src/SVM/FALSOS_221_067-09-26/"

queimadaspath = "/home/sansigolo/Documents/git/CAP-240-394/src/SVM/QUEIMADAS_221_067-09-26/"

<b> Get Filenames

In [3]:
falsosfilenames = [y for y in listdir(falsospath) for ending in ['dbf', 'shp', 'prj', 'shx'] if y.endswith(ending)] 

queimadasfilenames = [y for y in listdir(queimadaspath) for ending in ['dbf', 'shp', 'prj', 'shx'] if y.endswith(ending)] 

<b> Print Filenames

In [4]:
print(falsosfilenames)
print("")
print(queimadasfilenames)

['FALSOS_221_067-09-26.prj', 'FALSOS_221_067-09-26.shx', 'FALSOS_221_067-09-26.dbf', 'FALSOS_221_067-09-26.shp']

['QUEIMADAS_221_067-09-26.shx', 'QUEIMADAS_221_067-09-26.dbf', 'QUEIMADAS_221_067-09-26.prj', 'QUEIMADAS_221_067-09-26.shp']


<b> Set the Shapefile

In [5]:
f_dbf, f_shp, f_prj,  f_shx = [falsosfilename for falsosfilename in falsosfilenames]

falsos = gpd.read_file(falsospath+f_shp)

<b> Set the Shapefile

In [6]:
q_dbf, q_shp, q_prj,  q_shx = [queimadasfilename for queimadasfilename in queimadasfilenames]

queimadas = gpd.read_file(queimadaspath+q_shp)

<b> Print Shape

In [7]:
print("\nFalsos Shape: {}".format(falsos.shape)+"\n")

print("Queimadas Shape: {}".format(queimadas.shape))


Falsos Shape: (53, 36)

Queimadas Shape: (1213, 36)


<b> Create Dataframe

In [8]:
df = pd.concat([falsos, queimadas], ignore_index=True)
df.replace('?', -99999, inplace=True)

<b> Print Dataframe tail

In [9]:
print("\nDataframe tail:\n", df.tail())


Dataframe tail:
            id  cod_sat                cena_id                      nome_arq  \
1261  7097709        8  LC82210672017189LGN00  LC82210672017189LGN00.tar.gz   
1262  7097583        8  LC82210672017189LGN00  LC82210672017189LGN00.tar.gz   
1263  7098332        8  LC82210672017189LGN00  LC82210672017189LGN00.tar.gz   
1264  7097780        8  LC82210672017189LGN00  LC82210672017189LGN00.tar.gz   
1265  7097382        8  LC82210672017189LGN00  LC82210672017189LGN00.tar.gz   

                     data_pas  orb_pto    area_ha  perim versao  \
1261  2017/07/08 00:00:00.000  221_067   2.706982   1140    3.3   
1262  2017/07/08 00:00:00.000  221_067   1.985133    840    3.3   
1263  2017/07/08 00:00:00.000  221_067  24.708948   7440    3.3   
1264  2017/07/08 00:00:00.000  221_067   0.992043    480    3.3   
1265  2017/07/08 00:00:00.000  221_067   0.541506    360    3.3   

                         n_arq_ant  \
1261  LC82210672017173LGN00.tar.gz   
1262  LC82210672017173LGN00.

<b> Drop the columns

In [10]:
df.drop(['id','cod_sat','cena_id', 'nome_arq', 'data_pas', 'orb_pto', 'versao', 'n_arq_ant', 'medianb1', 'data_inser', 'fid_1', 'data_proc','maquina', 'proc_id','valida_web', 'user_id', 'data_valid', 'data_visua', 'visualizac', 'visualizad', 'geometry'], 1, inplace=True)

<b> Print Dataframe tail

In [11]:
print("\nDataframe tail:\n", df.tail())


Dataframe tail:
         area_ha  perim      ndvi      nbrl  dif_ndvi  dif_dnbrl  medianb2  \
1261   2.706982   1140  0.285590  0.045397  0.205554   0.279453  0.102393   
1262   1.985133    840  0.263258  0.050362  0.192967   0.233434  0.113864   
1263  24.708948   7440  0.230180  0.007113  0.236502   0.286259  0.103720   
1264   0.992043    480  0.321351  0.145861  0.211881   0.221723  0.097045   
1265   0.541506    360  0.321921  0.137134  0.165568   0.175336  0.094391   

      medianb3  medianb4  medianb5  medianb6  medianb7  verifica  lim_ndvi  \
1261  0.090342  0.097874  0.179868  0.213479  0.159705         1       0.2   
1262  0.103291  0.113283  0.189169  0.224646  0.173663         1       0.2   
1263  0.091710  0.098897  0.164570  0.193979  0.155794         1       0.2   
1264  0.078498  0.077227  0.142292  0.145304  0.104038         1       0.2   
1265  0.071837  0.065065  0.125804  0.130517  0.097017         1       0.2   

      lim_nbrl  
1261       0.5  
1262       0.5  

<b> Set X and y

In [12]:
X = np.array(df.drop(['verifica'],1))
y = np.array(df['verifica'])

<b> Make the Train and Test

In [13]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

<b> Run the SVM

In [14]:
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

<b>Print Accuracy

In [15]:
accuracy = clf.score(X_test, y_test)

print('Accuracy: ', accuracy)

Accuracy:  0.968503937007874
