In [152]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold #Кросс-валидация
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import csv

### Обработка данных

In [144]:
# Load the raw SQL-injection dataset, derive a binary label and split it.
#
# Labelling rule (unchanged from the previous version of this cell):
#   * a row whose second CSV column is exactly '1' is a positive (label 1);
#   * any other row (missing/garbled second column) is labelled by position:
#     the first N_HEAD_ROWS_DEFAULT_POSITIVE data rows become 1, the rest 0.
# NOTE(review): the positional rule presumably relies on how sqli.csv happens
# to be ordered -- confirm against the dataset before reusing on another file.
N_HEAD_ROWS_DEFAULT_POSITIVE = 1129

sentences = []
has_explicit_positive = []
with open('sqli.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    next(reader, None)  # skip the header row
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise.
            continue
        sentences.append(row[0])
        # Explicit length check instead of a bare `except:` around row[1].
        has_explicit_positive.append(len(row) > 1 and row[1] == '1')

row_position = np.arange(len(sentences))
labels = (
    np.asarray(has_explicit_positive, dtype=bool)
    | (row_position < N_HEAD_ROWS_DEFAULT_POSITIVE)
).astype(int)

# Only the Label column is derived; Sentence text is never rewritten, even if
# a query literally equals the string 'NaN'.
data = pd.DataFrame({'Sentence': sentences, 'Label': labels})
assert set(data['Label'].unique()) <= {0, 1}, "labels must be binary"

y = data['Label']
X = data['Sentence']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Replace bare '?' training sentences with a benign phrase.
# NOTE(review): the reason for this substitution is not documented, and it is
# applied to the training split only (X_test is left as is) -- TODO confirm.
X_train = X_train.mask(X_train == '?', 'Hello World')

### Метод SVM

In [142]:
# Hyper-parameter search for the linear SVM regularisation strength C.
#
# The TF-IDF matrix is stored under its own name so that the raw text in
# X_train / X_test stays untouched: this cell can be re-run safely and later
# cells can still vectorise the original text after Restart & Run All.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Parameter grid to search: C = 10^-5 ... 10^5
grid = {'C': np.power(10.0, np.arange(-5, 6))}

# 5-fold cross-validation with shuffling (fixed seed for reproducibility)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Fit one linear SVC per candidate C and score each by accuracy
clf = SVC(kernel='linear', random_state=0)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(X_train_tfidf, y_train)
gs.cv_results_



{'mean_fit_time': array([0.28544722, 0.2830142 , 0.31770768, 0.36191635, 0.30815916,
        0.27179151, 0.27535672, 0.2846642 , 0.30900798, 0.29112082,
        0.29640036]),
 'std_fit_time': array([0.01179549, 0.00653197, 0.02894554, 0.00959683, 0.01480573,
        0.00683223, 0.02553393, 0.07854771, 0.11379821, 0.07997918,
        0.09766268]),
 'mean_score_time': array([0.05980897, 0.06121163, 0.06243014, 0.07401166, 0.06209064,
        0.05300856, 0.05077157, 0.05668082, 0.04621453, 0.04800191,
        0.04659925]),
 'std_score_time': array([0.00661104, 0.00331354, 0.00204604, 0.00275555, 0.01015962,
        0.00177086, 0.0038078 , 0.0333547 , 0.01774084, 0.01888933,
        0.01889677]),
 'param_C': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                    1000.0, 10000.0, 100000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False],
        fill_value='?',
             dtype=object)

In [145]:
# CONCLUSION: C=10 gives the best cross-validated accuracy over the 5 folds.

# Vectorise only while X_train is still raw text (a pandas Series). If an
# earlier cell already replaced X_train / X_test with TF-IDF matrices, reuse
# them together with the `vectorizer` fitted there; calling fit_transform on
# an already-transformed sparse matrix would fail.
if isinstance(X_train, pd.Series):
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

clf = SVC(kernel='linear', random_state=0, C=10)  # the default is C=1.0
clf.fit(X_train, y_train)

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; fall back to it only on old versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary term holding the absolute value of its weight.
# coef_ is sparse here because the model was fitted on a sparse matrix, so
# toarray() converts it to a dense array; the absolute value is taken because
# with two classes only the magnitude of the weight matters for ranking.
df = pd.DataFrame(np.transpose(abs(clf.coef_.toarray())),
                  index=np.asarray(feature_names),
                  columns=["col"])


### Вывод слов с наибольшим абсолютным весом (30 самых значимых признаков линейного SVM)

In [149]:
# Rank terms by absolute weight, largest first (ascending sort, then reversed).
df_sort = df.sort_values(by='col').iloc[::-1]

# Show the 30 heaviest terms, listed alphabetically for easier reading.
top_terms = df_sort.head(30)
display(top_terms.sort_index())

Unnamed: 0,col
0x730065006c0065006300740020004000400076006500,1.628849
0x770061006900740066006f0072002000640065006c00,1.629032
10000000,2.630686
1s,1.628964
21,1.628968
26,1.62891
28,1.629083
admin,1.628864
delete,1.628926
distinct,1.628852


### Вывод и предсказание
    Accuracy (доля верно угаданных) — sklearn.metrics.accuracy_score
    
    Precision (точность) — sklearn.metrics.precision_score
    
    Recall (полнота) — sklearn.metrics.recall_score
    
    F-мера — sklearn.metrics.f1_score

In [160]:
# Evaluate the fitted classifier on the held-out split. Predictions and
# decision scores are computed once and shared by all metrics instead of
# calling clf.predict(X_test) again for every line.
y_pred = clf.predict(X_test)
y_score = clf.decision_function(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Mean accuracy; this is the value clf.score(X_test, y_test) reports
# (about 0.97 in the recorded run).
print(accuracy)

print('Доля верно угаданных: ', accuracy)
print('F-мера: ', f1_score(y_test, y_pred))
print('Точность: ', precision_score(y_test, y_pred))
print('Полнота: ', recall_score(y_test, y_pred))
print('ROC-AUC: ', roc_auc_score(y_test, y_score))

# Smoke test 1: a SQL statement.
X_test2 = vectorizer.transform(
    ["SELECT `name`, `status`, `books` FROM `members` WHERE name = 'Demo' AND password ='111'"]
)
print(clf.predict(X_test2))

# Smoke test 2: ordinary English prose.
X_test3 = vectorizer.transform(
    ['Let me introduce myself. My name is Mariya I am a 20-year-old student from Donetsk.']
)
print(clf.predict(X_test3))

0.9704198473282443
Доля верно угаданных:  0.9704198473282443
F-мера:  0.9460869565217391
Точность:  0.9963369963369964
Полнота:  0.9006622516556292
ROC-AUC:  0.9937281394812066
[1]
[0]
