In [1]:
import pandas as pd
import pickle

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('../datasets/malicious.csv')

benign_subset = df[df['type'] == 'benign']
malicious_subset = df[(df['type'] == 'malware') | (df['type'] == 'defacement')]
benign_subset['type'] = benign_subset['type'].replace({'benign': 'good'})

df = pd.concat([benign_subset, malicious_subset])

df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_subset['type'] = benign_subset['type'].replace({'benign': 'good'})


Unnamed: 0,url,type
1,mp3raid.com/music/krizz_kaliko.html,good
2,bopsecrets.org/rexroth/cr/1.htm,good
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,good
6,espn.go.com/nba/player/_/id/3457/brandon-rush,good
7,yourbittorrent.com/?q=anthony-hamilton-soulife,good


In [3]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

df['text_tokenized'] = df.url.map(lambda t: tokenizer.tokenize(t))

In [4]:
stemmer = SnowballStemmer('english')

df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word = word) for word in l])

In [5]:
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))

df.head(5)

Unnamed: 0,url,type,text_tokenized,text_stemmed,text_sent
1,mp3raid.com/music/krizz_kaliko.html,good,"[mp, raid, com, music, krizz, kaliko, html]","[mp, raid, com, music, krizz, kaliko, html]",mp raid com music krizz kaliko html
2,bopsecrets.org/rexroth/cr/1.htm,good,"[bopsecrets, org, rexroth, cr, htm]","[bopsecret, org, rexroth, cr, htm]",bopsecret org rexroth cr htm
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,good,"[http, buzzfil, net, m, show, art, ils, etaien...","[http, buzzfil, net, m, show, art, il, etaient...",http buzzfil net m show art il etaient loin de...
6,espn.go.com/nba/player/_/id/3457/brandon-rush,good,"[espn, go, com, nba, player, id, brandon, rush]","[espn, go, com, nba, player, id, brandon, rush]",espn go com nba player id brandon rush
7,yourbittorrent.com/?q=anthony-hamilton-soulife,good,"[yourbittorrent, com, q, anthony, hamilton, so...","[yourbittorr, com, q, anthoni, hamilton, soulif]",yourbittorr com q anthoni hamilton soulif


In [6]:
cv = CountVectorizer()

feature = cv.fit_transform(df.text_sent)

feature[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
trainX, testX, trainY, testY = train_test_split(feature, df.type)

In [8]:
lr = LogisticRegression()

lr.fit(trainX, trainY)

lr.score(testX, testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9972732100058841

In [9]:
mnb = MultinomialNB()

mnb.fit(trainX, trainY)

mnb.score(testX, testY)

0.9744040528710228

In [10]:
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize, stop_words = 'english'), LogisticRegression())

trainX, testX, trainY, testY = train_test_split(df.url, df.type)

pipeline_ls.fit(trainX, trainY)

pipeline_ls.score(testX, testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9974310767950172

In [11]:
pickle.dump(pipeline_ls, open('../models/malware-defacement.pkl', 'wb'))

In [12]:
loaded_model = pickle.load(open('../models/malware-defacement.pkl', 'rb'))

test = ['kaspi.kz', 'avtobys.kz', 'rutracker.ru/mal.exe', 'youtube.com']

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer('english')

tokenized = [tokenizer.tokenize(t) for t in test]
stemmed = [[stemmer.stem(word=word) for word in l] for l in tokenized]
sent = [' '.join(l) for l in stemmed] 

print(loaded_model.predict(sent))

['good' 'good' 'malware' 'good']
