In [1]:
import pandas as pd
import pickle

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [41]:
df = pd.read_csv('../datasets/malicious.csv')

benign_subset = df[df['type'] == 'benign'].sample(frac=0.25)
malicious_subset = df[(df['type'] == 'malware') | (df['type'] == 'defacement')]
benign_subset['type'] = benign_subset['type'].replace({'benign': 'good'})

df = pd.concat([benign_subset, malicious_subset])

df.head(5)

Unnamed: 0,url,type
395300,englishclub.com/english-language-history.htm,benign
210875,en.wikipedia.org/wiki/1979_Ottawa_Rough_Riders...,benign
161548,whmc.umsystem.edu/exhibits/ramsay/ramsay_crawf...,benign
86167,spoke.com/info/pC7sG0/JonWolter,benign
43025,acronyms.thefreedictionary.com/South+African+E...,benign


In [42]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

df['text_tokenized'] = df.url.map(lambda t: tokenizer.tokenize(t))

In [43]:
stemmer = SnowballStemmer('english')

df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word = word) for word in l])

In [44]:
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))

df.head(5)

Unnamed: 0,url,type,text_tokenized,text_stemmed,text_sent
395300,englishclub.com/english-language-history.htm,benign,"[englishclub, com, english, language, history,...","[englishclub, com, english, languag, histori, ...",englishclub com english languag histori htm
210875,en.wikipedia.org/wiki/1979_Ottawa_Rough_Riders...,benign,"[en, wikipedia, org, wiki, Ottawa, Rough, Ride...","[en, wikipedia, org, wiki, ottawa, rough, ride...",en wikipedia org wiki ottawa rough rider season
161548,whmc.umsystem.edu/exhibits/ramsay/ramsay_crawf...,benign,"[whmc, umsystem, edu, exhibits, ramsay, ramsay...","[whmc, umsystem, edu, exhibit, ramsay, ramsay,...",whmc umsystem edu exhibit ramsay ramsay crawfo...
86167,spoke.com/info/pC7sG0/JonWolter,benign,"[spoke, com, info, pC, sG, JonWolter]","[spoke, com, info, pc, sg, jonwolt]",spoke com info pc sg jonwolt
43025,acronyms.thefreedictionary.com/South+African+E...,benign,"[acronyms, thefreedictionary, com, South, Afri...","[acronym, thefreedictionari, com, south, afric...",acronym thefreedictionari com south african eq...


In [45]:
cv = CountVectorizer()

feature = cv.fit_transform(df.text_sent)

feature[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [46]:
trainX, testX, trainY, testY = train_test_split(feature, df.type)

In [47]:
lr = LogisticRegression()

lr.fit(trainX, trainY)

lr.score(testX, testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9941865392111998

In [48]:
mnb = MultinomialNB()

mnb.fit(trainX, trainY)

mnb.score(testX, testY)

0.9667124286029051

In [49]:
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize, stop_words = 'english'), LogisticRegression())

trainX, testX, trainY, testY = train_test_split(df.url, df.type)

pipeline_ls.fit(trainX, trainY)

pipeline_ls.score(testX, testY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9941526414806529

In [50]:
pickle.dump(pipeline_ls, open('../models/malware-defacement.pkl', 'wb'))

In [3]:
loaded_model = pickle.load(open('../models/malware-defacement.pkl', 'rb'))

test = ['kaspi.kz', 'avtobys.kz', 'rutracker.ru/mal.exe', 'youtube.com']

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer('english')

tokenized = [tokenizer.tokenize(t) for t in test]
stemmed = [[stemmer.stem(word=word) for word in l] for l in tokenized]
sent = [' '.join(l) for l in stemmed] 

print(loaded_model.predict(sent))

['benign' 'benign' 'malware' 'benign']
