### Parked domain classifier
1. Generate [parked domain sample](parked_domain_classifier.ipynb)
2. Generate features for all URL lists according to the parked domain classifier from [Vissers et. al., 2015](https://github.com/flaiming/Domain-Parking-Sensors)
3. Train classifier (below)
4. Breakdown unreliable parked domains by source list
5. Calculate overlap of classifier with 404 response codes (after running [requests.ipynb](requests.ipynb))

In [1]:
CSV_PARKED = 'url_list/parked_features.csv'
CSV_BENIGN = 'url_list/common_crawl_features.csv'

cc_neg = ['www.eccentricbliss.com','ehavadar.com','raptorweb.com.ar','mediafile.in','www.gz9m.com','h-ero.com','www.mymoviesstore.in','www.bastler-shop.net','scottsdalefineproperties.com','fastperfekt-zum.com','otdyhkafe.ru','ketopialife.com','cmg.net']

In [2]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm
import numpy as np
from catboost import Pool, cv, CatBoostClassifier
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score

clfs = {
    "Boosted DT": CatBoostClassifier(verbose=False, random_state=0),
    "RF": RandomForestRegressor(n_estimators=50, random_state=0),
    # "DT": DecisionTreeClassifier(random_state=0),
    # "MLP": MLPClassifier(random_state=0, max_iter=200, hidden_layer_sizes=(200,200)),
    # "SVM": svm.SVC(kernel='poly'),
}

kf = KFold(n_splits=5, shuffle=True)

negs = pd.read_csv(CSV_BENIGN)
negs['label'] = 0

pos = pd.read_csv(CSV_PARKED)
pos['label'] = 1
for url in cc_neg:
    negs[negs['Website'] == url] = 1

df = pd.concat([negs, pos])

features = df.drop(columns=['class', 'Website'])

_X = features.drop(columns=['label']).to_numpy()
_y = features['label'].to_numpy()

f1s = []
accs = []

for train_index, test_index in kf.split(_X, _y):
    X_train, X_test = _X[train_index], _X[test_index]
    y_train, y_test = _y[train_index], _y[test_index]
    clf = clfs['Boosted DT']

    clf = clf.fit(X_train, y_train)
    scores = clf.predict(X_test)
    f1s.append(f1_score(y_test, np.rint(scores)))
    accs.append(accuracy_score(y_test, np.rint(scores)))
    print(str(np.mean(accs)) + f", F1: {np.mean(f1s)}")


0.9373040752351097, F1: 0.8958333333333334
0.9420062695924765, F1: 0.9029431216931216
0.9456635318704284, F1: 0.9096543888210554
0.9498432601880877, F1: 0.9166157916157915
0.94858934169279, F1: 0.915110815110815


In [12]:
news_urls = pd.read_csv('url_list/news_features.csv')
news_df = news_urls.drop(columns=['class', 'Website'])
news_urls['pred'] = clf.predict(news_df)
news_urls.groupby('pred').describe()

Unnamed: 0_level_0,TP_DataRatio,TP_DataRatio,TP_DataRatio,TP_DataRatio,TP_DataRatio,TP_DataRatio,TP_DataRatio,TP_DataRatio,TP_HtmlRatio,TP_HtmlRatio,...,typoDomain,typoDomain,windowLocation,windowLocation,windowLocation,windowLocation,windowLocation,windowLocation,windowLocation,windowLocation
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
pred,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,2361.0,0.533465,0.336974,0.0,0.228195,0.522089,0.888205,1.0,2361.0,0.609093,...,0.0,0.0,2361.0,0.927573,1.679969,0.0,0.0,1.0,1.0,25.0
1,149.0,0.83207,0.3079,0.0,0.813925,0.992952,1.0,1.0,149.0,0.816956,...,0.0,0.0,149.0,0.42953,0.510101,0.0,0.0,0.0,1.0,2.0


In [13]:
news_sources = pd.read_csv('url_list/final_4.2k_attrs.csv')
news_urls.rename(columns={"Website": "url"}, inplace=True)
clf_df = pd.merge(news_urls[['url', 'pred']], news_sources[['url', 'source', 'backlinks']], on='url', how='inner')
result = clf_df.groupby(['source', 'pred'])['source'].count()
result.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))

source     pred
Blacklist  0       56.701031
           1       43.298969
Buzzfeed   0       53.125000
           1       46.875000
MBFC       0       99.058577
           1        0.941423
MBFC-Q     0       95.057034
           1        4.942966
Politico   0       63.157895
           1       36.842105
Snopes     0       84.375000
           1       15.625000
Name: source, dtype: float64

In [6]:
clf_df.groupby(['10k', 'pred']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,backlinks,backlinks,backlinks,backlinks,backlinks,backlinks,backlinks,backlinks
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
10k,pred,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
False,0,1810.0,12756230.0,74879410.0,10130.0,175042.75,798545.0,4017386.25,1723131000.0
False,1,18.0,6261926.0,23107100.0,19784.0,29126.5,511511.5,1450925.25,98736450.0
True,0,512.0,1317.629,2331.622,0.0,31.0,98.0,1386.5,9980.0
True,1,131.0,667.1069,1525.002,0.0,47.0,124.0,386.0,9984.0


In [8]:
responses = pd.read_csv('./url_list/response_codes.csv')
responses['response'] = 0
for row, label in enumerate(responses['response_code']):
    if label in ['404','exception']:
        responses['response'][row] = 1
    else:
        responses['response'][row] = 0
# responses[responses['response_code'] == '404']['response'] = 1

clf_res_df = pd.merge(responses[['url', 'response']], clf_df, on='url', how='inner')
clf_res_df.groupby(['response', 'pred']).describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  responses['response'][row] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  responses['response'][row] = 1


Unnamed: 0_level_0,Unnamed: 1_level_0,backlinks,backlinks,backlinks,backlinks,backlinks,backlinks,backlinks,backlinks
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
response,pred,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,0,2307.0,9855517.0,66342560.0,0.0,20649.5,342444.0,2213172.0,1723131000.0
0,1,149.0,757060.8,8094611.0,0.0,57.0,156.0,1298.0,98736450.0
1,0,15.0,23518070.0,63017880.0,580.0,503364.0,2024182.0,8458530.5,247382900.0
