In [1]:
#  ▄▄▄▄    ▄▄▄       ███▄    █  ▄████▄   ▒█████   ██▓     ▒█████   ███▄ ▄███▓ ▄▄▄▄    ██▓ ▄▄▄      
# ▓█████▄ ▒████▄     ██ ▀█   █ ▒██▀ ▀█  ▒██▒  ██▒▓██▒    ▒██▒  ██▒▓██▒▀█▀ ██▒▓█████▄ ▓██▒▒████▄    
# ▒██▒ ▄██▒██  ▀█▄  ▓██  ▀█ ██▒▒▓█    ▄ ▒██░  ██▒▒██░    ▒██░  ██▒▓██    ▓██░▒██▒ ▄██▒██▒▒██  ▀█▄  
# ▒██░█▀  ░██▄▄▄▄██ ▓██▒  ▐▌██▒▒▓▓▄ ▄██▒▒██   ██░▒██░    ▒██   ██░▒██    ▒██ ▒██░█▀  ░██░░██▄▄▄▄██ 
# ░▓█  ▀█▓ ▓█   ▓██▒▒██░   ▓██░▒ ▓███▀ ░░ ████▓▒░░██████▒░ ████▓▒░▒██▒   ░██▒░▓█  ▀█▓░██░ ▓█   ▓██▒
# ░▒▓███▀▒ ▒▒   ▓▒█░░ ▒░   ▒ ▒ ░ ░▒ ▒  ░░ ▒░▒░▒░ ░ ▒░▓  ░░ ▒░▒░▒░ ░ ▒░   ░  ░░▒▓███▀▒░▓   ▒▒   ▓▒█░
# ▒░▒   ░   ▒   ▒▒ ░░ ░░   ░ ▒░  ░  ▒     ░ ▒ ▒░ ░ ░ ▒  ░  ░ ▒ ▒░ ░  ░      ░▒░▒   ░  ▒ ░  ▒   ▒▒ ░
#  ░    ░   ░   ▒      ░   ░ ░ ░        ░ ░ ░ ▒    ░ ░   ░ ░ ░ ▒  ░      ░    ░    ░  ▒ ░  ░   ▒   
#  ░            ░  ░         ░ ░ ░          ░ ░      ░  ░    ░ ░         ░    ░       ░        ░  ░
#       ░                      ░                                                   ░               

In [2]:
import pandas as pd

In [3]:
import os

# Location of the URL dataset. The previously hard-coded absolute path is kept
# as the default so existing runs behave the same; set the CFD_DATASET_PATH
# environment variable to load a copy stored elsewhere without editing this cell.
DATASET_PATH = os.environ.get(
    'CFD_DATASET_PATH',
    '/Users/alejandro/Desktop/Life/Apps/Card-Fraud-Detection/cfd/dataset.csv',
)

# Read the CSV with an explicit encoding.
df = pd.read_csv(DATASET_PATH, encoding='utf-8')
# Print the first few rows as a quick sanity check of the load.
print(df.head())

                                                 URL
0         ftp://188.128.111.33/IPTV/TV1324/view.html
1                   ftp://188.128.111.33/web/sec.htm
2  ftp://me@createkindlebooks.org:Noobasshole@cre...
3  http://000000000000000000000000000000000000000...
4  http://00000000000000000000000000000000000000d...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Bag-of-words vectorizer; the vocabulary is capped at 1000 features.
cv = CountVectorizer(max_features=1000)
# Porter stemmer used to reduce each token to its stem.
stemmer = PorterStemmer()
# Tokenizer that keeps only runs of ASCII letters (digits and punctuation are dropped).
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')

In [5]:
# Ensure all values in the 'URL' column are strings: missing entries become
# empty strings first, then every value is cast to str. The len()/count()
# calls and the tokenizer applied to this column later require str input.
df['URL'] = df['URL'].fillna('').astype(str)

In [6]:
# Extracting features from the URL
def extract_features(df):
    """Return a copy of ``df`` with simple lexical features of each URL added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'URL'`` column whose values are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus:

        * ``url_length`` - number of characters in the URL.
        * ``num_subdomains`` - number of ``'.'`` characters in the whole URL
          (dots in the path or in an IP address are counted as well).
        * ``num_special_chars`` - number of characters for which
          ``str.isalnum()`` is false.

    Raises
    ------
    KeyError
        If ``df`` has no ``'URL'`` column.
    """
    urls = df['URL']
    # assign() builds a new frame, so the caller's frame is not modified.
    return df.assign(
        url_length=urls.apply(len),
        num_subdomains=urls.apply(lambda url: url.count('.')),
        # Summing booleans counts the non-alphanumeric characters without
        # materialising an intermediate list.
        num_special_chars=urls.apply(
            lambda url: sum(not ch.isalnum() for ch in url)
        ),
    )

In [7]:
# Add the url_length / num_subdomains / num_special_chars columns and rebind df
# to the returned frame.
df = extract_features(df)
# Last expression of the cell: display the first rows, including the new columns.
df.head()

Unnamed: 0,URL,url_length,num_subdomains,num_special_chars
0,ftp://188.128.111.33/IPTV/TV1324/view.html,42,4,10
1,ftp://188.128.111.33/web/sec.htm,32,4,9
2,ftp://me@createkindlebooks.org:Noobasshole@cre...,75,3,10
3,http://000000000000000000000000000000000000000...,52,1,4
4,http://00000000000000000000000000000000000000d...,167,8,28


In [8]:
def prepare_data(X):
    """Tokenize, stem and vectorize the ``'URL'`` column of ``X``.

    Relies on the module-level ``tokenizer``, ``stemmer`` and ``cv`` objects.
    ``cv`` is re-fitted on every call, so its vocabulary always reflects the
    frame passed in most recently.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``'URL'`` column of strings.

    Returns
    -------
    tuple
        ``(X_out, features)`` where ``X_out`` is a copy of ``X`` with three
        extra columns (``text_tokenized``: list of tokens, ``text_stemmed``:
        list of stemmed tokens, ``text_sent``: stems joined by single spaces)
        and ``features`` is the matrix returned by ``cv.fit_transform`` on
        ``text_sent``.

    Raises
    ------
    KeyError
        If ``X`` has no ``'URL'`` column.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    X = X.copy()
    # Tokenize the text.
    X['text_tokenized'] = X['URL'].map(tokenizer.tokenize)
    # Stem the text, token by token.
    X['text_stemmed'] = X['text_tokenized'].map(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )
    # Join the stems back into one space-separated string per URL.
    X['text_sent'] = X['text_stemmed'].map(' '.join)
    # Vectorize the text (fits the vectorizer's vocabulary on this data).
    features = cv.fit_transform(X['text_sent'])
    return X, features

In [9]:
# Tokenize, stem and vectorize the URLs. X is the frame carrying the
# intermediate text columns; features is the matrix from cv.fit_transform.
X, features = prepare_data(df)
# Print the first rows to check the tokenized / stemmed / joined columns.
print(X.head())

                                                 URL  url_length  \
0         ftp://188.128.111.33/IPTV/TV1324/view.html          42   
1                   ftp://188.128.111.33/web/sec.htm          32   
2  ftp://me@createkindlebooks.org:Noobasshole@cre...          75   
3  http://000000000000000000000000000000000000000...          52   
4  http://00000000000000000000000000000000000000d...         167   

   num_subdomains  num_special_chars  \
0               4                 10   
1               4                  9   
2               3                 10   
3               1                  4   
4               8                 28   

                                      text_tokenized  \
0                        [ftp, IPTV, TV, view, html]   
1                               [ftp, web, sec, htm]   
2  [ftp, me, createkindlebooks, org, Noobasshole,...   
3                                        [http, xyz]   
4  [http, dfjjjhv, webhostapp, com, Yahoo, YahooA...   

             

In [10]:
from sklearn.model_selection import train_test_split

# Split the vectorized URL features into train and test partitions.
# NOTE(review): test_size=.9 puts 90% of the rows in X_test and trains on the
# remaining 10% - presumably deliberate to keep model fitting tractable; confirm.
# random_state is fixed so the split is reproducible.
X_train, X_test = train_test_split(features, test_size=.9, random_state=42)

In [11]:
# Display the dimensions of the training matrix as (rows, columns).
X_train.shape

(409819, 1000)

In [12]:
from sklearn.svm import OneClassSVM

# Train a One-Class SVM with an RBF kernel on the training split only;
# fit() is given no labels.
# NOTE(review): nu=0.1 and gamma=0.01 are hard-coded with no stated rationale -
# confirm they were tuned for this dataset.
clf = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.01)
clf.fit(X_train)

In [13]:
# Predict on the test set.
# NOTE(review): OneClassSVM.predict is documented to return +1 for inliers and
# -1 for outliers; any ground-truth labels compared against y_test_pred need
# the same encoding - confirm.
y_test_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model.
# Supervised metrics need ground-truth labels aligned row-for-row with X_test
# and encoded like OneClassSVM.predict output (+1 inlier, -1 outlier). No
# earlier cell in this notebook defines y_true, so look it up defensively
# instead of letting the cell die with a NameError.
y_true_labels = globals().get('y_true')
if y_true_labels is not None:
    print(classification_report(y_true_labels, y_test_pred))
    print(confusion_matrix(y_true_labels, y_test_pred))
else:
    # Without labels, report how the predictions are distributed so the cell
    # still yields something to inspect.
    print('y_true is not defined; supervised metrics skipped.')
    print(pd.Series(y_test_pred).value_counts())