In [1]:
#  ▄▄▄▄    ▄▄▄       ███▄    █  ▄████▄   ▒█████   ██▓     ▒█████   ███▄ ▄███▓ ▄▄▄▄    ██▓ ▄▄▄      
# ▓█████▄ ▒████▄     ██ ▀█   █ ▒██▀ ▀█  ▒██▒  ██▒▓██▒    ▒██▒  ██▒▓██▒▀█▀ ██▒▓█████▄ ▓██▒▒████▄    
# ▒██▒ ▄██▒██  ▀█▄  ▓██  ▀█ ██▒▒▓█    ▄ ▒██░  ██▒▒██░    ▒██░  ██▒▓██    ▓██░▒██▒ ▄██▒██▒▒██  ▀█▄  
# ▒██░█▀  ░██▄▄▄▄██ ▓██▒  ▐▌██▒▒▓▓▄ ▄██▒▒██   ██░▒██░    ▒██   ██░▒██    ▒██ ▒██░█▀  ░██░░██▄▄▄▄██ 
# ░▓█  ▀█▓ ▓█   ▓██▒▒██░   ▓██░▒ ▓███▀ ░░ ████▓▒░░██████▒░ ████▓▒░▒██▒   ░██▒░▓█  ▀█▓░██░ ▓█   ▓██▒
# ░▒▓███▀▒ ▒▒   ▓▒█░░ ▒░   ▒ ▒ ░ ░▒ ▒  ░░ ▒░▒░▒░ ░ ▒░▓  ░░ ▒░▒░▒░ ░ ▒░   ░  ░░▒▓███▀▒░▓   ▒▒   ▓▒█░
# ▒░▒   ░   ▒   ▒▒ ░░ ░░   ░ ▒░  ░  ▒     ░ ▒ ▒░ ░ ░ ▒  ░  ░ ▒ ▒░ ░  ░      ░▒░▒   ░  ▒ ░  ▒   ▒▒ ░
#  ░    ░   ░   ▒      ░   ░ ░ ░        ░ ░ ░ ▒    ░ ░   ░ ░ ░ ▒  ░      ░    ░    ░  ▒ ░  ░   ▒   
#  ░            ░  ░         ░ ░ ░          ░ ░      ░  ░    ░ ░         ░    ░       ░        ░  ░
#       ░                      ░                                                   ░               

In [2]:
import pandas as pd

In [3]:
# Location of the raw URL dataset, defined once so the row count and the load
# below always refer to the same file.
# NOTE(review): this is a machine-specific absolute path; consider a path
# relative to a configurable data directory.
DATASET_PATH = '/Users/alejandro/Desktop/Life/Apps/Card-Fraud-Detection/cfd/dataset.csv'

# Count the data rows (physical lines minus the header line). The context
# manager closes the file handle as soon as counting is finished.
with open(DATASET_PATH, encoding='utf-8') as dataset_file:
    num_rows = sum(1 for _ in dataset_file) - 1

# Keep roughly the last 10% of the file: line 0 (the header) is always read and
# lines 1..skip_count are skipped. This is a contiguous tail of the file, not a
# random sample.
skip_count = int(0.9 * num_rows)
df = pd.read_csv(
    DATASET_PATH,
    encoding='utf-8',
    skiprows=lambda i: 0 < i <= skip_count,
)
# Print the first few rows as a quick sanity check.
print(df.head())

                               URL
0  https://fashostac.com/index.php
1   https://fasi.ci/Drive/onedrive
2   https://faslxddfsw.duckdns.org
3     https://fasmr.ro/.tmb/leader
4   https://faso-sante.com/secured


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Bag-of-words vectoriser, configured with max_features=1000.
cv = CountVectorizer(max_features=1000)
# Porter stemmer used to reduce tokens to their stems.
stemmer = PorterStemmer()
# Tokeniser whose pattern matches runs of ASCII letters only.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [5]:
# Normalise the 'URL' column in two steps: missing entries become the empty
# string, then every value is cast to str.
df['URL'] = df['URL'].fillna('')
df['URL'] = df['URL'].astype(str)

In [6]:
# Extracting features from the URL
def extract_features(df):
    """Add simple lexical features computed from the 'URL' column.

    The frame is modified in place and is also returned. Columns written:

    - 'url_length': number of characters in the URL.
    - 'num_subdomains': number of '.' characters anywhere in the URL string.
    - 'num_special_chars': number of characters for which str.isalnum() is
      false.
    """
    def count_dots(url):
        return url.count('.')

    def count_non_alnum(url):
        return sum(1 for ch in url if not ch.isalnum())

    urls = df['URL']
    df['url_length'] = urls.apply(len)
    df['num_subdomains'] = urls.apply(count_dots)
    df['num_special_chars'] = urls.apply(count_non_alnum)
    return df

In [7]:
# Add the basic lexical feature columns (extract_features writes them onto the
# frame and returns it), then preview the first rows as the cell output.
df = extract_features(df)
df.head()

Unnamed: 0,URL,url_length,num_subdomains,num_special_chars
0,https://fashostac.com/index.php,31,2,6
1,https://fasi.ci/Drive/onedrive,30,1,6
2,https://faslxddfsw.duckdns.org,30,2,5
3,https://fasmr.ro/.tmb/leader,28,2,7
4,https://faso-sante.com/secured,30,1,6


In [8]:
# Extract features from URL
def using_ip(url):
    """Return 1 if the URL's host is a literal IP address, otherwise 0.

    The host is taken from the authority part of the URL (after an optional
    scheme and '//', before the first '/', '?' or '#'), with any 'user@'
    prefix and ':port' suffix removed. Both IPv4 and bracketed IPv6 literals
    are recognised. A host that merely contains digits (for example
    'web2print.example.com') is not treated as an IP address.

    Obfuscated forms such as hexadecimal or single-integer hosts are not
    detected.
    """
    import ipaddress
    import re

    # Strip a leading 'scheme://' or protocol-relative '//' only; a '//' that
    # appears later in the URL must not change which part is the host.
    rest = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', '', url, count=1)
    authority = re.split(r'[/?#]', rest, maxsplit=1)[0]
    host = authority.rsplit('@', 1)[-1]
    if host.startswith('['):
        # Bracketed IPv6 literal, e.g. '[::1]:8080'.
        host = host[1:].partition(']')[0]
    else:
        host = host.partition(':')[0]
    try:
        ipaddress.ip_address(host)
    except ValueError:
        return 0
    return 1

def long_url(url):
    """Bucket a URL by its length.

    Returns 1 for fewer than 54 characters, 2 for 54 to 74 characters and 0
    for 75 characters or more.
    """
    length = len(url)
    if length < 54:
        return 1
    if length < 75:
        return 2
    return 0

def short_url(url):
    """Return 1 if the URL's host is a known URL-shortening service, else 0.

    The check is made against the host only (case-insensitively), and a host
    matches when it equals a shortener domain or is a subdomain of one. A
    plain substring test is not enough: 't.co' is a substring of
    'microsoft.com', for example.
    """
    import re

    shorteners = ('bit.ly', 'goo.gl', 'tinyurl.com', 't.co')

    # Isolate the host: drop a leading scheme or '//', cut at the first
    # '/', '?' or '#', then remove any 'user@' prefix and ':port' suffix.
    rest = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', '', url, count=1)
    authority = re.split(r'[/?#]', rest, maxsplit=1)[0]
    host = authority.rsplit('@', 1)[-1].split(':', 1)[0].lower().rstrip('.')

    for shortener in shorteners:
        if host == shortener or host.endswith('.' + shortener):
            return 1
    return 0

def symbol_at(url):
    """Return 1 when the URL contains an '@' character, otherwise 0."""
    return int('@' in url)

def redirecting(url):
    """Return 1 if '//' occurs anywhere after the URL's leading scheme, else 0.

    Only the leading 'scheme://' (or a protocol-relative '//') is removed
    before the check. Splitting on every '://' would discard everything up to
    the last one, so an embedded URL such as 'http://a.com//http://evil.com'
    would go unnoticed.
    """
    import re

    rest = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', '', url, count=1)
    return 1 if '//' in rest else 0

def https(url):
    """Classify the URL by its scheme.

    Returns 1 when the URL starts with 'https://', 2 when it starts with
    'http://' and 0 otherwise. The comparison ignores case and surrounding
    whitespace. Only the beginning of the URL is inspected, so an
    'https://' that appears later (for example inside a query string) does
    not count.
    """
    lowered = url.strip().lower()
    if lowered.startswith('https://'):
        return 1
    if lowered.startswith('http://'):
        return 2
    return 0

def sub_domains(url):
    """Return the number of '.' characters in the URL's host.

    The dot count of the host is used as a proxy for subdomain depth. Dots in
    the path, query or fragment (such as the one in 'index.php') are not
    counted, and neither are any in a 'user@' prefix.
    """
    import re

    # Drop a leading scheme or '//', then keep only the authority part.
    rest = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', '', url, count=1)
    authority = re.split(r'[/?#]', rest, maxsplit=1)[0]
    host = authority.rsplit('@', 1)[-1]
    return host.count('.')

# Build one feature column per helper from the 'URL' column. The mapping is
# walked in insertion order, so the columns are added in the order listed.
url_feature_builders = {
    'UsingIP': using_ip,
    'LongURL': long_url,
    'ShortURL': short_url,
    'Symbol@': symbol_at,
    'Redirecting//': redirecting,
    'HTTPS': https,
    'SubDomains': sub_domains,
}
for column_name, builder in url_feature_builders.items():
    df[column_name] = df['URL'].apply(builder)

In [9]:
def prefix_suffix(url):
    """Return 1 when the host part of the URL contains a '-', otherwise 0.

    The host is the text after the last '://' and before the next '/'.
    """
    after_scheme = url.split('://')[-1]
    host = after_scheme.split('/')[0]
    return int('-' in host)

def domain_reg_len(url):
    """Return 1 when the host part of the URL has at most 6 characters, else 0.

    The host is the text after the last '://' and before the next '/'.
    """
    after_scheme = url.split('://')[-1]
    host = after_scheme.split('/')[0]
    if len(host) <= 6:
        return 1
    return 0

def non_std_port(url):
    """Classify the explicit port in the URL's authority.

    Returns 2 when the URL names port 80 or 443 explicitly, 1 when it names
    any other numeric port and 0 when no port is given.

    The port is read from the authority only and compared as a whole number.
    A substring test for ':80' would also match ':8080' or ':8000', and
    colons in the path, query or credentials would be mistaken for a port.
    """
    import re

    # Isolate 'host[:port]': drop a leading scheme or '//', cut at the first
    # '/', '?' or '#', then remove any 'user:password@' prefix.
    rest = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', '', url, count=1)
    authority = re.split(r'[/?#]', rest, maxsplit=1)[0]
    host_port = authority.rsplit('@', 1)[-1]

    # Anchoring at the end keeps the colons inside a bracketed IPv6 literal
    # from being read as a port separator.
    match = re.search(r':(\d+)$', host_port)
    if match is None:
        return 0
    return 2 if int(match.group(1)) in (80, 443) else 1

def https_domain_url(url):
    """Return 1 when the host part of the URL begins with 'https', else 0.

    The host is the text after the last '://' and before the next '/'.
    """
    after_scheme = url.split('://')[-1]
    host = after_scheme.split('/')[0]
    return int(host.startswith("https"))

def request_url(url):
    """Return 1 when the substring '.js' occurs anywhere in the URL, else 0."""
    if ".js" in url:
        return 1
    return 0

def anchor_url(url):
    """Return 1 when the URL contains a '#' character, otherwise 0."""
    return int('#' in url)

def info_email(url):
    """Return 1 when the substring 'mailto:' occurs in the URL, otherwise 0."""
    if "mailto:" in url:
        return 1
    return 0

def abnormal_url(url):
    """Return 1 when the extracted host occurs in the URL, otherwise 0.

    The host is the text after the last '://' and before the next '/'.
    """
    host = url.split('://')[-1].partition('/')[0]
    # NOTE(review): the host is cut out of `url` itself, so this containment
    # test holds for every input and the function always returns 1.
    if host in url:
        return 1
    return 0

# Build one feature column per helper from the 'URL' column. The mapping is
# walked in insertion order, so the columns are added in the order listed.
url_feature_builders = {
    'PrefixSuffix-': prefix_suffix,
    'DomainRegLen': domain_reg_len,
    'NonStdPort': non_std_port,
    'HTTPSDomainURL': https_domain_url,
    'RequestURL': request_url,
    'AnchorURL': anchor_url,
    'InfoEmail': info_email,
    'AbnormalURL': abnormal_url,
}
for column_name, builder in url_feature_builders.items():
    df[column_name] = df['URL'].apply(builder)

In [10]:
def iframe_redirection(url):
    """Return 1 when the substring 'iframe' occurs in the URL, otherwise 0.

    The match is case-sensitive.
    """
    return int("iframe" in url)

def google_index(url):
    """Return 1 when the substring 'google' occurs in the URL, otherwise 0."""
    if "google" in url:
        return 1
    return 0

def stats_report(url):
    """Return 1 when the URL contains '/stats' or '/report', otherwise 0."""
    markers = ("/stats", "/report")
    return int(any(marker in url for marker in markers))

# Build one feature column per helper from the 'URL' column. The mapping is
# walked in insertion order, so the columns are added in the order listed.
url_feature_builders = {
    'IframeRedirection': iframe_redirection,
    'GoogleIndex': google_index,
    'StatsReport': stats_report,
}
for column_name, builder in url_feature_builders.items():
    df[column_name] = df['URL'].apply(builder)

In [11]:
# Preview the first rows of the frame with the URL-derived feature columns
# added by the cells above.
df.head()

Unnamed: 0,URL,url_length,num_subdomains,num_special_chars,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,HTTPS,...,DomainRegLen,NonStdPort,HTTPSDomainURL,RequestURL,AnchorURL,InfoEmail,AbnormalURL,IframeRedirection,GoogleIndex,StatsReport
0,https://fashostac.com/index.php,31,2,6,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,https://fasi.ci/Drive/onedrive,30,1,6,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,https://faslxddfsw.duckdns.org,30,2,5,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,https://fasmr.ro/.tmb/leader,28,2,7,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,https://faso-sante.com/secured,30,1,6,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [12]:
def prepare_data(X):
    """Add tokenised, stemmed and re-joined text columns built from 'URL'.

    Uses the notebook-level `tokenizer` and `stemmer`. The frame is modified
    in place and also returned, with three new columns:

    - 'text_tokenized': list of tokens produced by `tokenizer.tokenize`.
    - 'text_stemmed': the same tokens passed through `stemmer.stem`.
    - 'text_sent': the stemmed tokens joined with single spaces.

    No vectorisation is performed here; the `cv` vectoriser is not applied.
    """
    def stem_tokens(tokens):
        return [stemmer.stem(token) for token in tokens]

    X['text_tokenized'] = X['URL'].map(tokenizer.tokenize)
    X['text_stemmed'] = X['text_tokenized'].map(stem_tokens)
    X['text_sent'] = X['text_stemmed'].map(' '.join)
    return X

In [13]:
def hash_url(url):
    """Return a stable signed 64-bit integer hash of the URL.

    The value is the first 8 bytes of the SHA-256 digest of the UTF-8 encoded
    text, read as a big-endian signed integer, so it fits an int64 column.

    Python's built-in hash() is not used because for str it is salted per
    interpreter process. The same URL would otherwise map to a different
    number in every kernel session and in any process that later loads a
    model trained on this feature.

    Non-string inputs are converted with str() first.
    """
    import hashlib

    # 'surrogatepass' keeps lone surrogate code points from raising.
    encoded = str(url).encode('utf-8', 'surrogatepass')
    digest = hashlib.sha256(encoded).digest()
    return int.from_bytes(digest[:8], byteorder='big', signed=True)

# Replace every value in the 'URL' column with its hash_url() result. This
# runs on the whole frame, so the original URL strings are no longer available
# on `df` afterwards.
hashed_urls = df['URL'].apply(hash_url)
df['URL'] = hashed_urls

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state is fixed at 42.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [15]:
from sklearn.svm import OneClassSVM

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# One-class SVM with an RBF kernel. 'nu' is an upper bound on the fraction of
# margin errors and a lower bound on the fraction of support vectors.
#
# The features are standardised first. The RBF kernel is distance-based, and
# the columns here range from 0/1 flags to the URL hash (a 64-bit integer), so
# without scaling the largest-magnitude column would dominate the kernel.
# The scaler and the SVM are wrapped in one pipeline so that the same scaling
# is applied at predict time and is saved together with the model.
one_class_svm = make_pipeline(
    StandardScaler(),
    OneClassSVM(kernel='rbf', nu=0.1),
)

# Fit on the training split. No label filtering happens in the cells shown, so
# every row of X_train is treated as belonging to the "normal" class.
one_class_svm.fit(X_train)

In [16]:
# Score the held-out rows with the fitted model.
# NOTE(review): per scikit-learn's OneClassSVM.predict documentation the result
# should be +1 for inliers and -1 for outliers; confirm before using it.
# y_pred is not evaluated in the cells shown here.
y_pred = one_class_svm.predict(X_test)

In [17]:
import joblib

# Save the fitted model to disk with joblib. The path is relative, so the file
# lands under the kernel's current working directory. The call is the cell's
# last expression, so its return value (the list of written file names) is
# displayed as the cell output.
joblib.dump(one_class_svm, 'cfd/app/svm.pkl')

['cfd/app/svm.pkl']