In [1]:
#  ▄▄▄▄    ▄▄▄       ███▄    █  ▄████▄   ▒█████   ██▓     ▒█████   ███▄ ▄███▓ ▄▄▄▄    ██▓ ▄▄▄      
# ▓█████▄ ▒████▄     ██ ▀█   █ ▒██▀ ▀█  ▒██▒  ██▒▓██▒    ▒██▒  ██▒▓██▒▀█▀ ██▒▓█████▄ ▓██▒▒████▄    
# ▒██▒ ▄██▒██  ▀█▄  ▓██  ▀█ ██▒▒▓█    ▄ ▒██░  ██▒▒██░    ▒██░  ██▒▓██    ▓██░▒██▒ ▄██▒██▒▒██  ▀█▄  
# ▒██░█▀  ░██▄▄▄▄██ ▓██▒  ▐▌██▒▒▓▓▄ ▄██▒▒██   ██░▒██░    ▒██   ██░▒██    ▒██ ▒██░█▀  ░██░░██▄▄▄▄██ 
# ░▓█  ▀█▓ ▓█   ▓██▒▒██░   ▓██░▒ ▓███▀ ░░ ████▓▒░░██████▒░ ████▓▒░▒██▒   ░██▒░▓█  ▀█▓░██░ ▓█   ▓██▒
# ░▒▓███▀▒ ▒▒   ▓▒█░░ ▒░   ▒ ▒ ░ ░▒ ▒  ░░ ▒░▒░▒░ ░ ▒░▓  ░░ ▒░▒░▒░ ░ ▒░   ░  ░░▒▓███▀▒░▓   ▒▒   ▓▒█░
# ▒░▒   ░   ▒   ▒▒ ░░ ░░   ░ ▒░  ░  ▒     ░ ▒ ▒░ ░ ░ ▒  ░  ░ ▒ ▒░ ░  ░      ░▒░▒   ░  ▒ ░  ▒   ▒▒ ░
#  ░    ░   ░   ▒      ░   ░ ░ ░        ░ ░ ░ ▒    ░ ░   ░ ░ ░ ▒  ░      ░    ░    ░  ▒ ░  ░   ▒   
#  ░            ░  ░         ░ ░ ░          ░ ░      ░  ░    ░ ░         ░    ░       ░        ░  ░
#       ░                      ░                                                   ░               

In [2]:
import pandas as pd

In [3]:
# Calculate the number of rows in the file
num_rows = sum(1 for line in open('/Users/alejandro/Desktop/Life/Apps/Card-Fraud-Detection/cfd/dataset.csv')) - 1  # subtracting 1 to exclude the header
# Read 10% of the rows, skipping 90% of them
skip_count = int(0.9 * num_rows)
df = pd.read_csv('/Users/alejandro/Desktop/Life/Apps/Card-Fraud-Detection/cfd/dataset.csv', encoding='utf-8', skiprows=lambda i: i > 0 and i <= skip_count)
# Printing the first few rows.
print(df.head())

                               URL
0  https://fashostac.com/index.php
1   https://fasi.ci/Drive/onedrive
2   https://faslxddfsw.duckdns.org
3     https://fasmr.ro/.tmb/leader
4   https://faso-sante.com/secured


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Tokenizer
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
# Stemmer
# stemmer = SnowballStemmer('english')
stemmer = PorterStemmer()
# CountVectorizer
cv = CountVectorizer(max_features=1000)

In [5]:
# Ensure all values in 'URL' column are strings.
df['URL'] = df['URL'].fillna('').astype(str)

In [6]:
# Extracting features from the URL
def extract_features(df):
    df['url_length'] = df['URL'].apply(len)
    df['num_subdomains'] = df['URL'].apply(lambda x: x.count('.'))
    df['num_special_chars'] = df['URL'].apply(lambda x: sum([1 for char in x if not char.isalnum()]))
    # Add other relevant features
    return df

In [7]:
df = extract_features(df)
df.head()

Unnamed: 0,URL,url_length,num_subdomains,num_special_chars
0,https://fashostac.com/index.php,31,2,6
1,https://fasi.ci/Drive/onedrive,30,1,6
2,https://faslxddfsw.duckdns.org,30,2,5
3,https://fasmr.ro/.tmb/leader,28,2,7
4,https://faso-sante.com/secured,30,1,6


In [8]:
def prepare_data(X):
    # Tokenize the text.
    X['text_tokenized'] = X['URL'].map(lambda t: tokenizer.tokenize(t)) 
    # Stem the text.
    X['text_stemmed'] = X['text_tokenized'].map(lambda t: [stemmer.stem(word) for word in t])
    # Join the text.
    X['text_sent'] = X['text_stemmed'].map(lambda t: ' '.join(t))
    # Vectorize the text.
    features = cv.fit_transform(X['text_sent'])
    return X, features

In [9]:
X, features = prepare_data(df)
print(X.head())

                               URL  url_length  num_subdomains  \
0  https://fashostac.com/index.php          31               2   
1   https://fasi.ci/Drive/onedrive          30               1   
2   https://faslxddfsw.duckdns.org          30               2   
3     https://fasmr.ro/.tmb/leader          28               2   
4   https://faso-sante.com/secured          30               1   

   num_special_chars                       text_tokenized  \
0                  6  [https, fashostac, com, index, php]   
1                  6   [https, fasi, ci, Drive, onedrive]   
2                  5    [https, faslxddfsw, duckdns, org]   
3                  7      [https, fasmr, ro, tmb, leader]   
4                  6   [https, faso, sante, com, secured]   

                         text_stemmed                     text_sent  
0  [http, fashostac, com, index, php]  http fashostac com index php  
1      [http, fasi, ci, drive, onedr]      http fasi ci drive onedr  
2     [http, faslxddfsw, d

In [10]:
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test = train_test_split(features, test_size=.9, random_state=42)

In [11]:
X_train.shape

(40981, 1000)

In [12]:
from sklearn.svm import OneClassSVM

# Train a One-Class SVM
clf = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.01)
clf.fit(X_train)

In [13]:
# Predict on the test set
y_test_pred = clf.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model
print(classification_report(y_true, y_test_pred))
print(confusion_matrix(y_true, y_test_pred))

NameError: name 'classification_report' is not defined