In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import dill

<h1> Retraining the model and integrating it into a pipeline for easy deployment</h1>

In [2]:
# Load the per-class feature tables and label them: 0 for the "bad" table, 1 for the "good" table.
goodFeat = pd.read_csv("good_website_featues18000.csv")
badFeat = pd.read_csv("bad_website_features15000.csv")
badFeat['TargetScore'] = 0
goodFeat['TargetScore'] = 1
X = pd.concat([badFeat,goodFeat],ignore_index=True)
# Shuffle with a fixed seed so the row order (and everything derived from it)
# is reproducible under Restart Kernel -> Run All.
X = X.sample(frac=1, random_state=42).reset_index(drop = True)
# Take the labels after shuffling so they stay aligned with the rows of X.
y = X['TargetScore']

In [3]:
# Preview the first rows of the combined, shuffled table.
X.head()

Unnamed: 0.1,Unnamed: 0,urlLength,urlDepth,isShort,hasIP,hasEmail,protocol,misleadingChars,domain,domainAge,registrar,sslAge,PageRank,@present,//present,URL,TargetScore
0,11165,28,-1,,False,False,https,False,www.pocketoption.com,2918.0,"GoDaddy.com, LLC",330.0,147265.0,0,1,https://www.pocketoption.com,1
1,5312,61,3,,False,False,https,False,facebookpaylinks.serveusers.com,8444.0,PDR Ltd. d/b/a PublicDomainRegistry.com,,,0,3,https://facebookpaylinks.serveusers.com//faceb...,0
2,1501,23,-1,,False,False,https,False,www.24xxx-x.com,672.0,Danesco Trading Ltd.,22.0,88469506.0,0,1,https://www.24xxx-x.com,1
3,6492,28,-1,,False,False,https,False,www.qatarairways.com,9917.0,"GoDaddy.com, LLC",155.0,14624.0,0,1,https://www.qatarairways.com,1
4,4043,22,-1,,False,False,https,False,www.chosun.com,10507.0,Whois Corp.,4.0,39187.0,0,1,https://www.chosun.com,1


In [4]:
# Remove the label, the raw URL and the other columns that are not passed on
# to the train/test split and the preprocessing pipeline.
dropped_columns = [
    'Unnamed: 0', 'URL', 'TargetScore',
    'isShort', 'hasIP', 'hasEmail', 'misleadingChars',
    '//present', '@present',
]
X = X.drop(columns=dropped_columns)

In [5]:
# Hold out a test set (train_test_split's default size). The fixed random_state
# makes the split, and therefore every metric computed from it, reproducible
# across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Custom transformer to fill missing values
class FillMissingValues(BaseEstimator, TransformerMixin):
    """Impute missing values in the website-feature columns.

    ``sslAge`` and ``domainAge`` are filled with means: the ones learned in
    ``fit`` when the column is present and has at least one value, otherwise
    the hard-coded defaults set in ``__init__``. ``PageRank`` is filled with
    10000000, and ``registrar`` / ``domain`` with the empty string.
    """

    def __init__(self):
        self.sslMean = 95.42572488662911 # SSL age mean from the training dataset
        self.domainAgeMean = 4768.3417287504235 # Domain Age mean from the training dataset

    def fit(self, X, y=None):
        """Learn the ``sslAge`` / ``domainAge`` means from ``X``.

        A column that is absent, or whose mean is NaN (no observed values),
        leaves the current fallback value untouched so that ``transform``
        always has a usable fill value.

        Returns:
            self
        """
        if 'sslAge' in X:
            ssl_mean = X['sslAge'].mean()
            if pd.notna(ssl_mean):
                self.sslMean = ssl_mean
        if 'domainAge' in X:
            domain_age_mean = X['domainAge'].mean()
            if pd.notna(domain_age_mean):
                self.domainAgeMean = domain_age_mean
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the missing values filled in.

        Raises:
            KeyError: if any of the five imputed columns is missing from ``X``.
        """
        X = X.copy()
        fill_values = {
            'PageRank': 10000000,
            'registrar': "",
            'sslAge': self.sslMean,
            'domainAge': self.domainAgeMean,
            'domain': "",
        }
        # Assign the filled column back instead of calling
        # ``X[col].fillna(..., inplace=True)``: the in-place form acts on a
        # temporary Series, which pandas 2.x warns about and which silently
        # does nothing once Copy-on-Write is enabled.
        for column, value in fill_values.items():
            X[column] = X[column].fillna(value)
        return X

# Custom transformer for domain extraction
class ExtractDomain(BaseEstimator, TransformerMixin):
    """Stateless transformer that reduces ``domain`` to its last dot-separated part.

    Values with more than two dot-separated parts are replaced by the final
    part (e.g. ``www.chosun.com`` -> ``com``); every other value becomes the
    string ``'None'``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    @staticmethod
    def _last_part(host):
        """Return the last dot-separated part of ``host``, or ``'None'`` when it has at most two parts."""
        pieces = host.split('.')
        if len(pieces) > 2:
            return pieces[-1]
        return 'None'

    def transform(self, X):
        """Return a copy of ``X`` whose ``domain`` column holds the reduced value."""
        result = X.copy()
        result['domain'] = result['domain'].apply(self._last_part)
        return result

# Custom transformer to encode categorical features using the fitted OrdinalEncoder
class OrdinalEncoderWrapper(BaseEstimator, TransformerMixin):
    """Ordinal-encode a chosen subset of DataFrame columns, leaving the rest as they are.

    Categories that were not seen during ``fit`` are encoded as -1.
    """

    def __init__(self, encoded_columns):
        self.encoded_columns = encoded_columns
        # Each wrapper owns its own, initially unfitted, encoder.
        self.encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the selected columns only; returns ``self``."""
        selected = X[self.encoded_columns]
        self.encoder.fit(selected)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by their codes."""
        codes = self.encoder.transform(X[self.encoded_columns])
        encoded_frame = X.copy()
        encoded_frame[self.encoded_columns] = codes
        return encoded_frame

# Columns handed to the ordinal encoder
encoded_columns = ['registrar', 'domain', 'protocol']

# Preprocessing pipeline: impute missing values -> reduce the domain -> ordinal-encode
preprocessing_steps = [
    ('fill_missing', FillMissingValues()),
    ('extract_domain', ExtractDomain()),
    ('encode', OrdinalEncoderWrapper(encoded_columns=encoded_columns)),
]
preprocessing_pipeline = Pipeline(preprocessing_steps)

In [7]:
# Fit the preprocessing steps on the training split only.
preprocessing_pipeline.fit(X_train)

In [8]:
# Replace X_train with its preprocessed form.
# NOTE(review): this rebinds X_train, so re-running the cell would feed
# already-transformed data back into the pipeline.
X_train = preprocessing_pipeline.transform(X_train)

<h1>Using Isolation Forest to remove outliers from the training data</h1>

In [9]:
# Outlier detection using an Isolation Forest
from sklearn.ensemble import IsolationForest
# Work on a copy so the helper columns added later do not touch X_train itself.
X_train_IF = X_train.copy()
# Fixed random_state: the forest is built from random sub-samples and splits,
# so without a seed the set of rows flagged as outliers changes on every run.
model = IsolationForest(random_state=42)
model.fit(X_train_IF)

In [10]:
# Attach the forest's decision score and its predicted label to every training row.
scores = model.decision_function(X_train_IF)
X_train_IF = X_train_IF.assign(scores=scores, anomaly=model.predict(X_train_IF))

# Rows labelled -1 are the ones treated as outliers; remember their index
# labels so the matching entries can be removed from the features and labels.
is_outlier = X_train_IF['anomaly'] == -1
anomaly = X_train_IF[is_outlier]
anomaly_index = list(anomaly.index)
print('Total number of outliers is:', len(anomaly_index))

Total number of outliers is: 3981


In [11]:
# Remove the flagged rows from features and labels by the same index labels,
# then renumber both from 0 so they stay aligned.
X_train = X_train_IF.drop(index=anomaly_index).reset_index(drop=True)
y_train = y_train.drop(index=anomaly_index).reset_index(drop=True)

In [12]:
# The two columns added for outlier detection are not model features; drop them.
X_train = X_train.drop(columns=['scores', 'anomaly'])
X_train.head()

Unnamed: 0,urlLength,urlDepth,protocol,domain,domainAge,registrar,sslAge,PageRank
0,63,2,1.0,103.0,4789.194584,0.0,29.0,10000000.0
1,38,0,1.0,23.0,4789.194584,0.0,31.0,10000000.0
2,24,-1,1.0,124.0,6237.0,493.0,25.0,24449.0
3,35,0,1.0,169.0,4086.0,406.0,229.0,37420719.0
4,23,-1,1.0,230.0,4799.0,406.0,95.377601,33968554.0


In [13]:
# Persist the cleaned training features and labels as CSV files in the working
# directory (the index is written too, as the first column).
X_train.to_csv(path_or_buf="X_train.csv")
y_train.to_csv(path_or_buf="y_train.csv")

In [14]:
# Load the previously saved model (judging by the file name, an XGBoost model) from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files from a trusted source.
xgb = joblib.load("./XGBModel.pkl")

In [28]:
def save_transformer(transformer, filename):
    """Serialize ``transformer`` with dill into the file at ``filename``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, mode='wb') as sink:
        dill.dump(transformer, sink)

In [15]:
# Chain the preprocessing pipeline and the pre-trained model (xgb must already
# be loaded) into a single estimator that accepts raw feature rows.
full_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', xgb),
])
# Serialize the complete pipeline to disk
save_transformer(full_pipeline, filename='full_model_pipeline.pkl')

['full_model_pipeline.pkl']

In [17]:
# Run the raw (not yet preprocessed) held-out rows through preprocessing and model in one call.
predictions = full_pipeline.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# NOTE(review): xgb was loaded already trained and X_test comes from a fresh
# split of the same data, so confirm the model never saw these rows during
# training. Otherwise this score overstates real-world accuracy.
accuracy_score(y_test, predictions)

0.9944071588366891