In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    'dataset.csv',
    encoding='ISO_8859_1',
)

In [3]:
# Bare expression: the notebook renders a truncated preview of the full frame.
df

Unnamed: 0,index,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,0,1,1,1,0,0,0,0,0,...,1,1,0,0,0,0,1,1,0,0
1,2,1,1,1,1,1,0,-1,1,0,...,1,1,0,0,-1,0,1,1,1,0
2,3,1,-1,1,1,1,0,0,0,0,...,1,1,1,0,1,0,1,-1,0,0
3,4,1,-1,1,1,1,0,0,0,1,...,1,1,0,0,1,0,1,0,1,0
4,5,1,-1,0,1,1,0,1,1,0,...,0,1,0,0,-1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,11-151,1,0,1,0,1,1,1,1,0,...,0,0,1,1,0,0,1,1,1,1
11051,11-15-1,0,1,1,0,0,0,1,0,0,...,0,1,1,1,1,1,1,0,1,0
11052,11-153,1,0,1,1,1,0,1,0,0,...,1,1,1,1,1,0,1,-1,1,0
11053,11-154,0,0,1,1,1,0,0,0,1,...,0,1,1,1,1,0,1,1,1,0


In [5]:
# Keep only the two columns used below: the model input and the label.
df = df.loc[:, ['index', 'Statistical_report']]

In [6]:
# Preview the reduced two-column frame.
df

Unnamed: 0,index,Statistical_report
0,1,0
1,2,1
2,3,0
3,4,1
4,5,1
...,...,...
11050,11-151,1
11051,11-15-1,1
11052,11-153,1
11053,11-154,1


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [9]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that lower-cases every token of every document.

    Each document is split with ``word_tokenize`` and the lower-cased tokens
    are re-joined with single spaces, so punctuation becomes its own
    space-delimited token (``"Link, http"`` -> ``"link , http"``).
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return normalised copies of the documents; ``X`` is not modified.

        Parameters
        ----------
        X : iterable of str
            Documents to normalise (list, NumPy array, pandas Series, ...).
        y : ignored
        **fit_params : ignored

        Returns
        -------
        list, numpy.ndarray or pandas.Series
            A pandas Series (same index and name) for Series input, an
            object-dtype array for NumPy input, otherwise a list.
        """
        # Iterate over the documents instead of indexing ``X[i]``: positional
        # integers are *labels* on a pandas Series, so ``X[i]`` fails or picks
        # the wrong row when the index is not 0..n-1 (e.g. after a split).
        normalized = [
            ' '.join(token.lower() for token in word_tokenize(document))
            for document in X
        ]
        if isinstance(X, pd.Series):
            return pd.Series(normalized, index=X.index, name=X.name, dtype=object)
        if isinstance(X, np.ndarray):
            # Object dtype on purpose: writing into a fixed-width ``<U`` array
            # silently truncates results that are longer than the inputs.
            return np.array(normalized, dtype=object)
        return normalized

In [10]:
# Stand-alone instance used for the quick check in the next cell.
norm = TextNormalizer()

In [11]:
# Sanity check: tokens are lower-cased and commas become separate tokens.
norm.fit_transform(["Link, http, https, wwww"])

['link , http , https , wwww']

In [12]:
class WordExtractor(BaseEstimator, TransformerMixin):
    """Remove stop words and hapaxes (tokens seen exactly once during ``fit``).

    Parameters
    ----------
    stop_words : iterable of str
        Tokens that are always dropped. Matching is exact and case-sensitive.

    Attributes set by ``fit``
    -------------------------
    general_freq : nltk.FreqDist
        Token counts accumulated over all documents passed to ``fit``.
    hapaxes : list of str
        Tokens whose accumulated count is exactly one.
    """

    def __init__(self, stop_words):
        self.stop_words = stop_words

    def fit(self, X, y=None, **fit_params):
        """Count tokens over the documents in ``X`` and record the hapaxes."""
        self.general_freq = FreqDist()
        for document in X:
            self.general_freq.update(word_tokenize(document))
        self.hapaxes = self.general_freq.hapaxes()
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the documents with hapaxes and stop words removed.

        ``X`` is not modified. A document whose tokens are all removed becomes
        the empty string.

        Returns
        -------
        list, numpy.ndarray or pandas.Series
            A pandas Series (same index and name) for Series input, an
            object-dtype array for NumPy input, otherwise a list.

        Raises
        ------
        AttributeError
            If called before ``fit`` (``hapaxes`` does not exist yet).
        """
        # Build the lookup once: testing every token against two lists made
        # the original filter quadratic in vocabulary size.
        excluded = set(self.hapaxes).union(self.stop_words)
        # Iterate instead of indexing ``X[i]`` so a pandas Series with a
        # non-0..n-1 index is handled correctly.
        filtered = [
            ' '.join(token for token in word_tokenize(document)
                     if token not in excluded)
            for document in X
        ]
        if isinstance(X, pd.Series):
            return pd.Series(filtered, index=X.index, name=X.name, dtype=object)
        if isinstance(X, np.ndarray):
            # Object dtype keeps the result independent of the input's
            # fixed string width.
            return np.array(filtered, dtype=object)
        return filtered

In [13]:
# English stop-word list from NLTK; handed to WordExtractor below.
# NOTE(review): no nltk.download(...) call is visible in this notebook, so the
# 'stopwords' corpus presumably has to be installed beforehand -- confirm.
stop_words = stopwords.words("english")

In [14]:
# Stand-alone instance used for the toy-corpus check below.
word_extractor = WordExtractor(stop_words)

In [15]:
# Three short sentences used as a toy corpus.
corpus = [
    'Children post a link',
    'I copied the link',
    'Link pasted into browser',
]

In [16]:
# Only 'link' survives: every other token is a stop word or occurs once.
# Matching is case-sensitive, so 'Link' counts as a different token.
word_extractor.fit_transform(corpus)

['link', 'link', '']

In [17]:
class ApplyStemmer(BaseEstimator, TransformerMixin):
    """Stem every token of every document with the supplied stemmer.

    Parameters
    ----------
    stemmer : object
        Any object exposing ``stem(token) -> str``.
    """

    def __init__(self, stemmer):
        self.stemmer = stemmer

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return stemmed copies of the documents; ``X`` is not modified.

        Each document is split with ``word_tokenize``, every token is passed
        through ``self.stemmer.stem`` and the results are re-joined with
        single spaces.

        Returns
        -------
        list, numpy.ndarray or pandas.Series
            A pandas Series (same index and name) for Series input, an
            object-dtype array for NumPy input, otherwise a list.
        """
        # Iterate instead of indexing ``X[i]`` so a pandas Series with a
        # non-0..n-1 index is handled correctly.
        stemmed = [
            ' '.join(self.stemmer.stem(token) for token in word_tokenize(document))
            for document in X
        ]
        if isinstance(X, pd.Series):
            return pd.Series(stemmed, index=X.index, name=X.name, dtype=object)
        if isinstance(X, np.ndarray):
            # Object dtype: tokenisation adds spaces around punctuation, and a
            # fixed-width ``<U`` array would silently truncate longer results.
            return np.array(stemmed, dtype=object)
        return stemmed

In [21]:
# NLTK Porter stemmer used for the stand-alone check below.
porter_stemmer = PorterStemmer()

In [22]:
# Wrap the stemmer in the transformer defined above.
apply_stemmer = ApplyStemmer(porter_stemmer)

In [23]:
# Sanity check: the output below shows stemmed, lower-cased tokens
# ('copied' -> 'copi', 'pasted' -> 'past').
apply_stemmer.fit_transform(['Children post a link, I copied the link, Link pasted into browser!'])

['children post a link , i copi the link , link past into browser !']

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
# Text clean-up -> TF-IDF features -> logistic-regression classifier.
pipe = Pipeline(
    steps=[
        ("norm", TextNormalizer()),
        ("extractor", WordExtractor(stop_words)),
        ("stemmer", ApplyStemmer(PorterStemmer())),
        ("vectorizer", TfidfVectorizer()),
        ("logic", LogisticRegression(max_iter=1000)),
    ],
)

In [27]:
# Model input and labels as raw arrays.
# NOTE(review): the text fed to the pipeline is the column literally named
# 'index' -- confirm this is the intended feature.
X, y = (
    df['index'].values,
    df['Statistical_report'].values,
)

In [28]:
# Fixed seed so the split -- and therefore the accuracy reported below -- is
# the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
# Fit every pipeline step on the training split only.
pipe.fit(X_train, y_train)

In [30]:
# Predict labels for the held-out split.
y_pred = pipe.predict(X_test)

In [31]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the conventional order keeps this cell safe to adapt
# to asymmetric metrics such as precision or recall.
accuracy_score(y_test, y_pred)

0.8581765557163531

In [32]:
import pickle

In [33]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed; the bare open() previously leaked it.
# NOTE: unpickling requires TextNormalizer, WordExtractor and ApplyStemmer to
# be importable under the same module path they had when pickled.
with open("nlp_pipe.pkl", 'wb') as pipeline_file:
    pickle.dump(pipe, pipeline_file)