**Remarks**

We will learn more about phishing detection or act of defrauding an online user in order to obtain personal information by posing as a trustworthy institution or entity.

In [2]:
# data preparation
import pandas as pd
import zipfile
with zipfile.ZipFile('../content/phishing.zip', 'r') as z:
  f = z.open('phishing.csv')
  data = pd.read_csv(f, index_col=False)

In [3]:
# show the data
data.head()

Unnamed: 0,url,phishing
0,http://www.subalipack.com/contact/images/sampl...,1
1,http://fasc.maximecapellot-gypsyjazz-ensemble....,1
2,http://theotheragency.com/confirmer/confirmer-...,1
3,http://aaalandscaping.com/components/com_smart...,1
4,http://paypal.com.confirm-key-21107316126168.s...,1


In [4]:
# count data
print("Number of phishing (1) or not (0):\n{}".format(data.phishing.value_counts()))

Number of phishing (1) or not (0):
1    20000
0    20000
Name: phishing, dtype: int64


In [5]:
# create features data
data.url[data.phishing==1].sample(50, random_state=1).tolist()

['http://dothan.com.co/gold/austspark/index.htm\n',
 'http://78.142.63.63/%7Enetsysco/process/fc1d9c7ea4773b7ff90925c2902cb5f2\n',
 'http://verify95.5gbfree.com/coverme2010/\n',
 'http://www.racom.com/uploads/productscat/bookmark/ii.php?.rand=13vqcr8bp0gud&cbcxt=mai&email=abuse@tradinghouse.ca\n',
 'http://www.cleanenergytci.com/components/update.logon.l3an7lofamerica/2342343234532534546347677898765432876543345687656543876/\n',
 'http://209.148.89.163/-/santander.co.uk/weblegn/AccountLogin.php\n',
 'http://senevi.com/confirmation/\n',
 'http://www.hellenkeller.cl/tmp/new/noticias/Modulo_de_Atualizacao_Bradesco/index2.php?id=PSO1AM04L3Q6PSBNVJ82QUCO0L5GBSY2KM2U9BYUEO14HCRDVZEMTRB3DGJO9HPT4ROC4M8HA8LRJD5FCJ27AD0NTSC3A3VDUJQX6XFG519OED4RW6Y8J8VC19EAAAO5UF21CHGHIP7W4AO1GM8ZU4BUBQ6L2UQVARVM\n',
 'http://internet-sicherheit.co/de/konflikt/src%3Dde/AZ00276ZZ75/we%3Dhs_0_2/sicherheit/konto_verifizieren/verifizierung.php\n',
 'http://alen.co/docs/cleaner\n',
 'http://rattanhouse.co/Atualizacao_

We can conclude that **features** from the data above, "https", "login", ".php", ".html", "@", "sign".

In [6]:
keywords = ["https", "login", ".php", ".html", "@", "sign"]

In [7]:
for keyword in keywords:
  data['keyword_' + keyword] = data.url.str.contains(keyword).astype(int)

In [8]:
data['length'] = data.url.str.len() - 2
domain = data.url.str.split('/', expand=True).iloc[:,2]
data['length_domain'] = domain.str.len()

In [9]:
domain.head(5)

0                                   www.subalipack.com
1            fasc.maximecapellot-gypsyjazz-ensemble.nl
2                                   theotheragency.com
3                                   aaalandscaping.com
4    paypal.com.confirm-key-21107316126168.securepp...
Name: 2, dtype: object

In [10]:
data['IPonly'] = (domain.str.replace('.', '') * 1).str.isnumeric().astype(int)
data['count_com'] = data.url.str.count('com')

In [11]:
# retrieve sample data
data.sample(5, random_state=42)

Unnamed: 0,url,phishing,keyword_https,keyword_login,keyword_.php,keyword_.html,keyword_@,keyword_sign,length,length_domain,IPonly,count_com
32823,http://gamvis.blogspot.com/2009_12_20_archive....,0,0,0,0,1,0,0,49,19,0,1
16298,http://oseethiopia.com/components/com_weblinks...,1,0,0,0,0,0,0,92,15,0,3
28505,http://kooba.com/store/sale.html?color=86&styl...,0,0,0,0,1,0,0,49,9,0,1
6689,http://lnk.co/IULO67I67?fregrtuh?rthtyj?cesare...,1,0,0,0,0,1,0,62,6,0,0
26893,http://www.richardsonrfpd.com/Pages/Product-De...,0,0,0,0,0,0,0,72,22,0,1


We build **the model** based on data.

In [12]:
X = data.drop(["url", "phishing"], axis=1)
y = data.phishing

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# prepare model
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
cross_val_score(clf, X, y, cv=10)
# fitting data
clf.fit(X,y)

# saving model
from sklearn.externals import joblib
joblib.dump(clf, '../content/phishing_model.pkl', compress=3)



['../content/phishing_model.pkl']

We create model in **API**.

In [17]:
# install flask
!pip install flask-restplus

In [21]:
from flask import Flask
import werkzeug
werkzeug.cached_property = werkzeug.utils.cached_property
from flask_restplus import Api, Resource, fields
from sklearn.externals import joblib
import pandas as pd

In [23]:
# create API
app = Flask(__name__)

api = Api(
    app,
    version='1.0',
    title='Phishing Prediction API',
    description='Phishing Prediction API'
)

ns = api.namespace('predict',
                   description='Phishing Classifier')

parser = api.parser()

parser.add_argument(
    'URL',
    type=str,
    required=True, 
    help='URL to be analyzed',
    location='args'
)

resource_fields = api.model('Resource', {
    'result': fields.String,
})

In [24]:
# load model to FLASK
clf = joblib.load('../content/phishing_model.pkl')

@ns.route('/')
class PhishingApi(Resource):

  @api.doc(parser=parser)
  @api.marshal_with(resource_fields)
  def get(self):
    args = parser.parse_args()
    result = self.predict_proba(args)
    return result, 200
  
  def predict_proba(self, args):
    url = args["URL"]
    url_ = pd.DataFrame([url], columns=['url'])

    # create features
    keywords = ['https', 'login', '.php', '.html', '@', 'sign']
    for keyword in keywords:
      url_['keyword_' + keyword] = url_.url.str.contains(keyword).astype(int)
    
    url_['lenght'] = url_.url.str.len() - 2
    domain = url_.url.str.split('/', expand=True).iloc[:, 2]
    url_['lenght_domain'] = domain.str.len()
    url_['isIP'] = (url_.url.str.replace('.', '') * 1).str.isnumeric().astype(int)
    url_['count_com'] = url_.url.str.count('com')

    # create prediction
    pred = clf.predict_proba(url_.drop('url', axis=1))[0,1]

    print('url=', url, '| prediction = ', pred)

    return {
        "result " : pred
    }

In [26]:
# running API
app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
