In [1]:
import pandas as pd
import numpy as np
import random

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
url_data = pd.read_csv("/home/ian/Desktop/123/df.csv",
                           usecols=['url', 'label'], sep='\t', index_col=None)

In [5]:
#url_data['label'].value_counts()

0    1879621
1     552303
Name: label, dtype: int64

In [None]:
from sklearn.utils import shuffle

In [None]:
url_data = shuffle(url_data)

In [3]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')
    total_Tokens = []
    for i in tkns_BySlash:
        tokens = str(i).split('-')
        tkns_ByDot = []
        for j in range(0, len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')
            tkns_ByDot = tkns_ByDot+temp_Tokens
        total_Tokens = total_Tokens+tokens+tkns_ByDot
    total_Tokens = list(set(total_Tokens))
    if 'com' in total_Tokens:
        total_Tokens.remove('com')
    return total_Tokens

In [4]:
# Label
y = url_data["label"]

In [5]:
# Features
url_list = url_data["url"]

In [6]:
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [7]:
X = vectorizer.fit_transform(url_list)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic

In [9]:
logit = LogisticRegression()
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
print("Accuracy ", logit.score(X_test, y_test))

Accuracy  0.8952373120059213


# f1 score

In [13]:
metrics.f1_score(y_test,logit.predict(X_test))

0.7198812566999259

# Logistic AUC

In [10]:
fpr, tpr, thresholds = metrics.roc_curve(
    y_test, logit.decision_function(X_test), pos_label=1)

In [11]:
metrics.auc(fpr,tpr)

0.9212570006887442

In [13]:
logit.decision_function(X_test)

array([-0.88766988, -5.56868278,  3.84337513, ..., -2.26622087,
        0.22049811, -4.17201592])

# Another way to get Logistic AUC

In [17]:
metrics.roc_auc_score( y_test, logit.decision_function(X_test))

0.9212570006887442

# Logistic Cross_validation

In [None]:
from sklearn.cross_validation import cross_val_score

print(cross_val_score(logit, X, y,cv=10).mean())

# SGD

In [18]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(max_iter=10)
sgd.fit(X_train, y_train)
sgd.score(X_test, y_test)

0.8690954696382496

# SGD AUC

In [20]:
fpr, tpr, thresholds = metrics.roc_curve(
    y_test, sgd.decision_function(X_test), pos_label=1)

In [21]:
metrics.auc(fpr,tpr)

0.8883135344782164

# Another way to get SGD AUC

In [19]:
metrics.roc_auc_score(y_test, sgd.decision_function(X_test))

0.8883135344782164

# SGD Cross_Validation

In [None]:
cross_val_score(sgd, X, y,cv=10).mean()

# SVM

In [None]:
import multiprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

svc =SVC()

svc.fit(X_train,y_train)
svc.score(X_test,y_test)
param_grid = [
    {
        'kernel': ['linear'],
        'C':[0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0]
    }
]

gs = GridSearchCV(estimator=SVC(),
                  param_grid=param_grid,
                  scoring='accuracy',n_jobs=multiprocessing.cpu_count())

gs.fit(X,y)

# Perceptron

In [22]:
from sklearn.linear_model import Perceptron

perc=Perceptron(max_iter=10)
perc.fit(X_train, y_train)
perc.score(X_test, y_test)

0.875754803293687

# Perceptron AUC

In [23]:
fpr, tpr, thresholds = metrics.roc_curve(
    y_test, perc.decision_function(X_test), pos_label=1)

In [24]:
metrics.auc(fpr,tpr)

0.8852826388188981

# Another way to get Perceptron AUC

In [26]:
metrics.roc_auc_score(y_test, perc.decision_function(X_test))

0.8852826388188981

# Perceptron Cross_validation

In [None]:
cross_val_score(perc,X,y,scoring='accuracy',cv=10).mean()

# Multi-layer Perceptron classifier

In [25]:
#from sklearn.neural_network import MLPClassifier

#mlp = MLPClassifier(solver='sgd')
#mlp.fit(X_train, y_train)
#mlp.score(X_test, y_test)

# Predicting

In [None]:
X_predict = [
    "music.163.com",
    "http://www.glszx.gov.cn/",
    "www.youku.com",
    "https://www.xiami.com/",
    "https://www.taobao.com/",
    "http://www.iqiyi.com/",
    "https://www.meituan.com/",
    "https://www.amazon.cn/",
    "https://v.qq.com/",
    "http://www.ctrip.com/",
    "https://exmail.qq.com/cgi-bin/loginpage?t=dm_loginpage&dmtype=bizmail&s=session_timeout",
    "http://www.sztjj.gov.cn/Info_Detail.asp?id=24260",
    ]

In [None]:
X_predict=vectorizer.transform(X_predict)
New_predict=logit.predict(X_predict)
print(New_predict)

In [None]:
New_predict=sgd.predict(X_predict)
print(New_predict)

In [None]:
New_predict=perc.predict(X_predict)
print(New_predict)

In [None]:
from sklearn.externals import joblib

In [None]:
joblib.dump(logit,'/home/ian/Desktop/url.m')

# Multi_Class

In [26]:
from sklearn import datasets

In [27]:
digits = datasets.load_digits()

In [28]:
x = digits['data']
y = digits['target']

In [29]:
x.shape

(1797, 64)

In [30]:
y.shape

(1797,)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [34]:
logit = LogisticRegression()
logit.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
logit.score(x_test, y_test)

0.9611111111111111

In [36]:
 y_predict = logit.predict(x_test) 

In [37]:
y_predict

array([6, 9, 3, 7, 2, 1, 5, 2, 5, 2, 1, 8, 4, 0, 4, 2, 3, 7, 8, 8, 4, 3,
       9, 7, 5, 6, 3, 5, 6, 3, 4, 9, 1, 4, 4, 6, 9, 4, 7, 6, 6, 9, 1, 3,
       6, 1, 3, 0, 6, 5, 5, 1, 9, 5, 6, 0, 9, 0, 0, 1, 0, 4, 5, 2, 4, 5,
       7, 0, 7, 5, 9, 5, 5, 4, 7, 0, 4, 5, 5, 9, 9, 0, 2, 3, 8, 0, 6, 4,
       4, 9, 1, 2, 8, 3, 5, 2, 9, 0, 4, 4, 4, 3, 5, 3, 1, 3, 5, 9, 4, 2,
       7, 7, 4, 4, 1, 9, 2, 7, 8, 7, 2, 6, 9, 4, 0, 7, 2, 7, 5, 8, 7, 5,
       7, 9, 0, 6, 6, 4, 2, 8, 0, 9, 4, 6, 9, 9, 6, 9, 0, 5, 5, 6, 6, 0,
       6, 4, 3, 9, 3, 8, 7, 2, 9, 0, 6, 5, 3, 6, 5, 8, 9, 8, 4, 2, 1, 3,
       7, 7, 2, 2, 3, 9, 8, 0, 3, 2, 2, 5, 6, 9, 9, 4, 1, 2, 4, 2, 3, 6,
       4, 8, 5, 9, 5, 7, 8, 9, 4, 8, 1, 5, 4, 4, 9, 6, 1, 8, 6, 0, 4, 5,
       2, 7, 1, 6, 4, 5, 6, 0, 3, 2, 3, 6, 7, 1, 5, 1, 4, 7, 6, 5, 8, 5,
       5, 1, 5, 2, 8, 8, 9, 8, 7, 6, 2, 2, 2, 3, 4, 8, 8, 3, 6, 0, 9, 7,
       7, 0, 1, 0, 4, 5, 8, 5, 3, 6, 0, 4, 1, 0, 0, 3, 6, 5, 9, 7, 3, 5,
       5, 9, 9, 8, 5, 3, 3, 2, 0, 5, 8, 3, 4, 0, 2,

In [38]:
y_test

array([6, 9, 3, 7, 2, 1, 5, 2, 5, 2, 1, 9, 4, 0, 4, 2, 3, 7, 8, 8, 4, 3,
       9, 7, 5, 6, 3, 5, 6, 3, 4, 9, 1, 4, 4, 6, 9, 4, 7, 6, 6, 9, 1, 3,
       6, 1, 3, 0, 6, 5, 5, 1, 9, 5, 6, 0, 9, 0, 0, 1, 0, 4, 5, 2, 4, 5,
       7, 0, 7, 5, 9, 5, 5, 4, 7, 0, 4, 5, 5, 9, 9, 0, 2, 3, 8, 0, 6, 4,
       4, 9, 1, 2, 8, 3, 5, 2, 9, 0, 4, 4, 4, 3, 5, 3, 1, 3, 5, 9, 4, 2,
       7, 7, 4, 4, 1, 9, 2, 7, 8, 7, 2, 6, 9, 4, 0, 7, 2, 7, 5, 8, 7, 5,
       7, 7, 0, 6, 6, 4, 2, 8, 0, 9, 4, 6, 9, 9, 6, 9, 0, 3, 5, 6, 6, 0,
       6, 4, 3, 9, 3, 9, 7, 2, 9, 0, 4, 5, 3, 6, 5, 9, 9, 8, 4, 2, 1, 3,
       7, 7, 2, 2, 3, 9, 8, 0, 3, 2, 2, 5, 6, 9, 9, 4, 1, 5, 4, 2, 3, 6,
       4, 8, 5, 9, 5, 7, 8, 9, 4, 8, 1, 5, 4, 4, 9, 6, 1, 8, 6, 0, 4, 5,
       2, 7, 4, 6, 4, 5, 6, 0, 3, 2, 3, 6, 7, 1, 5, 1, 4, 7, 6, 8, 8, 5,
       5, 1, 6, 2, 8, 8, 9, 9, 7, 6, 2, 2, 2, 3, 4, 8, 8, 3, 6, 0, 9, 7,
       7, 0, 1, 0, 4, 5, 1, 5, 3, 6, 0, 4, 1, 0, 0, 3, 6, 5, 9, 7, 3, 5,
       5, 9, 9, 8, 5, 3, 3, 2, 0, 5, 8, 3, 4, 0, 2,

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='adam')
mlp.fit(x_train, y_train)
mlp.score(x_test, y_test)


In [None]:
#mlp.predict(x_test)

In [None]:
#y_test

In [None]:
#from sklearn.metrics.pairwise import cosine_similarity

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
...                     hidden_layer_sizes=(5, 2), random_state=1)

# pima

In [27]:
import pandas as pd

In [28]:
data = pd.read_csv('/home/ian/Downloads/pima-indians-diabetes.csv', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [29]:
x=data.iloc[:,0:8]
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [30]:
y=data.iloc[:,8]
y.head()

0    1
1    0
2    1
3    0
4    1
Name: 8, dtype: int64

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
model = Sequential()

model.add(Dense(16, input_dim=8, activation='relu',
                kernel_initializer="uniform"))
model.add(Dense(12, activation='relu', kernel_initializer="uniform"))
model.add(Dense(8, activation='relu', kernel_initializer="uniform"))
model.add(Dense(1, activation='sigmoid', kernel_initializer="uniform"))

In [None]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(x, y,epochs=100,batch_size=768)

#loss='categorical_crossentropy'

In [None]:
scores = model.evaluate(x,y)
scores

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)

In [32]:
logit = LogisticRegression()
logit.fit(x_train, y_train)
logit.score(x_test,y_test)

0.7878787878787878

In [33]:
fpr, tpr, thresholds = metrics.roc_curve(
    y_test, logit.decision_function(x_test), pos_label=1)

metrics.auc(fpr,tpr)

0.8252640845070424

In [34]:
metrics.roc_auc_score(y_test, logit.decision_function(x_test))

0.8252640845070424

# FastText

In [None]:
import fasttext
import pandas as pd

df = pd.read_csv('./Desktop/123/df.csv',
                 usecols=['description', 'label'], index_col=None)
df['Label'] = df.label.map({1: '__label__1', 0: '__label__0'})
df['df'] = df['Label']+df['description']

test = df.df.iloc[0:250000]
train = df.df.iloc[250000:2431924]

test.to_csv('./Desktop/test.txt', index=None, header=None)
train.to_csv('./Desktop/train.txt', index=None, header=None)

clf = fasttext.supervised('./Desktop/train.txt',
                          'model', label_prefix='__label__')

result =clf.test('./Desktop/test.txt')

result.precision

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('./Desktop/123/df.csv',
                 usecols=['description', 'label'], index_col=None)[['label', 'description']]
df.head()

In [None]:
df.label = df.label.map({1: '__label__1', 0: '__label__0'})
df.head()

# XGboost

In [None]:
import xgboost as xgb

In [None]:
dtrain = xgb.DMatrix(X_train,label=y_train)

In [None]:
dtest = xgb.DMatrix(X_test)

In [None]:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric':'auc',
          'max_depth': 4,
          'lambda': 10,
          'subsample': 0.75,
          'colsample_bytree': 0.75,
          'min_child_weight': 2,
          'eta': 0.025,
          'seed': 0,
          'nthread': 8,
          'silent': 0}

In [None]:
watchlist = [(dtrain, 'train')]

In [None]:
bst = xgb.train(params,dtrain, num_boost_round=200, evals=watchlist)#+early_stopping_rounds=100

In [None]:
ypred = bst.predict(dtest)

In [None]:
y_pred = (ypred >= 0.5)*1

In [None]:
from sklearn import metrics
metrics.precision_score(y_test, y_pred)

In [None]:
X_predict = [
    "music.163.com",
    "http://www.glszx.gov.cn/",
    "www.youku.com",
    "https://www.xiami.com/",
    "https://www.taobao.com/",
    "http://www.iqiyi.com/",
    "https://www.meituan.com/",
    "https://www.amazon.cn/",
    "https://v.qq.com/",
    "http://www.ctrip.com/",
    "https://exmail.qq.com/cgi-bin/loginpage?t=dm_loginpage&dmtype=bizmail&s=session_timeout",
    "http://www.sztjj.gov.cn/Info_Detail.asp?id=24260",
    ]

X_predict=vectorizer.transform(X_predict)

dpredict = xgb.DMatrix(X_predict)

In [None]:
ypred = bst.predict(dpredict)
y_pred = (ypred >= 0.5)*1
y_pred