<h2>URL Analysis Project -- Random Forest </h2>

<h3>Introduction:</h3>

This project is about to analyze web urls to distinguish good and bad ones.

<h3>1.1 Import:</h3><p>

In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import jieba
from nltk.stem import PorterStemmer
import re
from sklearn.model_selection import train_test_split
# from sklearn.externals import joblib
import joblib
from sklearn.preprocessing import StandardScaler
good_dateStr = '20210402'
bad_dateStr = '20210331'
dirPath = 'your/local/directory/path'
good_case = pd.read_csv(dirPath + '/transcoder-us-train/us_case/us_good_case'+good_dateStr+'.txt')
bad_case = pd.read_csv(dirPath + '/transcoder-us-train/us_case/us_bad_case'+bad_dateStr+'.txt')
STOP_WORDS = ['-','/', '.', ':']
ps = PorterStemmer()
pattern = '^.*\\d{5}.*$'
pattern2 = '.*([0-9]{4}[/-]?[0-9]{2}[/-]?[0-9]{2}).*'
patternP = '.*(p=[0-9]{3,6})$'
patternEndNum = '.*/([0-9]{3,})$'
patternEndHTML = '.*(html|htm|php)$'
patternStart = '.*&start=([0-9]{1,3})/?$'
f_names = ['length', 'path_length', 'p_flag', 'start_flag', 'article', 'news_id', 'contains_date','page' , 'end_with_num', 'news', 'cat', 'dot', 'dash', 'slash', 'hash','token_num','end_with_html']

jieba.add_word('category')
jieba.add_word('news')
jieba.add_word('article')
jieba.add_word('page')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5w/4pf1vf8912l7_xp0jxn1k7s40000gn/T/jieba.cache
Loading model cost 1.672 seconds.
Prefix dict has been built successfully.


In [2]:
def weighted_accuracy(pred, true):
    assert(len(pred) == len(true))
    num_labels = len(true)
    num_pos = sum(true)
    num_neg = num_labels - num_pos
    frac_pos = num_pos/num_labels
    weight_pos = 1/frac_pos
    weight_neg = 1/(1-frac_pos)
    num_pos_correct = 0
    num_neg_correct = 0
    for pred_i, true_i in zip(pred, true):
        num_pos_correct += (pred_i == true_i and true_i == 1)
        num_neg_correct += (pred_i == true_i and true_i == 0)
    weighted_accuracy = ((weight_pos * num_pos_correct) 
                         + (weight_neg * num_neg_correct))/((weight_pos * num_pos) + (weight_neg * num_neg))
    return weighted_accuracy

<h3>1.2 Pre-process data:</h3><p>

In [3]:
def get_features(url, label):
    dict = {}
    path = urlparse(url).path
    dict['url'] = url
    dict['path'] = path
    dict['length'] = len(url)
    dict['path_length'] = len(path)
    url_term = jieba.cut_for_search(url)
    url_term = [x for x in list(url_term) if x not in STOP_WORDS]
    words = []
    for word in url_term:
        word = ps.stem(word)
        words.append(word)
    dict['token_num'] = len(words)
    dict['news_id'] = 1 if re.match(pattern, url) else 0
    dict['cat'] = 1 if 'categori' in url_term else 0
    dict['news'] = 1 if 'news' in url_term else 0
    dict['dot'] = path.count('.')
    dict['dash'] = path.count('-')
    dict['slash'] = path.count('/')
    dict['hash'] = path.count('#')  
    dict['contains_date'] = 1 if re.match(pattern2, url) else 0
    dict['p_flag'] = 1 if re.match(patternP, url) else 0
    dict['article'] = 1 if 'articl' in url_term else 0
    dict['page'] = 1 if 'page' in url_term else 0
    dict['end_with_num'] = 1 if re.match(patternEndNum, url) else 0
    dict['start_flag'] = 1 if re.match(patternStart, url) else 0
    dict['label'] = label
    dict['end_with_html'] = 1 if re.match(patternEndHTML, url) else 0
    return dict

In [4]:
type(STOP_WORDS)

list

In [5]:
good_case.head()

Unnamed: 0,url
0,https://tulsaworld.com/news/local/crime-and-co...
1,https://www.purdueexponent.org/campus/article_...
2,https://thesouthern.com/pages/local-business-s...
3,https://www.sacbee.com/news/california/water-a...
4,https://www.mcclatchydc.com/news/politics-gove...


In [6]:
cases = []
for case in good_case['url']:
    cases.append(get_features(case,'1'))
good_df = pd.DataFrame(cases)

In [7]:
cases = []
for case in bad_case['url']:
    cases.append(get_features(case,'0'))
bad_df = pd.DataFrame(cases)

In [8]:
all_url = pd.concat([good_df, bad_df], axis=0).fillna('')

In [9]:
train_feature = all_url[f_names]
train_traget = all_url[['label']]
x_train, x_test, y_train, y_test = train_test_split(train_feature, train_traget, random_state=22, test_size=0.2)
# transfer = StandardScaler()
# x_train = transfer.fit_transform(x_train)
# x_test = transfer.transform(x_test)

In [10]:
type(train_feature)

pandas.core.frame.DataFrame

In [11]:
x_train.shape

(126592, 17)

<h3>1.3 RandomForestClassifier with GridSearchCV:</h3><p>

<h5>find best model with through cross validation</h5>

In [2]:
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.model_selection import GridSearchCV


estimator = RandomForestClassifier()
param_dict = {"n_estimators": [100,200,400,800]} 
model = GridSearchCV(estimator, param_grid=param_dict, cv=10)
model.fit(train_feature,train_traget)
joblib.dump(model, dirPath + '/transcoder-us-train/model_us_RF.pkl')

# model_test = joblib.load('/Users/lxy/machine_learning/model/transcoder/model_us.pkl')
# y_predict = model_test.predict(x_test)

NameError: name 'train_feature' is not defined

In [23]:
# best_model = RandomForestClassifier(**model.best_params_)
# best_model.fit(train_feature,train_traget)

In [24]:
print(type(model))
print(model.best_params_)
print(model)

<class 'sklearn.model_selection._search.GridSearchCV'>
{'n_estimators': 200}
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [100, 200, 400, 800]})


<h5>calculate accuracy for best_model</h5>

In [25]:
x_array = model.predict(x_test).astype(np.int)
y_array = y_test.to_numpy().flatten().astype(np.int)

In [26]:
acc = weighted_accuracy(x_array, y_array)
test_score = model.score(x_test,y_test)
train_score = model.score(x_train,y_train)
print("weigted accuracy is: "+str(acc))
print("test data score is: "+str(test_score))
print("training data score is: "+str(train_score))

weigted accuracy is: 0.9761035151523119
test data score is: 0.9822105662285137
training data score is: 0.9835455637007078


In [27]:
# best_score_ only exists for refit = true
print(model.best_score_)

0.9231231041456016


<h5>classification report for test data</h5>

In [29]:
from sklearn.metrics import classification_report
y_predict = model.predict(x_test)
report = classification_report(y_test, y_predict, labels=[0,1], target_names=['列表页', '非列表页'])
print(report)

  mask &= (ar1 != a)


              precision    recall  f1-score   support

         列表页       0.98      0.96      0.97      9987
        非列表页       0.98      0.99      0.99     21661

   micro avg       0.98      0.98      0.98     31648
   macro avg       0.98      0.98      0.98     31648
weighted avg       0.98      0.98      0.98     31648



<h5>classification report for trainning data</h5>

In [30]:
y_predict = model.predict(x_train)
report = classification_report(y_train, y_predict, labels=[0,1], target_names=['列表页', '非列表页'])
print(report)

  mask &= (ar1 != a)


              precision    recall  f1-score   support

         列表页       0.98      0.96      0.97     39981
        非列表页       0.98      0.99      0.99     86611

   micro avg       0.98      0.98      0.98    126592
   macro avg       0.98      0.98      0.98    126592
weighted avg       0.98      0.98      0.98    126592



<h3>1.4 Tests</h3><p>

In [32]:
ps.stem('page')

'page'

In [33]:
print('everything is ok')

everything is ok


<h3>1.5 Test for particular url</h3><p>

In [40]:
model_us_test = joblib.load(dirPath + '/transcoder-us-train/model_us_10.pkl')

In [41]:
def get_features_test(url):
    dict = {}
    path = urlparse(url).path
    dict['path'] = path
    dict['length'] = len(url)
    dict['path_length'] = len(path)
    url_term = jieba.cut_for_search(url)
    print(url_term)
    url_term = [x for x in list(url_term) if x not in STOP_WORDS]
    words = []
    for word in url_term:
        word = ps.stem(word)
        words.append(word)
    dict['token_num'] = len(words)
    dict['news_id'] = 1 if re.match(pattern, url) else 0
    dict['cat'] = 1 if 'category' in url_term else 0
    dict['news'] = 1 if 'news' in url_term else 0
    dict['dot'] = path.count('.')
    dict['dash'] = path.count('-')
    dict['slash'] = path.count('/')
    dict['hash'] = path.count('#')
    dict['p_flag'] = 1 if re.match(patternP, url) else 0
    dict['article'] = 1 if 'article' in url_term else 0
    dict['page'] = 1 if 'page' in url_term else 0
    dict['end_with_num'] = 1 if re.match(patternEndNum, url) else 0
    dict['start_flag'] = 1 if re.match(patternStart, url) else 0
    dict['contains_date'] = 1 if re.match(pattern2, url) else 0
    dict['end_with_html'] = 1 if re.match(patternEndHTML, url) else 0
    return dict

In [42]:
def getPredict(url):
    urlFea = get_features_test(url)
    cases = []
    cases.append(urlFea)
    df = pd.DataFrame(cases)
    train_feature = df[f_names]
    return model_us_test.predict(train_feature)[0]

In [43]:
print(getPredict('https://www.mediacongo.net//articles-actualite-9_societe.html'))

<generator object Tokenizer.cut_for_search at 0x1379e2048>
0
