In [57]:
import re
import os
import sys
import glob
import codecs
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from web_classifer.data import clean_text

sys.path.append("../")
raw_data_dir = "../data/raw_html"
processed_data_dir = "../data/data.csv"
failed_files_path = "../data/failed_to_process_files.csv"

labels = [label for label in os.listdir(raw_data_dir)]

all_files = glob.glob(f"{raw_data_dir}/**/*.htm",recursive=True)
len(all_files)

119266

In [58]:
labels

['auto',
 'book',
 'camera',
 'job',
 'movie',
 'nbaplayer',
 'restaurant',
 'university']

In [103]:
data_list = []
cleaned_text_list = []
label_list = []
failed_files = []
processed_files = []
n_rows = 100
for data_file in tqdm(all_files[:n_rows]):
    try:
      html_code=codecs.open(data_file,'r')
      soup = BeautifulSoup(html_code, 'html.parser')  #Parse html code
      texts = soup.findAll(text=True)                 #find all text
      label = re.findall("|".join(labels),data_file)[0]
      label_list.append(label)
      text_from_html = ' '.join(texts) 
      data_list.append(text_from_html)
      cleaned_text = clean_text(text_from_html)
      cleaned_text_list.append(cleaned_text)
      processed_files.append(data_file)
      
    except Exception as e:
        # print(e)
        failed_files.append((data_file, str(e)))
        
failed_df = pd.DataFrame(failed_files, columns=["filepath", "exception"]).to_csv(failed_files_path, index=False)


100%|██████████| 100/100 [00:11<00:00,  8.79it/s]


In [74]:
failed_file_paths = [ file_path for file_path, _ in failed_files]
processed_files = [ file_path for file_path in all_files if file_path not in failed_file_paths]

In [75]:
len(all_files), len(data_list), len(cleaned_text_list), len(label_list), len(failed_files), len(processed_files)

(119266, 113905, 113905, 113905, 5361, 113905)

In [76]:
data = pd.DataFrame.from_dict({'data_path': processed_files, 'raw_html_text':data_list, 'cleaned_text':cleaned_text_list, 'label': label_list})
data.head()

Unnamed: 0,data_path,raw_html_text,cleaned_text,label
0,../data/raw_html\auto\auto-autobytel(2000)\102...,"ï»¿ \n HTML PUBLIC ""-//W3C//DTD HTML 4.01 Tran...",html public w3c dtd html 4 01 transitional en ...,auto
1,../data/raw_html\auto\auto-autobytel(2000)\102...,"ï»¿ \n HTML PUBLIC ""-//W3C//DTD HTML 4.01 Tran...",html public w3c dtd html 4 01 transitional en ...,auto
2,../data/raw_html\auto\auto-autobytel(2000)\102...,"ï»¿ \n HTML PUBLIC ""-//W3C//DTD HTML 4.01 Tran...",html public w3c dtd html 4 01 transitional en ...,auto
3,../data/raw_html\auto\auto-autobytel(2000)\102...,"ï»¿ \n HTML PUBLIC ""-//W3C//DTD HTML 4.01 Tran...",html public w3c dtd html 4 01 transitional en ...,auto
4,../data/raw_html\auto\auto-autobytel(2000)\102...,"ï»¿ \n HTML PUBLIC ""-//W3C//DTD HTML 4.01 Tran...",html public w3c dtd html 4 01 transitional en ...,auto


In [77]:
# re.findall("|".join(labels),data_file)

In [86]:
cols = ["data_path","cleaned_text",	"label"]
data["cleaned_text"] = data["cleaned_text"].apply(lambda x: str(x).encode("utf-8", errors="ignore").decode("utf-8", errors="ignore"))
data.to_csv("../data/all_data.csv", index=False)


In [91]:
train_data = data.groupby('label').sample(n=2000, random_state=42)
train_data[cols].to_csv("../data/train.csv", index=False)

Modeling


In [95]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [93]:
from sklearn.model_selection import train_test_split
train_df,  test_df = train_test_split(
     train_data, test_size=0.33, random_state=42)

In [99]:
n_vocabulary = 50000
text_clf = Pipeline([
     ('tfidf',  TfidfVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_features=n_vocabulary)),
     ('clf', MultinomialNB()),
])

In [101]:
import numpy as np
text_clf.fit(train_df.cleaned_text, train_df.label)

predicted = text_clf.predict(test_df.cleaned_text)
np.mean(predicted == test_df.label)

0.9884469696969697

In [102]:
from sklearn import metrics
print(metrics.classification_report(test_df.label, predicted,
     target_names=test_df.label.unique()))

              precision    recall  f1-score   support

       movie       1.00      1.00      1.00       646
      camera       1.00      0.91      0.95       669
         job       0.92      1.00      0.96       695
        auto       1.00      1.00      1.00       644
   nbaplayer       1.00      1.00      1.00       706
  university       1.00      1.00      1.00       645
        book       1.00      1.00      1.00       651
  restaurant       1.00      1.00      1.00       624

    accuracy                           0.99      5280
   macro avg       0.99      0.99      0.99      5280
weighted avg       0.99      0.99      0.99      5280



In [None]:



# cat_cols = []
# date_col = []
# num_cols = []
# text_features = ['num_words', 'num_unique_words', 'num_chars', 'num_stopwords', 'num_punctuations', 'num_words_upper','num_words_title', 'mean_word_len']
target = 'label'
text_col = 'cleaned_text'
# features = cat_cols + num_cols + text_col + text_features + date_features

# train_df[cat_cols] = train_df[cat_cols].astype(str)
# test_df[cat_cols] = test_df[cat_cols].astype(str)
# https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments#Max_df
# https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
n_vocabulary = 50000
preprocessor = ColumnTransformer(
     transformers=[
     #     ('numerical', MinMaxScaler(), num_cols),
        ('text', TfidfVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_features=n_vocabulary), text_col), #max_df=1.0, min_df=1,
     #    ('category', OneHotEncoder(handle_unknown='ignore'), cat_cols),
     ],
remainder='passthrough')

train_x = preprocessor.fit_transform(train_df[features])
test_x = preprocessor.transform(test_df[features])

Base line

In [None]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(max_depth=10, random_state=42)

regr.fit(train_x, train_df[target],n_jobs=-1)

predicted = regr.predict(test_x)
test_df['score'] = predicted

test_df[['id','score']].to_csv('./sk_randomforest_base_line_submission.csv',index=False)

5 fold cv

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split
import lightgbm as lgb
# out-of-fold predictions on train data
oof = np.zeros(train_x.shape[0])

# averaged predictions on train data
prediction = np.zeros(test_x.shape[0])

# list of scores on folds
scores = []
feature_importance = pd.DataFrame()
# n_estimators = 
params = {'num_leaves': 128,
          'min_child_samples': 100,
          'objective': 'regression',
          'max_depth': 7,
          'learning_rate': 0.25,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }
verbose=500
early_stopping_rounds=200
n_estimators=3000
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
# split and train on folds
# https://www.kaggle.com/artgor/using-meta-features-to-improve-model#Training-separate-models-for-each-type
for fold_n, (train_index, valid_index) in enumerate(folds.split(train_x)):

    print(f'Training on Fold {fold_n + 1}')
    X_train, X_valid = train_x[train_index,:], train_x[valid_index,:]
    # y_train, y_valid = y[train_index], y[valid_index]

    # X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = train_df[target].iloc[train_index], train_df[target].iloc[valid_index]

    model = lgb.LGBMRegressor(**params, n_estimators = n_estimators, n_jobs = -1)
    model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
            verbose=verbose, early_stopping_rounds=early_stopping_rounds)
    
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(test_x, num_iteration=model.best_iteration_)
        
    oof[valid_index] = y_pred_valid.reshape(-1,)
    scores.append(metrics.mean_absolute_error(y_valid, y_pred_valid))
    prediction += y_pred 

prediction /= folds.n_splits

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
test_df['score'] = prediction

test_df[['id','score']].to_csv(f'./lgbm_{n_fold}_50k_vocabulary_fold_cv_ensemble_submission.csv',index=False)


Grid search cv

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train_x, train_df[target])
print("Grid search CV best parameters", grid_search.best_params_)

best_grid = grid_search.best_estimator_
predicted = best_grid.predict(test_x)
test_df['score'] = predicted

test_df[['id','score']].to_csv('./sk_randomforest_grid_searchcv_submission.csv',index=False)