In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import string, re, io
import eli5

% matplotlib inline
matplotlib.style.use('ggplot')

  from numpy.core.umath_tests import inner1d


In [2]:
from scipy.sparse import hstack
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
from lightgbm import LGBMClassifier

In [4]:
class TrainTestHelper(object):
    """Stack a train set on top of a test set and cut the result apart again.

    ``combine`` records how many rows the train part had; ``split`` uses that
    count to recover the two parts from any object with the same row layout.
    Both NumPy arrays and pandas objects are supported.
    """

    def __init__(self):
        # Row count of the train part from the last combine() call (None until then).
        self.ntrain = None

    def combine(self, train, test):
        """Return ``train`` followed by ``test`` as one object.

        Args:
            train: ``np.ndarray`` or pandas DataFrame/Series holding the train rows.
            test: object of the same kind holding the test rows.

        Returns:
            A stacked ``np.ndarray`` when ``train`` is an array, otherwise the
            concatenated pandas object with a fresh 0..n-1 index.
        """
        self.ntrain = train.shape[0]
        if isinstance(train, np.ndarray):
            # np.vstack replaces the np.row_stack alias, which is deprecated in NumPy 2.0.
            return np.vstack((train, test))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        return pd.concat([train, test], sort=False).reset_index(drop=True)

    def split(self, train_test):
        """Cut ``train_test`` back into its train and test rows.

        Args:
            train_test: ``np.ndarray`` or pandas object whose first ``ntrain``
                rows are the train part.

        Returns:
            ``(train, test)``, or ``None`` when ``combine`` has not been called
            yet. Array results are slices of the input; pandas results are
            copies with their index reset.
        """
        if self.ntrain is None:
            return None
        if isinstance(train_test, np.ndarray):
            # Row-only slicing works for 1-D as well as 2-D arrays.
            train = train_test[:self.ntrain]
            test = train_test[self.ntrain:]
        else:
            train = train_test.iloc[:self.ntrain].copy().reset_index(drop=True)
            test = train_test.iloc[self.ntrain:].copy().reset_index(drop=True)
        return train, test
    
def clean_text(x):
    """Return ``x`` with each ASCII punctuation character swapped for a space.

    Characters are checked against ``string.punctuation``; everything else
    (letters, digits, whitespace, non-ASCII symbols) is kept unchanged, so the
    result has the same length as the input.
    """
    return ''.join(' ' if ch in string.punctuation else ch for ch in x)

def count_regexp_occ(regexp='', text=None):
    """Count the non-overlapping matches of ``regexp`` in ``text``.

    Args:
        regexp: pattern handed to ``re.findall``.
        text: string to search. ``None`` (the default) is treated as
            "nothing to search" and yields 0 instead of the ``TypeError``
            that ``re.findall`` raises for a ``None`` subject.

    Returns:
        Number of matches as an ``int``.
    """
    if text is None:
        return 0
    return len(re.findall(regexp, text))

def load_vectors(fname):
    """Read a text file of word vectors into a ``{token: [float, ...]}`` dict.

    Expected layout, as parsed below: a header line with two integers, then
    one line per token consisting of the token followed by its components,
    all separated by single spaces.
    (NOTE(review): this looks like the fastText ``.vec`` format — confirm.)

    Args:
        fname: path of the vector file (read as UTF-8, undecodable bytes ignored).

    Returns:
        dict mapping each token to a list of floats.

    Raises:
        ValueError: if the header is not two integers or a component is not a float.
    """
    data = {}
    # The context manager guarantees the handle is closed, also on parse errors.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed afterwards; parsing them still
        # rejects files that do not start with the expected two integers.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the components: on Python 3 a bare map() is a lazy,
            # one-shot iterator that cannot be added to a NumPy array later.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

def CBOW(x, vec, dim=300):
    """Sum the vectors of the tokens in ``x`` (a bag-of-words embedding).

    Args:
        x: iterable of tokens.
        vec: mapping from token to a length-``dim`` vector (array or list).
        dim: dimensionality of the vectors; defaults to 300, the value that
            was previously hard-coded.

    Returns:
        ``np.ndarray`` of shape ``(dim,)``; all zeros when no token is found.
    """
    w = np.zeros(dim)
    for token in x:
        try:
            w += vec[token]
        except KeyError:
            # Tokens missing from the mapping contribute nothing to the sum.
            pass
    return w

In [58]:
# Read the three input tables: training rows, test rows and the submission template.
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test_nvPHrOx.csv', 'sample_submission_poy1UIu.csv')
)

# Stack train on top of test so the feature engineering runs once over both;
# the helper remembers the train row count for the later split.
helper = TrainTestHelper()
data = helper.combine(train, test)

In [59]:
# --- Count features on the raw 'Domain' / 'Url' strings -----------------------
# Every value goes through str() so all lambdas treat the columns the same way
# (previously some called x.count / len / the regex on the raw value).
data['ndot_domain'] = data['Domain'].apply(lambda x: str(x).count('.'))
data['ndot_url'] = data['Url'].apply(lambda x: str(x).count('.'))
data['nslash_url'] = data['Url'].apply(lambda x: str(x).count('/'))
data['mail_url'] = data['Url'].apply(lambda x: count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', str(x)))
data['nupper_url'] = data['Url'].apply(lambda x: count_regexp_occ(r"[A-Z]", str(x)))
# NOTE(review): the alternation counts every two-digit run as well as ":dd",
# not only hh:mm patterns — confirm this is intended.
data['time_url'] = data['Url'].apply(lambda x: count_regexp_occ(r"\d{2}|:\d{2}", str(x)))
data['date_long_url'] = data['Url'].apply(lambda x: count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", str(x)))
data['date_short_url'] = data['Url'].apply(lambda x: count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", str(x)))
data['nchick_url'] = data['Url'].apply(lambda x: str(x).count("!"))
data['nqmark_url'] = data['Url'].apply(lambda x: str(x).count("?"))
# 1 when the URL uses plain http, 0 otherwise. Must run before the scheme is stripped below.
data['http_vs_https'] = data['Url'].apply(lambda x: str(x).startswith('http://')).astype(int)
data['ndigit_domain'] = data['Domain'].apply(lambda x: sum([c.isdigit() for c in str(x)]))
data['ndigit_url'] = data['Url'].apply(lambda x: sum([c.isdigit() for c in str(x)]))

# --- Text clean-up and length features ----------------------------------------
# This overwrites data['Url'] with the scheme removed, so re-running the cell
# without reloading the data gives http_vs_https == 0 for every row.
data['Url'] = data['Url'].apply(lambda x: str(x).replace('https://', '').replace('http://', ''))
data['Domain_nopunct'] = data['Domain'].apply(lambda x: clean_text(str(x)))
data['Url_nopunct'] = data['Url'].apply(lambda x: clean_text(x))
data['all_text'] = data['Domain_nopunct'] + ' ' + data['Url_nopunct']
data['len_url'] = data['Url'].apply(len)
data['len_domain'] = data['Domain'].apply(lambda x: len(str(x)))
data['nword_url'] = data['Url_nopunct'].apply(lambda x: len(x.split(' ')))
# Fixed: this was computed from 'Url_nopunct', which made it a copy of nword_url.
data['nword_domain'] = data['Domain_nopunct'].apply(lambda x: len(x.split(' ')))
# Digit ratios; a zero-length string would give NaN/inf here.
data['rdigit_domain'] = data['ndigit_domain'] / data['len_domain']
data['rdigit_url'] = data['ndigit_url'] / data['len_url']

In [60]:
# Columns carried forward: the combined text, the engineered numeric features,
# and 'Tag' / 'Domain', which the following cells use for the label encoder
# and for sorting before they are deleted.
cols = [
    'all_text',
    'ndot_domain', 'ndot_url', 'nslash_url',
    'mail_url', 'nupper_url', 'time_url',
    'date_long_url', 'date_short_url',
    'nchick_url', 'nqmark_url', 'http_vs_https',
    'len_url', 'len_domain',
    'nword_url', 'nword_domain',
    'ndigit_domain', 'ndigit_url',
    'rdigit_domain', 'rdigit_url',
    'Tag', 'Domain',
]
data = data[cols]

In [61]:
# Undo the train/test stacking using the row count recorded by helper.combine().
Train, Test = helper.split(data)

# Learn the tag -> integer mapping from the training rows.
le = LabelEncoder().fit(Train['Tag'])

# Order the training rows by domain; the next cell splits by row position.
# NOTE(review): presumably this keeps rows of one domain on the same side of
# the train/validation cut — confirm.
Train = Train.sort_values(by='Domain').reset_index(drop=True)

In [62]:
# Positional hold-out split. .loc slicing is label-inclusive, so `train` gets
# rows 0..25009 and `valid` everything from row 25010 onwards.
train = Train.loc[:25009, :]
valid = Train.loc[25010:, :].reset_index(drop=True)

# Encode the targets before the 'Tag' columns are removed below.
y = le.transform(Train['Tag'])
y_train = le.transform(train['Tag'])
y_valid = le.transform(valid['Tag'])

# Drop label and sort key so only model inputs remain in each frame.
del Train['Tag'], Train['Domain']
del Test['Tag'], Test['Domain']
del train['Tag'], train['Domain']
del valid['Tag'], valid['Domain']

In [82]:
# Word-level and character n-gram TF-IDF. Both are fitted on `data`, i.e. the
# text of the stacked train + test rows, so vocabulary and IDF weights also
# see the validation and test text.
tfidf = TfidfVectorizer(min_df=5, max_features=200, ngram_range=(1, 1)).fit(data['all_text'])
char = TfidfVectorizer(analyzer='char', max_features=500, ngram_range=(2, 4)).fit(data['all_text'])

# Transform the hold-out split with the fitted vectorizers.
train_tfidf, valid_tfidf = (tfidf.transform(part['all_text']) for part in (train, valid))
train_char, valid_char = (char.transform(part['all_text']) for part in (train, valid))

In [83]:
# Numeric columns placed next to the TF-IDF blocks in the feature matrix
# (a subset of the engineered features).
num_feats = [
    'ndot_url',
    'time_url',
    'nqmark_url',
    'ndigit_domain',
    'rdigit_domain',
    'rdigit_url',
]

In [84]:
# Column-wise concatenation into one sparse matrix per split.
train_features = hstack([
    train[num_feats],  # numeric columns
    train_tfidf,       # word-level TF-IDF
    train_char,        # character n-gram TF-IDF
])
valid_features = hstack([
    valid[num_feats],
    valid_tfidf,
    valid_char,
])

In [85]:
# L2-regularised logistic regression fitted on the training part of the hold-out split.
lr = LogisticRegression(
    C=0.5,
    penalty='l2',
    random_state=0,
)
lr.fit(train_features, y_train)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
# Hold-out evaluation: support-weighted F1 over the classes.
valid_pred = lr.predict(valid_features)
print(
    'F1 score:',
    f1_score(y_true=y_valid, y_pred=valid_pred, average='weighted'),
)

F1 score: 0.6204493460655778


# Inference

In [78]:
# Build the feature matrices for the full training set and the test set with
# the already-fitted vectorizers.
# NOTE(review): the recorded execution counts for this section (78-80) are
# lower than the vectorizer cell's (82); run the notebook top to bottom before
# trusting the submission.
Train_tfidf, Test_tfidf = (tfidf.transform(part['all_text']) for part in (Train, Test))
Train_char, Test_char = (char.transform(part['all_text']) for part in (Train, Test))

Train_features = hstack([
    Train[num_feats],
    Train_tfidf,
    Train_char,
])
Test_features = hstack([
    Test[num_feats],
    Test_tfidf,
    Test_char,
])

In [79]:
# Refit the same model configuration on all labelled rows for the final predictions.
lr = LogisticRegression(
    C=0.5,
    penalty='l2',
    random_state=0,
)
lr.fit(Train_features, y)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [80]:
# Predict encoded classes for the test rows and map them back to tag strings.
# NOTE(review): assumes the submission template rows are in the same order as
# the test file — confirm.
test_pred = lr.predict(Test_features)
ss['Tag'] = le.inverse_transform(test_pred)

  if diff:


In [87]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
ss.to_csv('lr_0.62CV.csv', index=False)