In [0]:
# Mount Google Drive inside the Colab VM; the CSV files read below live under
# /content/gdrive, so this has to run before any data is loaded.
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR

import re

from lightgbm import LGBMRegressor, LGBMClassifier, Dataset
from scipy.sparse import hstack, vstack, csr_matrix, save_npz, load_npz
from joblib import dump, load

In [0]:
# All competition files sit in one Drive folder; keeping the folder in a single
# constant means the location only has to be changed in one place.
DATA_DIR = "/content/gdrive/My Drive/Colab Notebooks/ml_sport/data"

train = pd.read_csv(DATA_DIR + "/train.csv")                # labelled passwords
test = pd.read_csv(DATA_DIR + "/Xtest.csv")                 # passwords to predict for
sample = pd.read_csv(DATA_DIR + "/sample_submission.csv")   # submission template

# features

In [0]:
# Replace every missing value in both frames with an empty string so the
# string operations applied to 'Password' later never meet a NaN.
# NOTE(review): read_csv's default NA parsing turns literal values such as
# 'NA' or 'null' into NaN, so such passwords would be blanked here -- confirm
# whether that is intended.
train, test = (frame.fillna('') for frame in (train, test))

In [0]:
def _longest_run(passwords, pattern):
    """Return, per password, the length of the longest substring matching ``pattern``.

    ``passwords`` is a Series of strings; a password without any match gets 0.
    """
    return passwords.str.findall(pattern).apply(
        lambda runs: max((len(run) for run in runs), default=0))


def add_password_features(df):
    """Add character-level features of ``df['Password']`` as new columns.

    The columns are written into ``df`` itself and ``df`` is returned, so the
    same function produces identically named and ordered columns for both the
    training and the test frame.

    Added columns: character-class counts, password length, longest run of
    digits / lower-case / upper-case / non-word characters, and for the first
    and last character the character itself plus upper/digit/non-word/underscore
    indicators. An empty password yields '' for first/last and 0 elsewhere.
    """
    passwords = df['Password']

    df['upper_count'] = passwords.apply(lambda pw: sum(ch.isupper() for ch in pw))
    df['pas_len'] = passwords.str.len()
    df['digit_count'] = passwords.apply(lambda pw: sum(ch.isdigit() for ch in pw))
    df['max_digit_len'] = _longest_run(passwords, r'[0-9]+')
    df['max_lower_let_len'] = _longest_run(passwords, r'[a-z]+')
    # Upper-case runs need [A-Z]; with [a-z] this column was an exact copy of
    # max_lower_let_len.
    df['max_upper_let_len'] = _longest_run(passwords, r'[A-Z]+')
    df['max_spec_symb_len'] = _longest_run(passwords, r'[\W]+')
    # Raw strings: '\W' in a plain literal is an invalid escape sequence.
    df['spec_count'] = passwords.str.count(r'\W')
    df['underline_count'] = passwords.str.count('_')

    # Slicing (pw[:1] / pw[-1:]) gives '' for an empty password instead of raising.
    edges = (('first', lambda pw: pw[:1]), ('last', lambda pw: pw[-1:]))
    for edge, pick in edges:
        char = passwords.apply(pick)
        df[edge] = char
        df[edge + '_is_upper'] = char.str.isupper().astype(int)
        df[edge + '_is_digit'] = char.str.isdigit().astype(int)
        df[edge + '_is_spec'] = char.str.count(r'\W')
        df[edge + '_is_underline'] = char.str.count('_')
    return df


train = add_password_features(train)
test = add_password_features(test)

# split for validation

In [0]:
# Hold out 5% of the labelled rows for validation. Note that X_test / y_test
# are this validation split, not the competition test set (`test`).
features_all = train.drop(columns=['Times'])
target_all = train['Times']
X_train, X_test, y_train, y_test = train_test_split(
    features_all, target_all,
    test_size=0.05, shuffle=True, random_state=42)

In [0]:
# Case-sensitive character n-grams (1 to 6 characters); n-grams occurring in
# fewer than 0.1% or more than 99.9% of the passwords are dropped.
tfidf_params = dict(
    analyzer='char',
    ngram_range=(1, 6),
    lowercase=False,
    min_df=0.001,
    max_df=0.999,
)
tfidf = TfidfVectorizer(**tfidf_params)

In [0]:
%%time
# Fit the vectorizer on the training split only, then apply the fitted
# vectorizer to the validation split.
X_train_tfidf = tfidf.fit_transform(X_train['Password'])
X_test_tfidf = tfidf.transform(X_test['Password'])

In [0]:
# Vectorize the competition test passwords with the vectorizer fitted on X_train.
test_tfidf = tfidf.transform(test['Password'])

In [0]:
# Separate encoders for the first and the last character; the first category of
# each is dropped and the indicator columns are integers.
# NOTE(review): handle_unknown is left at its default, so transforming a
# character that was not seen during fit raises -- confirm that cannot happen
# for the validation / test passwords.
onehot_kwargs = {'drop': 'first', 'dtype': int}
oh_f = OneHotEncoder(**onehot_kwargs)
oh_l = OneHotEncoder(**onehot_kwargs)

In [0]:
# Learn the character categories from the training split ...
X_train_first = oh_f.fit_transform(X_train.loc[:, ['first']])
X_train_last = oh_l.fit_transform(X_train.loc[:, ['last']])

# ... and encode the validation split with the fitted encoders.
X_test_first = oh_f.transform(X_test.loc[:, ['first']])
X_test_last = oh_l.transform(X_test.loc[:, ['last']])

In [0]:
# Encode the competition test set's first / last characters with the encoders
# fitted on the training split.
test_first, test_last = (
    encoder.transform(test[[column]])
    for encoder, column in ((oh_f, 'first'), (oh_l, 'last'))
)

In [0]:
# Make sure all four one-hot blocks are CSR matrices before stacking.
X_train_first, X_train_last, X_test_first, X_test_last = map(
    csr_matrix,
    (X_train_first, X_train_last, X_test_first, X_test_last),
)

In [0]:
# Same CSR conversion for the competition test set's one-hot blocks.
test_first, test_last = csr_matrix(test_first), csr_matrix(test_last)

In [0]:
# The raw password and its first / last character are strings and are encoded
# separately (tf-idf, one-hot); the remaining columns go into a sparse matrix.
raw_text_cols = ['Password', 'first', 'last']
X_train_csr = csr_matrix(X_train.drop(columns=raw_text_cols))
X_test_csr = csr_matrix(X_test.drop(columns=raw_text_cols))

In [0]:
# Same as for the train / validation frames, with 'Id' dropped as well --
# presumably the test file carries an 'Id' column the training file lacks
# (TODO confirm), so the column sets line up.
test_numeric = test.drop(columns=['Id', 'Password', 'first', 'last'])
test_csr = csr_matrix(test_numeric)

In [0]:
# Full design matrices: tf-idf n-grams + first/last one-hot + numeric features,
# stacked column-wise in the same order for training and validation.
train_blocks = [X_train_tfidf, X_train_first, X_train_last, X_train_csr]
val_blocks = [X_test_tfidf, X_test_first, X_test_last, X_test_csr]
X_tr = hstack(train_blocks)
X_val = hstack(val_blocks)

In [0]:
# Reduced design matrices without the tf-idf block.
train_blocks_1 = [X_train_first, X_train_last, X_train_csr]
val_blocks_1 = [X_test_first, X_test_last, X_test_csr]
X_tr_1 = hstack(train_blocks_1)
X_val_1 = hstack(val_blocks_1)

In [0]:
# Competition test matrix, with the blocks in the same order as X_tr / X_val.
test_blocks = [test_tfidf, test_first, test_last, test_csr]
test_total = hstack(test_blocks)

In [0]:
# Competition test matrix without the tf-idf block (counterpart of X_tr_1).
test_blocks_1 = [test_first, test_last, test_csr]
test_total_1 = hstack(test_blocks_1)

In [0]:
# Display both sparse matrices' summaries -- presumably to check that the test
# and training matrices have the same number of columns.
test_total, X_tr

In [0]:
# Persist the train / validation matrices in the current working directory.
matrices_to_save = (
    ('X_tr.npz', X_tr),
    ('X_val.npz', X_val),
    ('X_tr_1.npz', X_tr_1),
    ('X_val_1.npz', X_val_1),
)
for filename, matrix in matrices_to_save:
    save_npz(filename, matrix)

# model

## lgbm

In [0]:
# IPython help lookup for LGBMClassifier (the trailing `?` is IPython-only syntax).
# NOTE(review): leftover from development; this cell can be dropped before sharing.
LGBMClassifier?

In [0]:
# Gradient-boosted classifier. Despite the name `reg`, this is a classifier; it
# is later fitted on the binary target "Times > 1".
# NOTE(review): 'recall' is not among the metric names listed in LightGBM's
# parameter documentation, and no evaluation set is passed to fit -- confirm
# whether `metric` has any effect here.
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=7,
    class_weight='balanced',
    metric='recall',
    n_jobs=-1,
    random_state=42,
)
reg = LGBMClassifier(**lgbm_params)

In [0]:
# Binary target: 1 when the password occurs more than once, else 0.
y_train_binary = (y_train > 1).astype(int)
reg.fit(X_tr, y_train_binary)

In [0]:
# Predict on the validation matrix first, then peek at the result. The peek used
# to sit in a cell above the prediction and therefore read `preds` before it was
# assigned, which fails on a fresh kernel with Run All.
preds = reg.predict(X_val)

# First ten predictions, floored at 0 and rounded (0.0 keeps the result float).
np.maximum(preds, 0.0)[:10].round()

In [0]:
# RMSLE of the classifier's validation predictions against the true counts.
# `preds` holds the classifier's predicted labels; they are floored at 0 before
# scoring.
preds_floored = np.max([preds, np.zeros(len(preds))], axis=0)
msle_classifier = mean_squared_log_error(y_test, preds_floored)
np.sqrt(msle_classifier)

In [0]:
# Share of validation passwords that occur more than once.
y_test.gt(1).mean()

In [0]:
# Per-class precision / recall / F1 of the binary "Times > 1" classifier on the
# validation split.
y_val_binary = (y_test > 1).astype(int)
report_text = classification_report(y_val_binary, preds)
print(report_text)

## sgdreg

In [0]:
# Linear regressor trained by SGD with an epsilon-insensitive loss and L2 penalty.
# warm_start=True means a repeated fit() continues from the current
# coefficients instead of starting over.
sgd_params = dict(
    loss='epsilon_insensitive',
    penalty='l2',
    alpha=0.0001,
    learning_rate='adaptive',
    max_iter=100,
    tol=1e-3,
    warm_start=True,
    random_state=42,
)
sgd = SGDRegressor(**sgd_params)

In [0]:
# Fit the regressor on the full feature matrix against the raw 'Times' target.
# Because `sgd` was created with warm_start=True, re-running this cell keeps
# training from the current coefficients.
sgd.fit(X_tr, y_train)

In [0]:
preds_sgd = sgd.predict(X_val)
# mean_squared_log_error raises on negative inputs and a linear model can
# predict below zero, so floor at 0 for scoring only (`preds_sgd` itself is left
# untouched); this mirrors how the classifier predictions are scored.
np.sqrt(mean_squared_log_error(y_test, np.maximum(preds_sgd, 0)))

In [0]:
# Validation RMSLE of the blend: average of the SGD regression output and
# (classifier label + 1).
blend_val = (preds_sgd + preds + 1) / 2
msle_blend = mean_squared_log_error(y_test, blend_val)
np.sqrt(msle_blend)

In [0]:
# Display the classifier's validation predictions.
preds

In [0]:
# Competition test predictions, blended with the same formula as on validation.
p1 = reg.predict(test_total)   # classifier labels
p2 = sgd.predict(test_total)   # SGD regression output
blend_sum = p1 + p2 + 1
p = blend_sum / 2

In [0]:
# Put the blended predictions into the submission template's 'Times' column.
sample = sample.assign(Times=p)

In [0]:
# Write the submission file without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
sample.to_csv('/content/gdrive/My Drive/Colab Notebooks/ml_sport/data/kim_sgdreg_lgbmbin_mix_half.csv', index=False)