In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
#from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

#from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import optuna
from optuna.trial import Trial
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw feature table and the label table from the working directory.
X_model, Y_model = (
    pd.read_csv(csv_name) for csv_name in ('X_model.csv', 'Y_model.csv')
)
# Define preprocessors
def _sum_matching_columns(frame: pd.DataFrame, pattern: str) -> np.ndarray:
    """Return the row-wise sum of every column whose name matches ``pattern``."""
    return np.sum(frame.filter(regex=pattern, axis=1).values, axis=1)


def _safe_ratio(numerator, denominator) -> np.ndarray:
    """Element-wise ``numerator / denominator`` that yields 0 where the denominator is 0.

    A plain ``np.divide`` produces NaN for 0/0 and +/-inf for x/0. Only the NaN
    case was cleaned up by a later ``fillna(0)``, so infinities could leak into
    the feature matrix. Here both cases map to 0.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    ratio = np.zeros_like(numerator)
    # Entries where the denominator is 0 are skipped and keep their initial 0.
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio


def preprocess(X: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Build the aggregated feature frame used for modelling.

    Missing values in ``X`` are treated as 0. The result keeps ``gender``,
    ``age_code`` and ``region_code`` and adds, in this order:

    * ``cGIT``, ``tGIT``, ``sGIT`` - row sums of the ``c``/``t``/``s`` columns
      whose name contains ``202205`` followed by two digits;
    * ``tVAT``, ``sVAT`` - row sums of the ``t``/``s`` columns whose name
      contains ``202201`` or ``202207`` followed by a two-digit value 00-25;
    * ``cEntire``, ``tEntire``, ``sEntire`` - row sums of the ``c``/``t``/``s``
      columns whose name contains ``2022`` followed by four digits;
    * ``rcGIT``, ``rtGIT``, ``rsGIT``, ``rtVAT``, ``rsVAT`` - each window sum
      divided by the matching ``*Entire`` sum, defined as 0 when that total is 0.

    NOTE(review): the meaning of the ``c``/``t``/``s`` prefixes and of the
    GIT/VAT windows is not visible here - confirm against the data dictionary.

    Parameters
    ----------
    X : pd.DataFrame
        Raw input frame. It is not modified; ``fillna`` returns a new frame.
    verbose : bool, default True
        When True, print the first rows of the result (the original behaviour).

    Returns
    -------
    pd.DataFrame
        Frame with the 16 columns listed above, free of NaN and infinite values
        in the derived columns.
    """
    X = X.fillna(0)
    # Create new dataframe from columns "gender", "age_code", and "region_code"
    X_new = pd.DataFrame(X, columns=['gender', 'age_code', 'region_code'])

    GIT_range = r"202205[0-9]{2}"
    VAT_range = r"20220[17](?:[01][0-9]|2[0-5])"
    entire_range = r"2022[0-9]{4}"

    cGIT = _sum_matching_columns(X, "c" + GIT_range)
    tGIT = _sum_matching_columns(X, "t" + GIT_range)
    sGIT = _sum_matching_columns(X, "s" + GIT_range)

    tVAT = _sum_matching_columns(X, "t" + VAT_range)
    sVAT = _sum_matching_columns(X, "s" + VAT_range)

    cEntire = _sum_matching_columns(X, "c" + entire_range)
    tEntire = _sum_matching_columns(X, "t" + entire_range)
    sEntire = _sum_matching_columns(X, "s" + entire_range)

    X_new['cGIT'] = cGIT
    X_new['tGIT'] = tGIT
    X_new['sGIT'] = sGIT

    X_new['tVAT'] = tVAT
    X_new['sVAT'] = sVAT

    X_new['cEntire'] = cEntire
    X_new['tEntire'] = tEntire
    X_new['sEntire'] = sEntire

    # Share of each window within the full-year total; 0 when the total is 0.
    X_new['rcGIT'] = _safe_ratio(cGIT, cEntire)
    X_new['rtGIT'] = _safe_ratio(tGIT, tEntire)
    X_new['rsGIT'] = _safe_ratio(sGIT, sEntire)

    X_new['rtVAT'] = _safe_ratio(tVAT, tEntire)
    X_new['rsVAT'] = _safe_ratio(sVAT, sEntire)

    # Covers base columns absent from X, which the constructor fills with NaN.
    X_new = X_new.fillna(0)
    if verbose:
        print(X_new.head())

    return X_new
# Replace the raw frame with its engineered-feature version.
X_model = preprocess(X=X_model)

   gender  age_code  region_code  cGIT  tGIT    sGIT  tVAT    sVAT  cEntire  \
0       1        13            7   0.0   0.0     0.0   0.0     0.0      1.0   
1       1         5            1   2.0   0.0    17.0   0.0   185.0     39.0   
2       2         6            2   6.0   3.0  2253.0   0.0  1712.0     26.0   
3       2         1            1   2.0   0.0   342.0   0.0     0.0      5.0   
4       2         5            1   2.0   0.0   273.0   0.0   104.0     10.0   

   tEntire  sEntire     rcGIT  rtGIT     rsGIT  rtVAT     rsVAT  
0      1.0     93.0  0.000000    0.0  0.000000    0.0  0.000000  
1      0.0    790.0  0.051282    0.0  0.021519    0.0  0.234177  
2      3.0   5119.0  0.230769    1.0  0.440125    0.0  0.334440  
3      0.0    647.0  0.400000    0.0  0.528594    0.0  0.000000  
4      1.0    608.0  0.200000    0.0  0.449013    0.0  0.171053  


In [15]:
# Model inputs: the sEntire total plus the five window-ratio columns.
feature_columns = ['sEntire', 'rcGIT', 'rtGIT', 'rsGIT', 'rtVAT', 'rsVAT']
features = X_model[feature_columns]

# Target, kept as a single-column frame.
target_columns = ['business']
check = Y_model[target_columns]

In [16]:
# Hold out a test set. The seed is fixed so the split, and therefore every
# score computed from it, is identical on each Restart & Run All. The
# train/test proportions stay at scikit-learn's defaults.
SPLIT_SEED = 42
train_features, test_features, train_labels, test_labels = train_test_split(
    features, check, random_state=SPLIT_SEED
)

In [17]:
# Standardize the features: the scaler learns its statistics from the training
# split only and then applies that same transform to the test split.
scaler = StandardScaler()
train_features, test_features = (
    scaler.fit_transform(train_features),
    scaler.transform(test_features),
)

In [18]:
# Fit a logistic-regression classifier on the scaled training features.
# The labels are flattened to 1-D first: passing a single-column frame makes
# scikit-learn convert it implicitly and emit a DataConversionWarning.
train_targets = np.ravel(train_labels)
model = LogisticRegression()
model.fit(train_features, train_targets)
# Mean accuracy on the training split.
print(model.score(train_features, train_targets))

0.93397


In [19]:
# Mean accuracy on the held-out split.
test_accuracy = model.score(test_features, test_labels)
print(test_accuracy)

0.933485


In [20]:
# Fitted coefficients, one per scaled input feature.
fitted_coefficients = model.coef_
print(fitted_coefficients)

[[ 0.64869814 -0.01822165  0.28046918 -0.06141629  0.33711658 -0.17327164]]
