# Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression

# Training Data Preparation

In [2]:
train = pd.read_csv("./train_data_withlabels.csv")
train = train.iloc[:, 1:]

# change type from string to datetime object
train.localminute = pd.to_datetime(train.localminute)

# create new columns
train['month'] = train.localminute.dt.month
train['date'] = train.localminute.dt.day
train['weekday'] = train.localminute.dt.weekday
train['hour'] = train.localminute.dt.hour
train['minute'] = train.localminute.dt.minute

# remove localminute column
train = train.iloc[:, 1:]

# move target column just for ease
target = train.pop('target')
train['target'] = target

# get column names
col_names = train.columns.to_list()

# normalise the data
target = train.target
feature_a = train.iloc[:,0:7].columns.to_list()
feature_b = train.iloc[:,7:-1].columns.to_list()
normal_standard = ColumnTransformer([('standardscaler', StandardScaler(), feature_a)], 
                                    remainder='passthrough')
normal_minmax = ColumnTransformer([('minmaxscaler', MinMaxScaler(), feature_b)], 
                                  remainder='passthrough')

# use sklearn to normalize data
train_normal = pd.DataFrame(normal_standard.fit_transform(train))
# rename columns
train_normal.columns = col_names
# do the rest
train_normal = pd.DataFrame(normal_minmax.fit_transform(train_normal))

# rename columns
train_normal.columns = feature_b + feature_a + ["target"]

## Training and Validation Process

In [3]:
train_f = train_normal.drop('month', axis = 1)

# convert dates to categorical
train_f['date'] = train_f['date'].astype('category')
train_f['weekday'] = train_f['weekday'].astype('category')
train_f['hour'] = train_f['hour'].astype('category')
train_f['minute'] = train_f['minute'].astype('category')

targ = train_f.target
train_f = train_f.drop('target', axis = 1)
train_f['target'] = targ

# dummies
#train_f = pd.get_dummies(train_f) 

# split to train and validation
validation_x, train_x = train_test_split(train_f, test_size = 0.9, train_size = 0.1)

# separate x and y
train_y = train_x.target
train_x = train_x.drop('target', axis = 1)

validation_y = validation_x.target
validation_x = validation_x.drop('target', axis = 1)

#fit linear regression model
model = sm.OLS(train_y, train_x).fit()

#view model summary
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.421
Model:                            OLS   Adj. R-squared (uncentered):              0.421
Method:                 Least Squares   F-statistic:                          2.495e+04
Date:                Tue, 07 Jun 2022   Prob (F-statistic):                        0.00
Time:                        20:44:51   Log-Likelihood:                      1.1714e+05
No. Observations:              377828   AIC:                                 -2.343e+05
Df Residuals:                  377817   BIC:                                 -2.341e+05
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

# Test Preparation

## Prepare Training Data

In [4]:
train_y = train_normal.target
train_x = train_normal.drop('target', axis = 1)

train_x = train_x.drop('month', axis = 1)

# convert dates to categorical
train_x['date'] = train_x['date'].astype('category')
train_x['weekday'] = train_x['weekday'].astype('category')
train_x['hour'] = train_x['hour'].astype('category')
train_x['minute'] = train_x['minute'].astype('category')

## Prepare Test Data

In [5]:
feature_a = train.iloc[:,0:7].columns.to_list()
feature_b = train.iloc[:,8:-1].columns.to_list()
normal_standard = ColumnTransformer([('standardscaler', StandardScaler(), feature_a)], 
                                    remainder='passthrough')
normal_minmax = ColumnTransformer([('minmaxscaler', MinMaxScaler(), feature_b)], 
                                  remainder='passthrough')

test = pd.read_csv("./test_data_nolabels.csv")

# transform test data

# get dataid
test_id = test.dataid

# change type from string to datetime object
test.localminute = pd.to_datetime(test.localminute)

# create new columns
test['date'] = test.localminute.dt.day
test['weekday'] = test.localminute.dt.weekday
test['hour'] = test.localminute.dt.hour
test['minute'] = test.localminute.dt.minute

# remove localminute and dataid column
test = test.drop(['dataid', 'localminute'], axis = 1)

col_names = test.columns

# normalise test data
test = pd.DataFrame(normal_standard.fit_transform(test))
# rename columns
test.columns = col_names
# do the rest
test = pd.DataFrame(normal_minmax.fit_transform(test))
# rename columns
test.columns = feature_b + feature_a

# select test columns
#test = test[features]

# convert dates to categorical
test['date'] = test['date'].astype('category')
test['weekday'] = test['weekday'].astype('category')
test['hour'] = test['hour'].astype('category')
test['minute'] = test['minute'].astype('category')

# Predict

In [6]:
lr_model = LogisticRegression(max_iter=1000).fit(train_x, train_y)

lr_test = lr_model.predict(test).astype('int')

lr_final = pd.DataFrame({"dataid" : test_id, "target" : lr_test})
lr_final.to_csv("predict_label.csv", index = False)
print(lr_final.target.value_counts())
print(lr_model.coef_, lr_model.intercept_)

0    98416
1     6536
Name: target, dtype: int64
[[-0.43011004  0.14269358 -1.48071357 -0.08601288  1.83567011 -0.15163288
   0.09867033  1.68454476  0.76459465 -1.35974101 -0.97647455]] [-5.83595267]
