In [240]:
# Data processing, CSV file I/O
import pandas as pd
# Pandas display option so int & float values will not be shown in scientific notation
pd.options.display.float_format = '{:.2f}'.format
# Pandas display option so output will be printed with all of the columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Scikit-learn frameworks needed to build the predictive model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


In [241]:
df = pd.read_csv('preprocess.csv')
df_nonstandardize = df.copy()

#### Standardizing numeric inputs

In [242]:
transformer = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), ['age', 'allowance_permonth', 'ordering_amount', 'spending_amount', 'goingout_amount'])
], remainder='passthrough')
df[['age', 'allowance_permonth', 'ordering_amount', 'spending_amount', 'goingout_amount']] = transformer.fit_transform(df[['age', 'allowance_permonth', 'ordering_amount', 'spending_amount', 'goingout_amount']])

#### Building predictive model (Standardized values)

In [243]:

logreg = LogisticRegression()
result = []
def predictionModel(output):
    for i in range(100):
        predictors = df.drop(['interest_output', 'expense_output'], axis=1)
        target = df[output]
        x_train, x_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.20, random_state = i)
        logreg = LogisticRegression()
        logreg.fit(x_train, y_train)
        y_pred = logreg.predict(x_val)
        acc_logreg = round(accuracy_score(y_pred, y_val) * 100, 2)
        result.append(acc_logreg)

Interest score

In [244]:
predictionModel('interest_output')
result_series = pd.Series(result)
result_series.mean()

66.23150000000001

Expense score

In [245]:
predictionModel('expense_output')
result_series = pd.Series(result)
result_series.mean()

63.88540000000001

##### Accuracy result without standardizing

In [246]:

logreg = LogisticRegression()
result2 = []
def predictionModel(output):
    for i in range(100):
        predictors = df_nonstandardize.drop(['interest_output', 'expense_output'], axis=1)
        target = df_nonstandardize[output]
        x_train, x_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.20, random_state = i)
        logreg = LogisticRegression()
        logreg.fit(x_train, y_train)
        y_pred = logreg.predict(x_val)
        acc_logreg = round(accuracy_score(y_pred, y_val) * 100, 2)
        result2.append(acc_logreg)

Interest score

In [247]:
predictionModel('interest_output')
result_series2 = pd.Series(result2)
# result_series.mean()
result_series2.mean()

73.7695

Expense score

In [249]:
predictionModel('expense_output')
result_series2 = pd.Series(result2)
# result_series.mean()
result_series2.mean()

72.693