In [1]:
import pandas as pd
import math

In [68]:
# 1, Data Load
print('#1 Data Load')

main_path = "C:/Users/DS7/Desktop/Programming/AI Platform/data/"
file_name = input("Name of file : ")
print(main_path + file_name)
df = pd.read_csv(main_path + file_name)
df.head(3)

#1 Data Load
C:/Users/DS7/Desktop/Programming/AI Platform/data/sample_data3.csv


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,label
0,8.9,0.12,0.45,1.8,0.075,10.0,21.0,0.99552,3.41,0.76,11.9,7,0
1,5.6,0.19,0.46,1.1,0.032,33.0,115.0,0.9909,3.36,0.5,10.4,6,1
2,6.0,0.26,0.32,3.5,0.028,29.0,113.0,0.9912,3.4,0.71,12.3,7,1


In [71]:
def exploration_data(df):
    rows = len(df.index)
    columns = df.columns

    print(f'Total Rows : {rows}')
    print(f'Total Columns : {len(columns)}')
    for i in range(len(columns)):
        col_name = columns[i]
        print(f'Column Name : {col_name}, NA Count : {df[col_name].isna().sum()}')

In [70]:
def preprocessing_numeric(df):
    numeric_list = ['int64', 'int32', 'float64', 'float32']
    numeric_count = 0

    columns = df.columns

    print(f'Col Name / Col Outlier Count, Range / Min, Max (Numeric)')
    for i in range(len(columns)):
        col_name = columns[i]
        col_type = df[col_name].dtype
        if col_type in numeric_list:
            numeric_count += 1

            # final_max = -1 * math.inf
            # final_min = math.inf

            Q3 = df[col_name].quantile(0.75)
            Q1 = df[col_name].quantile(0.25)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            col_out = len(df[col_name][(df[col_name] < lower) | (df[col_name] > upper)])

            col_max = df[col_name].max()
            col_min = df[col_name].min()

            print(f'{numeric_count + 1} : {col_name} / {col_out}, Range : ({-1.5 * IQR}, {1.5 * IQR}) / {col_min} / {col_max}')
        else: continue
    print(f'Numeric Col Count : {numeric_count}')

def preprocessing_nominal(df, nominal_list):
    print(f'Columns to OneHot : {nominal_list}')

    df = pd.get_dummies(df, columns=nominal_list)
    return df

def preprocessing_ordinal(df, ordinal_list):
    print(f'Columns to Ordinal : {ordinal_list}')
    
    for i in range(len(ordinal_list)):
        unique_values = df[ordinal_list[i]].unique()
        print(f'Column Name : {ordinal_list[i]}')
        print(f'Unique Values of Column : {unique_values}')

        convert_values = []
        for j in range(len(unique_values)):
            val = input(f'Input order of {unique_values[j]}')
            convert_values.append(val)

        df[ordinal_list[i]].replace(unique_values, convert_values, inplace=True)
    return df

def preprocessing_delete(df, delete_list):
    print(f'Columns to delete : {delete_list}')
    
    df.drop(columns=delete_list, axis=1, inplace=True)
    return df

def preprocessing_NA(df, method):
    if method == 'mean': df.fillna(df.mean(), inplace=True)
    elif method == 'mode': df.fillna(df.mode(), inplace=True)
    elif method == 'median': df.fillna(df.median(), inplace=True)
    elif method == 'zero': df.fillna(0, inplace=True)
    elif method == 'delete': df.dropna(axis=0, inplace=True)
    else: print("Still in NA data")
    return df

In [96]:
# 2, Data Exploration and Preprocessing
print('#2 Data Exploration and Preprocessing')

# Check the na data
explore_data = False
if explore_data: exploration_data(df)

# Check the numeric data
check_numeric = False
if check_numeric: preprocessing_numeric(df)

# Check the nominal data
check_norminal = False
if check_norminal: 
    temp_list = input("Input norminal cols : (ex. col1,col2)")
    norminal_list = temp_list.split(',')
    df = preprocessing_nominal(df, norminal_list)

# Check the ordinal data
check_ordinal = False
if check_ordinal:
    temp_list = input("Input ordinal cols : (ex. col1,col2)")
    ordinal_list = temp_list.split(',')
    df = preprocessing_ordinal(df, ordinal_list)

# Delete Col List
check_delete = False
if check_delete:
    temp_list = input("Input delete cols : (ex. col1,col2)")
    delete_list = temp_list.split(',')
    df = preprocessing_delete(df, delete_list)

# Check NA Rows
check_NA = False
if check_NA:
    method = input("How to do NA : (ex. mean, mode, median, zero, delete)")
    df = preprocessing_NA(df, method)

#2 Data Exploration and Preprocessing


In [73]:
# 3, Data Split
print('#3 Data Split')
from sklearn.model_selection import train_test_split
# garagepl
y_column = input("Input Target Column (Class) : ")
reidx_columns = [col for col in df.columns if col != y_column]
reidx_columns.append(y_column)

test_ratio = 0.2
# test_ratio = input("Input the test ratio : (ex. 0.2)")
df = df[reidx_columns]

df_X = df.iloc[:, :-1]
df_y = df.iloc[:, -1]

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=test_ratio, stratify=df_y)

#3 Data Split


In [74]:
# 4, Numeric Data Normalization
print('#4, Numeric Data Normalization')
from sklearn.preprocessing import StandardScaler, MinMaxScaler

numeric_columns = df.select_dtypes(include='number').columns
print(f'Numeric Data Columns : {numeric_columns}')

ss_columns = ['chlorides']
# input 값 받기
mm_columns = ['quality']

min_value = 0
# input 값 받기
max_value = 1

ss_scaler = StandardScaler()
mm_scaler = MinMaxScaler(feature_range=(min_value, max_value))

df_X_train[ss_columns] = ss_scaler.fit_transform(df_X_train[ss_columns])
df_X_train[mm_columns] = mm_scaler.fit_transform(df_X_train[mm_columns])

df_X_test[ss_columns] = ss_scaler.transform(df_X_test[ss_columns])
df_X_test[mm_columns] = mm_scaler.transform(df_X_test[mm_columns])

Numeric Data Columns : Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'label'],
      dtype='object')


In [None]:
# Data Save
preprocessed_df = df
preprocessed_df.to_csv(main_path + "preprocessed_" + file_name, index=False)
df.head(3)

In [76]:
# 5, Model Create and Learning
print('#5, Model Create and Learning')
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(df_X_train, df_y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [131]:
model.coef_[0]

array([-7.01774488e-01, -7.29013079e+00,  3.33309633e+00,  1.44476332e-01,
       -1.02137277e+00, -3.89782286e-02,  6.28493510e-02,  1.59404636e+00,
       -1.07256868e+00, -5.67730012e+00,  5.94627896e-01, -6.32556942e-03])

In [132]:
model.intercept_

array([1.752458])

In [127]:
import statsmodels.api as sm
results = sm.OLS(df_y_train, sm.add_constant(df_X_train)).fit()

In [133]:
results.params

const                   133.226478
fixed acidity             0.044459
volatile acidity         -0.468584
citric acid               0.148207
residual sugar            0.052149
chlorides                -0.025797
free sulfur dioxide      -0.002869
total sulfur dioxide      0.002986
density                -133.150235
pH                        0.158787
sulphates                -0.125974
alcohol                  -0.112203
quality                  -0.094646
dtype: float64

In [136]:
results.summary()['coef']

TypeError: 'Summary' object is not subscriptable

In [129]:
# 6, Model Information
print('#6, Model Information')

pos_max = 0
pos_min = math.inf
neg_max = -1 * math.inf
neg_min = 0

columns = df.columns
opt_list = [[-1, neg_min], [-1, neg_max], [-1, pos_min], [1, pos_max]]

coef_list = model.coef_[0]
for i in range(len(coef_list)):
    coef = coef_list[i] # coef_list 내에서의 계수 값
    if coef > 0: # 계수 값 양수일 때
        if coef > pos_max:
            pos_max_idx = i
            pos_max = coef
            opt_list[3][0] = columns[pos_max_idx]
            opt_list[3][1] = pos_max
        if coef < pos_min:
            pos_min_idx = i
            pos_min = coef
            opt_list[2][0] = columns[pos_min_idx]
            opt_list[2][1] = pos_min
    elif coef < 0: # 계수 값 음수일 때
        if coef > neg_max:
            neg_max_idx = i
            neg_max = coef
            opt_list[1][0] = columns[neg_max_idx]
            opt_list[1][1] = neg_max
        if coef < neg_min:
            neg_min_idx = i
            neg_min = coef
            opt_list[0][0] = columns[neg_min_idx]
            opt_list[0][1] = neg_min
    else: continue

#6, Model Information


In [130]:
opt_list

[['volatile acidity', -7.290130790114555],
 ['quality', -0.006325569422913468],
 ['total sulfur dioxide', 0.06284935102969127],
 ['citric acid', 3.3330963256329613]]

In [None]:
# 7, Model Evaluation
print('#7, Model Evaluation')

y_pred = model.predict(df_X_test)
y_test = df_y_test.to_numpy()

from sklearn.model_selection import cross_val_score, LeaveOneGroupOut

In [None]:
# * Features Selection
# * Hyper Parameters

# 2. Preprocessing Numeric List 미사용 후 dtypes로 확인

# Data Split 진행시 Random State Param 확인
# sklearn, statsmodels 결과 확인 (coef, params)
# * Find p-values of Trained Model