In [None]:
# Install the Python packages for this notebook session.
# NOTE(review): the cells below import TensorFlow/Keras, and nothing in this notebook
# imports torch/torchvision/torchaudio — confirm this package list is still intended.
!pip install torch torchvision torchaudio numpy pandas scikit-learn

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths read in later cells all live
# under /content/drive/MyDrive/.
drive.mount('/content/drive')

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import pickle

# 1. Load data
# Locations of the three CSV inputs (users, loan applications, loan products).
user_data_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_financial_user_data.csv'
loan_application_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_loan_applications.csv'
loan_product_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_credit_loan_products.csv'

# Read the files in the same order as the path variables above.
user_data_df, loan_applications_df, loan_products_df = (
    pd.read_csv(csv_path)
    for csv_path in (user_data_file, loan_application_file, loan_product_file)
)

# 2. Preprocessing

# Start from the loan applications and attach the user columns (joined on
# 'user_id') and the loan product columns (joined on 'loan_id').
df = loan_applications_df.merge(user_data_df, on='user_id')
df = df.merge(loan_products_df, on='loan_id')

# Columns fed through embeddings vs. columns fed as a scaled numeric vector.
# The order of both lists matters: the model inputs are built in this order.
categorical_features = [
    'user_id', 'loan_id', 'gender', 'region', 'occupation', 'late_payment', 'financial_accident',
]
numerical_features = [
    'age', 'annual_income', 'debt', 'credit_score', 'annual_spending', 'num_cards',
    'interest_rate', 'loan_limit', 'loan_term_months', 'credit_score_requirement',
    'total_deposit', 'total_savings', 'total_assets',
]

# Fit one LabelEncoder per categorical column and replace the column with its
# integer codes; the fitted encoders are kept for inference.
label_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder

# Min-max scale all numeric columns with a single scaler fitted on the full frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_values

# Persist the fitted encoders/scaler and the feature lists next to the notebook.
preprocessing_dict = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'categorical_features': categorical_features,
    'numerical_features': numerical_features
}
with open('loan_preprocessing.pkl', 'wb') as f:
    pickle.dump(preprocessing_dict, f)

# Model inputs: one 1-D array per categorical column plus one 2-D numeric matrix;
# the target is the 'loan_applied' column.
y = df['loan_applied'].values
X_numerical = df[numerical_features].values
X_categorical = [df[feature].values for feature in categorical_features]

# 3. Model definition and training
#
# NOTE: the notebook and the saved file name call this model "DeepFM", but the graph
# built below is per-feature embeddings concatenated with the numeric vector and fed
# to an MLP; no separate factorization-machine term is constructed here.
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

input_layers = []
embedding_layers = []

# Categorical branch: one scalar Input and a 4-dimensional Embedding per feature.
# The embedding vocabulary size is the number of distinct encoded values in df.
for feature in categorical_features:
    feature_input = Input(shape=(1,), name=feature)
    embedded = Embedding(
        input_dim=df[feature].nunique(),
        output_dim=4,
        name=f"{feature}_embedding",
    )(feature_input)
    input_layers.append(feature_input)
    embedding_layers.append(Flatten()(embedded))

# Numeric branch: the scaled numeric columns enter as one dense vector and are
# concatenated alongside the flattened embeddings.
numerical_input = Input(shape=(len(numerical_features),), name='numerical_input')
input_layers.append(numerical_input)
embedding_layers.append(numerical_input)

concatenated = Concatenate()(embedding_layers)

# MLP tower: Dense(64) -> Dropout -> Dense(32) -> Dropout, L2-regularized.
hidden = concatenated
for units in (64, 32):
    hidden = Dense(units, activation='relu', kernel_regularizer=l2(0.01))(hidden)
    hidden = Dropout(0.4)(hidden)

# Single sigmoid unit: probability-like score for the binary target.
output = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=input_layers, outputs=output)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop when validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train; the input list order is categorical features first, numeric matrix last,
# matching input_layers.
history = model.fit(
    X_categorical + [X_numerical],
    y,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Save the trained model in HDF5 format.
model.save('loan_deepfm_model.h5')


In [None]:
# Encode a single categorical value, tolerating values the encoder never saw.
def encode_with_unknown_handling(feature, encoder, value):
    """Return the encoder's code for ``value``, falling back to the first class.

    Args:
        feature: Name of the feature being encoded (not used by the lookup).
        encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        value: Raw category value to encode.

    Returns:
        ``encoder.transform([value])[0]`` when ``value`` is in ``encoder.classes_``;
        otherwise the code of ``encoder.classes_[0]`` is used as a substitute.
    """
    is_known = value in encoder.classes_
    # Unseen values are replaced by the encoder's first class before transforming.
    lookup_value = value if is_known else encoder.classes_[0]
    return encoder.transform([lookup_value])[0]

# Look up a loan product row by id (mirrors the savings/deposit lookup helpers
# defined in later cells of this notebook).
def get_loan_product_info(loan_id, loan_products_df):
    """Return the first row of ``loan_products_df`` whose ``loan_id`` matches.

    Raises:
        IndexError: If no row has the requested ``loan_id``.
    """
    return loan_products_df[loan_products_df['loan_id'] == loan_id].iloc[0]


# Preprocess one candidate (user, loan product) row and score it with the model.
def preprocess_and_predict(new_user_data, model, label_encoders, scaler, categorical_features, numerical_features, loan_products_df):
    """Fill in loan product columns, encode/scale the features and run the model.

    The input frame is copied first, so the caller's DataFrame is left untouched.
    Previously the encoding and scaling were written back into ``new_user_data``,
    which meant a second call on the same frame re-encoded already-encoded
    categories and re-scaled already-scaled numbers.

    Args:
        new_user_data: DataFrame of raw user features that already contains a
            ``loan_id`` column; the first row's ``loan_id`` selects the product.
        model: Object with a ``predict`` method accepting a list of arrays
            (one per categorical feature, then the numeric matrix).
        label_encoders: Mapping from feature name to a fitted encoder; features
            without an entry are passed through unchanged.
        scaler: Fitted scaler exposing ``transform``.
        categorical_features: Categorical column names, in model input order.
        numerical_features: Numeric column names, in scaler/model order.
        loan_products_df: Product table with ``loan_id``, ``interest_rate``,
            ``loan_limit``, ``loan_term_months`` and ``credit_score_requirement``.

    Returns:
        Whatever ``model.predict`` returns for the prepared inputs.

    Raises:
        IndexError: If the ``loan_id`` is not present in ``loan_products_df``.
    """
    # Work on a private copy so repeated calls always start from raw values.
    data = new_user_data.copy()

    # Pull the remaining product attributes from the product table via loan_id.
    # .iloc[0] is positional, so it works even if the frame's index is not 0-based.
    loan_info = get_loan_product_info(data['loan_id'].iloc[0], loan_products_df)
    for column in ('interest_rate', 'loan_limit', 'loan_term_months', 'credit_score_requirement'):
        data[column] = [loan_info[column]]

    # 1. Label encoding with the encoders fitted at training time.
    for feature in categorical_features:
        if feature in label_encoders:
            data[feature] = data[feature].apply(
                lambda x: encode_with_unknown_handling(feature, label_encoders[feature], x)
            )

    # 2. Scale numeric features with the training-time scaler.
    data[numerical_features] = scaler.transform(data[numerical_features])

    # 3. Assemble inputs in the order the model was built with.
    X_categorical = [data[feature].values for feature in categorical_features]
    X_numerical = data[numerical_features].values

    # 4. Predict.
    prediction = model.predict(X_categorical + [X_numerical])

    return prediction

# Score every loan product for one user and return the highest-scoring product.
def find_best_loan_product(new_user_data, model, label_encoders, scaler, categorical_features, numerical_features, loan_products_df):
    """Return the loan id with the highest predicted score for this user.

    Each product is scored on a fresh copy of ``new_user_data``. Previously the
    same frame was reused across iterations while ``preprocess_and_predict``
    encoded and scaled it in place, so every product after the first was scored
    on already-transformed user features.

    Args:
        new_user_data: Single-row DataFrame of raw user features; not modified.
        model: Trained model forwarded to ``preprocess_and_predict``.
        label_encoders: Fitted encoders keyed by feature name.
        scaler: Fitted numeric scaler.
        categorical_features: Categorical column names, in model input order.
        numerical_features: Numeric column names, in scaler/model order.
        loan_products_df: Product table containing a ``loan_id`` column.

    Returns:
        ``(best_loan_id, best_prediction)``. ``best_loan_id`` is ``None`` and
        ``best_prediction`` is ``0.0`` when no product scores above zero.
    """
    best_loan_id = None
    best_prediction = 0.0

    for loan_id in loan_products_df['loan_id'].unique():
        # Fresh raw copy per candidate so transformations never accumulate.
        candidate = new_user_data.copy()
        candidate['loan_id'] = [loan_id]

        # preprocess_and_predict looks up the product row for this loan_id and
        # fills in the product columns itself, so only loan_id is set here.
        prediction = preprocess_and_predict(candidate, model, label_encoders, scaler, categorical_features, numerical_features, loan_products_df)

        score = prediction[0][0]
        if score > best_prediction:
            best_prediction = score
            best_loan_id = loan_id

    return best_loan_id, best_prediction

# Raw (un-encoded, un-scaled) feature values for one new user, as a single-row frame.
new_user_data = pd.DataFrame([{
    'user_id': 100005,        # id of the new user
    'gender': 1,              # gender
    'region': '서울',          # region
    'occupation': '무직',      # occupation
    'late_payment': 0,        # late-payment flag
    'financial_accident': 0,  # financial-accident flag
    'age': 40,                # age
    'annual_income': 1500,    # annual income
    'debt': 300,              # debt
    'credit_score': 750,      # credit score
    'annual_spending': 1000,  # annual spending
    'num_cards': 1,           # number of cards held
    'total_deposit': 0,
    'total_savings': 500,
    'total_assets': 200,
}])

# Score every loan product for this user and report the highest-scoring one.
best_loan_id, best_prediction = find_best_loan_product(
    new_user_data,
    model,
    label_encoders,
    scaler,
    categorical_features,
    numerical_features,
    loan_products_df,
)
print(f"가장 선택할 확률이 높은 대출 상품 ID: {best_loan_id}, 선택 확률: {best_prediction}")


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import pickle

# 1. Load data
user_data_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_financial_user_data.csv'
saving_product_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_savings_products.csv'
saving_application_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_savings_applications.csv'

user_data_df = pd.read_csv(user_data_file)
saving_products_df = pd.read_csv(saving_product_file)
saving_applications_df = pd.read_csv(saving_application_file)

# 2. Preprocessing
# Start from the savings applications and attach user columns (on 'user_id') and
# savings product columns (on 'savings_id').
df = saving_applications_df.merge(user_data_df, on='user_id')
df = df.merge(saving_products_df, on='savings_id')

# Embedding-encoded columns vs. scaled numeric columns; list order is the model
# input order.
categorical_features = [
    'user_id', 'savings_id', 'gender', 'region', 'occupation', 'late_payment', 'financial_accident',
]
numerical_features = [
    'age', 'annual_income', 'debt', 'credit_score', 'annual_spending', 'num_cards',
    'savings_interest_rate', 'term_months', 'min_savings_amount', 'max_savings_amount',
]

# One fitted LabelEncoder per categorical column; columns are replaced by codes.
label_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder

# Min-max scale the numeric columns with one scaler fitted on the full frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_values

# Persist the fitted preprocessing objects together with the feature lists.
preprocessing_dict = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'categorical_features': categorical_features,
    'numerical_features': numerical_features
}
with open('savings_preprocessing.pkl', 'wb') as f:
    pickle.dump(preprocessing_dict, f)

# Model inputs and target ('savings_applied' is the response column).
y = df['savings_applied'].values
X_numerical = df[numerical_features].values
X_categorical = [df[feature].values for feature in categorical_features]

# Model definition for the savings recommender.
def build_savings_deepfm_model(input_dim, categorical_dims, embed_dim=8, num_numerical_features=None):
    """Build and compile the binary classifier used for savings products.

    The graph is one scalar input + embedding per categorical feature, plus one
    dense numeric input, all concatenated and passed through a
    Dense(64)-Dropout(0.5)-Dense(32)-Dropout(0.3) tower to a sigmoid unit.
    NOTE: despite the "DeepFM" name, no separate factorization-machine term is
    constructed in this function.

    Args:
        input_dim: Not used by the model; accepted only so existing callers
            keep working.
        categorical_dims: Vocabulary size for each categorical feature, in the
            order the categorical inputs will be fed.
        embed_dim: Output dimension of every embedding.
        num_numerical_features: Width of the numeric input. When ``None`` it
            falls back to ``len(numerical_features)`` read from module scope,
            which is what the function always did before this parameter existed.

    Returns:
        A compiled Keras ``Model`` whose inputs are the categorical inputs (in
        ``categorical_dims`` order) followed by the numeric input.
    """
    if num_numerical_features is None:
        # Backward-compatible default: depend on the notebook-level feature list.
        num_numerical_features = len(numerical_features)

    inputs = []
    embeddings = []

    # Embedding for categorical features
    for dim in categorical_dims:
        input_layer = Input(shape=(1,))
        embedding = Embedding(input_dim=dim, output_dim=embed_dim)(input_layer)
        embedding = Flatten()(embedding)
        inputs.append(input_layer)
        embeddings.append(embedding)

    # Numerical features input (joins the embeddings unchanged)
    numerical_input = Input(shape=(num_numerical_features,))
    inputs.append(numerical_input)
    embeddings.append(numerical_input)

    # Concatenate all embeddings and numerical features
    concatenated = Concatenate()(embeddings)

    # Deep part of the model
    x = Dense(64, activation='relu')(concatenated)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Vocabulary size per categorical feature (number of distinct encoded values).
categorical_dims = [df[feature].nunique() for feature in categorical_features]

# The first argument is the total number of input features. The previous expression
# used len(X_numerical), which is the number of rows in the numeric matrix rather
# than the number of numeric columns.
model = build_savings_deepfm_model(len(categorical_features) + len(numerical_features), categorical_dims)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model; inputs are the categorical arrays followed by the numeric matrix.
model.fit(X_categorical + [X_numerical], y, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Save the trained model.
model.save('savings_deepfm_model.h5')


In [None]:
# Encode one categorical value, substituting a known class for unseen values.
def encode_with_unknown_handling(feature, encoder, value):
    """Encode ``value`` with ``encoder``; unseen values use the first class.

    Args:
        feature: Feature name (accepted but not used in the lookup).
        encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        value: Raw category value.

    Returns:
        The first element of ``encoder.transform([...])`` for ``value`` when it
        is one of ``encoder.classes_``, otherwise for ``encoder.classes_[0]``.
    """
    if value not in encoder.classes_:
        # Unknown category: stand in the encoder's first class.
        value = encoder.classes_[0]
    return encoder.transform([value])[0]

# Look up a savings product row by id.
def get_saving_product_info(saving_id, saving_products_df):
    """Return the first row of ``saving_products_df`` with the given ``savings_id``.

    Raises:
        IndexError: If no row matches ``saving_id``.
    """
    is_match = saving_products_df['savings_id'] == saving_id
    matching_rows = saving_products_df.loc[is_match]
    return matching_rows.iloc[0]

# Preprocess one candidate (user, savings product) row and score it with the model.
def preprocess_and_predict(new_user_data, model, label_encoders, scaler, categorical_features, numerical_features):
    """Encode and scale the features of ``new_user_data`` and run the model.

    The input frame is copied first, so the caller's DataFrame is left untouched.
    Previously the encoded and scaled values were written back into
    ``new_user_data``; calling the function again on the same frame then
    re-encoded encoded categories and re-scaled scaled numbers.

    Args:
        new_user_data: DataFrame holding every column named in
            ``categorical_features`` and ``numerical_features`` as raw values.
        model: Object with a ``predict`` method accepting a list of arrays
            (one per categorical feature, then the numeric matrix).
        label_encoders: Mapping from feature name to a fitted encoder; features
            without an entry are passed through unchanged.
        scaler: Fitted scaler exposing ``transform``.
        categorical_features: Categorical column names, in model input order.
        numerical_features: Numeric column names, in scaler/model order.

    Returns:
        Whatever ``model.predict`` returns for the prepared inputs.
    """
    # Work on a private copy so repeated calls always start from raw values.
    data = new_user_data.copy()

    # 1. Label encoding with the encoders fitted at training time.
    for feature in categorical_features:
        if feature in label_encoders:
            data[feature] = data[feature].apply(
                lambda x: encode_with_unknown_handling(feature, label_encoders[feature], x)
            )

    # 2. Scale numeric features with the training-time scaler.
    data[numerical_features] = scaler.transform(data[numerical_features])

    # 3. Assemble inputs in the order the model was built with.
    X_categorical = [data[feature].values for feature in categorical_features]
    X_numerical = data[numerical_features].values

    # 4. Predict.
    prediction = model.predict(X_categorical + [X_numerical])

    return prediction

# Score every savings product for one user and return the highest-scoring product.
def recommend_saving_product(new_user_data, model, label_encoders, scaler, categorical_features, numerical_features, saving_products_df):
    """Return the savings product id with the highest predicted score.

    Each product is scored on a fresh copy of ``new_user_data``. Previously one
    frame was reused for every product while ``preprocess_and_predict`` encoded
    and scaled it in place, so products after the first were scored on
    already-transformed user features.

    Args:
        new_user_data: Single-row DataFrame of raw user features; not modified.
        model: Trained model forwarded to ``preprocess_and_predict``.
        label_encoders: Fitted encoders keyed by feature name.
        scaler: Fitted numeric scaler.
        categorical_features: Categorical column names, in model input order.
        numerical_features: Numeric column names, in scaler/model order.
        saving_products_df: Product table with ``savings_id``,
            ``savings_interest_rate``, ``term_months``, ``min_savings_amount``
            and ``max_savings_amount`` columns.

    Returns:
        ``(best_saving_id, best_prediction)``. ``best_saving_id`` is ``None`` and
        ``best_prediction`` is ``0.0`` when no product scores above zero.
    """
    product_columns = ('savings_interest_rate', 'term_months', 'min_savings_amount', 'max_savings_amount')
    best_saving_id = None
    best_prediction = 0.0

    for saving_id in saving_products_df['savings_id'].unique():
        saving_info = get_saving_product_info(saving_id, saving_products_df)

        # Fresh raw copy per candidate so transformations never accumulate.
        candidate = new_user_data.copy()
        candidate['savings_id'] = [saving_id]
        for column in product_columns:
            candidate[column] = [saving_info[column]]

        prediction = preprocess_and_predict(candidate, model, label_encoders, scaler, categorical_features, numerical_features)

        score = prediction[0][0]
        if score > best_prediction:
            best_prediction = score
            best_saving_id = saving_id

    return best_saving_id, best_prediction

# Raw (un-encoded, un-scaled) feature values for one new user, as a single-row frame.
new_user_data = pd.DataFrame([{
    'user_id': 100001,            # id of the new user
    'gender': 1,                  # gender
    'region': '서울',              # region
    'occupation': '대기업 직원',    # occupation
    'late_payment': 0,            # late-payment flag
    'financial_accident': 0,      # financial-accident flag
    'age': 42,                    # age
    'annual_income': 4000,        # annual income
    'debt': 2000,                 # debt
    'credit_score': 750,          # credit score
    'annual_spending': 2000,      # annual spending
    'num_cards': 2,               # number of cards held
    'min_balance': 100,           # minimum balance
}])

# Score every savings product for this user and report the highest-scoring one.
best_saving_id, best_prediction = recommend_saving_product(
    new_user_data,
    model,
    label_encoders,
    scaler,
    categorical_features,
    numerical_features,
    saving_products_df,
)
print(f"가장 추천할 예적금 상품 ID: {best_saving_id}, 선택 확률: {best_prediction}")


In [None]:
import pickle

# 1. Load data
user_data_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_financial_user_data.csv'
deposit_product_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_deposit_products.csv'
deposit_application_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_deposit_applications.csv'

user_data_df = pd.read_csv(user_data_file)
deposit_products_df = pd.read_csv(deposit_product_file)
deposit_applications_df = pd.read_csv(deposit_application_file)

# 2. Preprocessing
# Start from the deposit applications and attach user columns (on 'user_id') and
# deposit product columns (on 'deposit_id').
df = deposit_applications_df.merge(user_data_df, on='user_id')
df = df.merge(deposit_products_df, on='deposit_id')

# Embedding-encoded columns vs. scaled numeric columns; list order is the model
# input order.
categorical_features = [
    'user_id', 'deposit_id', 'gender', 'region', 'occupation', 'late_payment', 'financial_accident',
]
numerical_features = [
    'age', 'annual_income', 'debt', 'credit_score', 'annual_spending', 'num_cards',
    'deposit_interest_rate', 'term_months', 'min_deposit_amount', 'max_deposit_amount',
]

# One fitted LabelEncoder per categorical column; columns are replaced by codes.
label_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder

# Min-max scale the numeric columns with one scaler fitted on the full frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_values

# Model inputs and target ('deposit_applied' is the response column).
y = df['deposit_applied'].values
X_numerical = df[numerical_features].values
X_categorical = [df[feature].values for feature in categorical_features]

# Persist the fitted LabelEncoders and scaler, with the feature lists, via pickle.
preprocessing_dict = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'categorical_features': categorical_features,
    'numerical_features': numerical_features
}
with open('deposit_preprocessing.pkl', 'wb') as f:
    pickle.dump(preprocessing_dict, f)

# Model definition for the deposit recommender.
def build_deposit_deepfm_model(input_dim, categorical_dims, embed_dim=8, num_numerical_features=None):
    """Build and compile the binary classifier used for deposit products.

    The graph is one scalar input + embedding per categorical feature, plus one
    dense numeric input, all concatenated and passed through a
    Dense(64)-Dropout(0.5)-Dense(32)-Dropout(0.3) tower to a sigmoid unit.
    NOTE: despite the "DeepFM" name, no separate factorization-machine term is
    constructed in this function.

    Args:
        input_dim: Not used by the model; accepted only so existing callers
            keep working.
        categorical_dims: Vocabulary size for each categorical feature, in the
            order the categorical inputs will be fed.
        embed_dim: Output dimension of every embedding.
        num_numerical_features: Width of the numeric input. When ``None`` it
            falls back to ``len(numerical_features)`` read from module scope,
            which is what the function always did before this parameter existed.

    Returns:
        A compiled Keras ``Model`` whose inputs are the categorical inputs (in
        ``categorical_dims`` order) followed by the numeric input.
    """
    if num_numerical_features is None:
        # Backward-compatible default: depend on the notebook-level feature list.
        num_numerical_features = len(numerical_features)

    inputs = []
    embeddings = []

    # Embedding for categorical features
    for dim in categorical_dims:
        input_layer = Input(shape=(1,))
        embedding = Embedding(input_dim=dim, output_dim=embed_dim)(input_layer)
        embedding = Flatten()(embedding)
        inputs.append(input_layer)
        embeddings.append(embedding)

    # Numerical features input (joins the embeddings unchanged)
    numerical_input = Input(shape=(num_numerical_features,))
    inputs.append(numerical_input)
    embeddings.append(numerical_input)

    # Concatenate all embeddings and numerical features
    concatenated = Concatenate()(embeddings)

    # Deep part of the model
    x = Dense(64, activation='relu')(concatenated)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Vocabulary size per categorical feature (number of distinct encoded values).
categorical_dims = [df[feature].nunique() for feature in categorical_features]

# The first argument is the total number of input features. The previous expression
# used len(X_numerical), which is the number of rows in the numeric matrix rather
# than the number of numeric columns.
model = build_deposit_deepfm_model(len(categorical_features) + len(numerical_features), categorical_dims)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model; inputs are the categorical arrays followed by the numeric matrix.
model.fit(X_categorical + [X_numerical], y, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Save the trained model.
model.save('deposit_deepfm_model.h5')


In [None]:
# Reload the user table from Drive into user_data_df (e.g. for inspection).
user_data_file = '/content/drive/MyDrive/shinhan_data/realistic_korean_financial_user_data.csv'
user_data_df = pd.read_csv(user_data_file)

In [None]:
# Preview the first rows of the user table. The frame loaded in this notebook is
# named user_data_df; `user_data` is never defined and raised a NameError.
user_data_df.head()

In [None]:
import pandas as pd

# Encode one categorical value; values outside the fitted classes use a fallback.
def encode_with_unknown_handling(feature, encoder, value):
    """Encode ``value`` with ``encoder``, mapping unseen values to the first class.

    Args:
        feature: Feature name (accepted but not used in the lookup).
        encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        value: Raw category value.

    Returns:
        The first element of ``encoder.transform`` applied to ``[value]`` when
        ``value`` is in ``encoder.classes_``, else to ``[encoder.classes_[0]]``.
    """
    # Unseen values are swapped for the encoder's first class before encoding.
    to_encode = [value] if value in encoder.classes_ else [encoder.classes_[0]]
    encoded = encoder.transform(to_encode)
    return encoded[0]

# Look up a deposit product row by id.
def get_deposit_product_info(deposit_id, deposit_products_df):
    """Return the first row of ``deposit_products_df`` with the given ``deposit_id``.

    Raises:
        IndexError: If no row matches ``deposit_id``.
    """
    id_matches = deposit_products_df['deposit_id'].eq(deposit_id)
    return deposit_products_df.loc[id_matches].iloc[0]

# Preprocess one candidate (user, deposit product) row and score it with the model.
def preprocess_and_predict(new_user_data, model, label_encoders, scaler, categorical_features, numerical_features):
    """Encode and scale the features of ``new_user_data`` and run the model.

    The input frame is copied first, so the caller's DataFrame is left untouched.
    Previously the encoded and scaled values were written back into
    ``new_user_data``; calling the function again on the same frame then
    re-encoded encoded categories and re-scaled scaled numbers.

    Args:
        new_user_data: DataFrame holding every column named in
            ``categorical_features`` and ``numerical_features`` as raw values.
        model: Object with a ``predict`` method accepting a list of arrays
            (one per categorical feature, then the numeric matrix).
        label_encoders: Mapping from feature name to a fitted encoder; features
            without an entry are passed through unchanged.
        scaler: Fitted scaler exposing ``transform``.
        categorical_features: Categorical column names, in model input order.
        numerical_features: Numeric column names, in scaler/model order.

    Returns:
        Whatever ``model.predict`` returns for the prepared inputs.
    """
    # Work on a private copy so repeated calls always start from raw values.
    data = new_user_data.copy()

    # 1. Label encoding with the encoders fitted at training time.
    for feature in categorical_features:
        if feature in label_encoders:
            data[feature] = data[feature].apply(
                lambda x: encode_with_unknown_handling(feature, label_encoders[feature], x)
            )

    # 2. Scale numeric features with the training-time scaler.
    data[numerical_features] = scaler.transform(data[numerical_features])

    # 3. Assemble inputs in the order the model was built with.
    X_categorical = [data[feature].values for feature in categorical_features]
    X_numerical = data[numerical_features].values

    # 4. Predict.
    prediction = model.predict(X_categorical + [X_numerical])

    return prediction

# Score every deposit product for one user and return the top-N by predicted score.
def recommend_top_n_deposit_products(new_user_data, model, label_encoders, scaler, categorical_features, numerical_features, deposit_products_df, top_n=5):
    """Return the ``top_n`` deposit products ranked by predicted score.

    Each product is scored on a fresh copy of ``new_user_data``. Previously one
    frame was reused for every product while ``preprocess_and_predict`` encoded
    and scaled it in place, so products after the first were scored on
    already-transformed user features.

    Args:
        new_user_data: Single-row DataFrame of raw user features; not modified.
        model: Trained model forwarded to ``preprocess_and_predict``.
        label_encoders: Fitted encoders keyed by feature name.
        scaler: Fitted numeric scaler.
        categorical_features: Categorical column names, in model input order.
        numerical_features: Numeric column names, in scaler/model order.
        deposit_products_df: Product table with ``deposit_id``,
            ``deposit_interest_rate``, ``term_months``, ``min_deposit_amount``
            and ``max_deposit_amount`` columns.
        top_n: Maximum number of products to return.

    Returns:
        A list of ``(deposit_id, score)`` tuples sorted by score, highest first,
        with at most ``top_n`` entries.
    """
    product_columns = ('deposit_interest_rate', 'term_months', 'min_deposit_amount', 'max_deposit_amount')
    predictions = []

    for deposit_id in deposit_products_df['deposit_id'].unique():
        deposit_info = get_deposit_product_info(deposit_id, deposit_products_df)

        # Fresh raw copy per candidate so transformations never accumulate.
        candidate = new_user_data.copy()
        candidate['deposit_id'] = [deposit_id]
        for column in product_columns:
            candidate[column] = [deposit_info[column]]

        prediction = preprocess_and_predict(candidate, model, label_encoders, scaler, categorical_features, numerical_features)
        predictions.append((deposit_id, prediction[0][0]))

    # Rank by predicted score and keep the best top_n.
    top_n_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]

    return top_n_predictions

# Raw (un-encoded, un-scaled) feature values for one new user, as a single-row frame.
new_user_data = pd.DataFrame([{
    'user_id': 100001,            # id of the new user
    'gender': 0,                  # gender
    'region': '서울',              # region
    'occupation': '대기업 직원',    # occupation
    'late_payment': 1,            # late-payment flag
    'financial_accident': 0,      # financial-accident flag
    'age': 24,                    # age
    'annual_income': 3500,        # annual income
    'debt': 800,                  # debt
    'credit_score': 800,          # credit score
    'annual_spending': 1000,      # annual spending
    'num_cards': 2,               # number of cards held
}])

# Recommend the top 5 deposit products for this user.
top_5_deposit_products = recommend_top_n_deposit_products(
    new_user_data,
    model,
    label_encoders,
    scaler,
    categorical_features,
    numerical_features,
    deposit_products_df,
    top_n=5,
)
print("Top 5 추천 예금 상품 및 확률:")
for deposit_id, prediction in top_5_deposit_products:
    print(f"예금 상품 ID: {deposit_id}, 선택 확률: {prediction}")
