In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [28]:
data = pd.read_csv("german_credit_data_target.csv")

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [30]:
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [85]:
y = data['Risk']
X = data[['Age']]

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
# Обучение модели на реальных данных
model = LogisticRegression()
model.fit(X_train, y_train)

In [89]:
# Предсказание на тестовых данных для реальных данных
y_pred = model.predict(X_test)

In [101]:
# Оценка точности на реальных данных
accuracy = accuracy_score(y_test, y_pred)
print("Оценка точности:", accuracy)

Оценка точности: 0.705


In [31]:
# Генерация новых данных
def generate_data(n_samples):
    generated_data = pd.DataFrame(columns=data.columns)
    for col in data.columns:
        if col in ['Age', 'Job', 'Credit amount', 'Duration']:
            # Генерация случайных значений в соответствии с распределением исходных данных
            mean = data[col].mean()
            std = data[col].std()
            generated_col = np.random.normal(loc=mean, scale=std, size=n_samples)
        else:
            # Генерация случайных значений для категориальных переменных
            unique_values = data[col].unique()
            generated_col = np.random.choice(unique_values, size=n_samples)
        generated_data[col] = generated_col
    return generated_data

In [91]:
# Генерация 100000 наборов данных
generated_data = generate_data(100000)

In [92]:
generated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        100000 non-null  int64  
 1   Age               100000 non-null  float64
 2   Sex               100000 non-null  object 
 3   Job               100000 non-null  float64
 4   Housing           100000 non-null  object 
 5   Saving accounts   79983 non-null   object 
 6   Checking account  75041 non-null   object 
 7   Credit amount     100000 non-null  float64
 8   Duration          100000 non-null  float64
 9   Purpose           100000 non-null  object 
 10  Risk              100000 non-null  object 
dtypes: float64(4), int64(1), object(6)
memory usage: 8.4+ MB


In [93]:
# Вывод сгенерированных данных для проверки
generated_data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,266,17.090164,male,1.957341,rent,rich,rich,1740.408679,26.514688,business,bad
1,609,36.518813,female,2.222584,own,,,279.55983,30.575322,car,bad
2,424,34.818288,male,1.393137,own,rich,rich,1431.231805,22.223539,vacation/others,good
3,176,45.595252,male,0.675993,own,quite rich,little,7605.0221,28.170314,business,bad
4,422,41.946276,male,1.084751,rent,rich,moderate,6405.333476,24.925042,vacation/others,bad


In [95]:
# Подготовка данных
y_generated = generated_data['Risk']
X_generated = generated_data[['Age']]

In [96]:
# Разделение данных на обучающую и тестовую выборки
X_train_generated, X_test_generated, y_train_generated, y_test_generated = train_test_split(X_generated, y_generated, test_size=0.2, random_state=42)

In [97]:
# Обучение модели на сгенерированных данных
model_generated = LogisticRegression()
model_generated.fit(X_train_generated, y_train_generated)

In [98]:
# Предсказание на тестовых данных для сгенерированных данных
y_pred_generated = model_generated.predict(X_test_generated)

In [100]:
# Оценка точности на сгенерированных данных
accuracy_generated = accuracy_score(y_test_generated, y_pred_generated)
print("Оценка точности:", accuracy_generated)

Оценка точности: 0.50035
