# German Credit Data - Assessment Preparation
This notebook contains structured questions with answer statements and code examples for preparing an assessment.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from scipy.stats import ttest_ind

# Read the German credit CSV from the current working directory into `data`,
# then show the first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='german_credit_data.csv')
data.head(5)

## ✅ 1️⃣ Data Exploration & Understanding

### 1.1 How many rows and columns are in the dataset?

In [ ]:
# DataFrame.shape is a (number_of_rows, number_of_columns) tuple.
rows = data.shape[0]
columns = data.shape[1]
print(f'The dataset contains {rows} rows and {columns} columns.')

### 2.2 What are the column names and their data types?

In [ ]:
# One entry per column: the column name and the dtype pandas assigned to it.
column_dtypes = data.dtypes
print(column_dtypes)

### 3.3 Are there any missing values in the dataset?

In [ ]:
# Count missing entries per column (isna is the same check as isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)

### 4.4 Show basic statistics (mean, min, max, std) for numeric columns.

In [ ]:
# describe() summarises the numeric columns (count, mean, std, min, quartiles, max).
summary_stats = data.describe()
print(summary_stats)

### 5.5 How is the target variable Credit risk distributed? (Good vs Bad)

In [ ]:
# Number of rows for each distinct value of the target column.
risk_counts = data['Credit risk'].value_counts()
print(risk_counts)

### 6.6 What do the categorical variables represent? Give examples of their values.

In [ ]:
# Columns stored with the object dtype are treated as the categorical ones.
object_frame = data.select_dtypes(include='object')
categorical_columns = object_frame.columns

# For each categorical column, print up to five of its distinct values.
for col in categorical_columns:
    print(f"{col}: {data[col].unique()[:5]}")

## ✅ 2️⃣ Data Cleaning & Preprocessing

### 7.1 How would you handle missing values (if any)?

In [ ]:
# Example: drop every row that contains at least one missing value.
# .copy() gives data_cleaned its own storage: the cells below assign new and
# re-encoded columns into it, and without an explicit copy pandas may emit
# SettingWithCopyWarning for those assignments.
data_cleaned = data.dropna().copy()
print(f'After dropping missing rows: {data_cleaned.shape}')

### 8.2 How do you encode categorical variables (e.g., one-hot encoding, label encoding)?

In [ ]:
# Replace each categorical column with integer codes and remember the fitted
# encoder per column in `label_encoders`, so the original labels stay recoverable.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder().fit(data_cleaned[col])  # fit() returns the encoder itself
    label_encoders[col] = le
    data_cleaned[col] = le.transform(data_cleaned[col])

data_cleaned.head()

### 9.3 How do you convert categorical columns like 'Present employment since' into numeric features?

In [ ]:
# Label-encode the employment-duration column when it is present.
# NOTE(review): if this column was part of `categorical_columns`, the loop in the
# previous cell has already turned it into integer codes, and this step then
# re-encodes those codes — confirm whether the cell is still needed.
employment_col = 'Present employment since'
if employment_col in data_cleaned.columns:
    le = LabelEncoder()
    le.fit(data_cleaned[employment_col])
    data_cleaned[employment_col] = le.transform(data_cleaned[employment_col])

### 10.4 Normalize or scale numeric columns — when and why?

In [ ]:
# Standardize the three numeric columns with StandardScaler.
# The scaled values overwrite the originals in data_cleaned, so from here on
# these columns hold standardized values rather than years / amounts / months.
numeric_cols = ['Age in years', 'Credit amount', 'Duration in month']
scaler = StandardScaler()
scaler.fit(data_cleaned[numeric_cols])
data_cleaned[numeric_cols] = scaler.transform(data_cleaned[numeric_cols])

### 11.5 How would you detect and handle outliers in numeric columns like Credit amount or Duration in month?

In [ ]:
# Example: remove outliers beyond 3 standard deviations.
# The numeric columns were standardized in the previous cell, so "within three
# standard deviations" is simply |value| <= 3. A row is kept only when that
# holds for every numeric column.
within_three_sigma = (data_cleaned[numeric_cols].abs() <= 3).all(axis=1)

# .copy() detaches the filtered frame from its parent so that the columns added
# in later cells do not trigger SettingWithCopyWarning.
data_cleaned = data_cleaned[within_three_sigma].copy()

## ✅ 3️⃣ Feature Engineering

### 12.1 Create an Age Group feature (e.g., <30, 30-50, >50).

In [ ]:
# 'Age in years' in data_cleaned was standardized earlier, so binning it with
# year-based edges would put almost every row in the wrong bin or in NaN.
# Bin the raw ages from `data` instead, restricted to the rows that survived
# cleaning (data_cleaned keeps the original row labels).
raw_age = data.loc[data_cleaned.index, 'Age in years']

# pd.cut intervals are right-inclusive by default: (0, 30], (30, 50], (50, 100].
data_cleaned['Age Group'] = pd.cut(raw_age, bins=[0, 30, 50, 100], labels=['<30', '30-50', '>50'])

### 13.2 Create a binary flag column: High Credit Amount (>10,000).

In [ ]:
# Flag loans above 10,000 (1 = high, 0 = otherwise).
# 'Credit amount' in data_cleaned is standardized, so compare the raw amounts
# from `data` for the same rows rather than pushing the threshold through the
# scaler with placeholder values for the other two columns.
raw_credit_amount = data.loc[data_cleaned.index, 'Credit amount']
data_cleaned['High Credit Amount'] = (raw_credit_amount > 10000).astype(int)

### 14.3 Aggregate features: Group by Age Group and compute average Credit amount.

In [ ]:
# Average credit amount per age group, reported in the original currency units.
# data_cleaned['Credit amount'] holds standardized values, so the raw amounts
# for the same rows are taken from `data` and grouped by the Age Group labels.
raw_credit_amount = data.loc[data_cleaned.index, 'Credit amount']

# observed=False keeps every Age Group category in the result, including empty ones.
age_group_avg = raw_credit_amount.groupby(data_cleaned['Age Group'], observed=False).mean()
print(age_group_avg)

Group by Credit risk and compute average age.

In [ ]:
# Average age per credit-risk class, reported in years.
# data_cleaned['Age in years'] holds standardized values, so the raw ages for
# the same rows are taken from `data` and grouped by the cleaned target column.
raw_age = data.loc[data_cleaned.index, 'Age in years']
credit_risk_avg_age = raw_age.groupby(data_cleaned['Credit risk']).mean()
print(credit_risk_avg_age)

## ✅ 4️⃣ Data Filtering & Subsetting

### 15.1 Select all records where Credit risk == 2 (Bad credit).

In [ ]:
# Boolean mask of the rows whose target equals 2, then the matching records.
is_bad_risk = data_cleaned['Credit risk'].eq(2)
bad_credit = data_cleaned.loc[is_bad_risk]
bad_credit.head()

### 16.2 Select records where Foreign worker == 'yes' and Credit risk == 2.

In [ ]:
# Records with Credit risk == 2 restricted to foreign workers.
# NOTE(review): if 'Foreign worker' was label-encoded in the encoding cell, the
# code 1 is assumed to stand for 'yes' — confirm against
# label_encoders['Foreign worker'].classes_.
bad_risk_mask = data_cleaned['Credit risk'] == 2
if 'Foreign worker' in data_cleaned.columns:
    subset = data_cleaned[(data_cleaned['Foreign worker'] == 1) & bad_risk_mask].copy()
else:
    # Later cells use `subset` unconditionally, so it must exist even when the
    # column is absent; fall back to the bad-credit records alone.
    print("Column 'Foreign worker' not found; using all Credit risk == 2 records.")
    subset = data_cleaned[bad_risk_mask].copy()

# Kept at top level so the notebook displays the preview.
subset.head()

### 17.3 Reset index after filtering.

In [ ]:
# Renumber the filtered rows 0..n-1 and discard the old row labels.
# Rebinding the result avoids mutating a frame that came from a boolean filter
# in place.
subset = subset.reset_index(drop=True)
subset.head()

## ✅ 5️⃣ Statistical Analysis

### 18.1 Compute correlation between numeric features (using .corr()).

In [ ]:
# Pairwise correlation between the numeric columns.
numeric_corr = data_cleaned[numeric_cols].corr()
print(numeric_corr)

### 19.2 Relationship between Age and Credit amount.

In [ ]:
# Scatter of age against credit amount, one point per record.
# Both columns hold the standardized values produced by the scaling cell.
sns.scatterplot(data=data_cleaned, x='Age in years', y='Credit amount')
plt.show()

### 20.3 Are any features highly correlated?

In [ ]:
# Correlation matrix of the numeric columns; inspect the off-diagonal entries
# to judge whether any pair is highly correlated.
numeric_frame = data_cleaned.loc[:, numeric_cols]
corr_matrix = numeric_frame.corr(method='pearson')  # pearson is the default method
print(corr_matrix)

### 21.4 Perform hypothesis testing (t-test: average credit amount between good and bad credit risk).

In [ ]:
# Two-sample t-test: does the mean credit amount differ between the two
# credit-risk classes (1 vs 2)?
risk_labels = data_cleaned['Credit risk']
credit_amounts = data_cleaned['Credit amount']

good_credit = credit_amounts[risk_labels == 1]
bad_credit = credit_amounts[risk_labels == 2]

# ttest_ind is called with its default settings.
stat, p_value = ttest_ind(good_credit, bad_credit)
print(f'T-test statistic: {stat}, p-value: {p_value}')

## ✅ 6️⃣ Grouping & Aggregation

### 22.1 Group by Age Group and Credit risk and count instances.

In [ ]:
# Number of records for every (Age Group, Credit risk) combination.
grouped_by_age_and_risk = data_cleaned.groupby(by=['Age Group', 'Credit risk'])
group_counts = grouped_by_age_and_risk.size()
print(group_counts)

### 23.2 Average Credit amount per Purpose category.

In [ ]:
# Average credit amount per Purpose category, reported in the original units.
# data_cleaned['Credit amount'] holds standardized values, so the raw amounts
# for the same rows are taken from `data` and grouped by the cleaned Purpose column.
if 'Purpose' in data_cleaned.columns:
    raw_credit_amount = data.loc[data_cleaned.index, 'Credit amount']
    avg_credit_purpose = raw_credit_amount.groupby(data_cleaned['Purpose']).mean()
    print(avg_credit_purpose)

### 24.3 Distribution of Housing types for each Credit risk.

In [ ]:
# Cross-tabulate Housing against Credit risk: one row per risk class,
# one column per housing value, cells holding record counts.
if 'Housing' in data_cleaned.columns:
    housing_counts = data_cleaned.groupby(['Credit risk', 'Housing']).size()
    housing_dist = housing_counts.unstack()
    print(housing_dist)

## ✅ 7️⃣ Data Visualization

### 25.1 Plot the distribution of Credit amount.

In [ ]:
# Histogram of Credit amount (30 bins) with a kernel-density curve overlaid.
sns.histplot(data=data_cleaned, x='Credit amount', bins=30, kde=True)
plt.show()

### 26.2 Visualize the count of Credit risk using a bar chart.

In [ ]:
# Bar chart with one bar per Credit risk value; bar height is the record count.
sns.countplot(data=data_cleaned, x='Credit risk')
plt.show()

### 27.3 Plot a boxplot of Credit amount by Credit risk.

In [ ]:
# Box plot of Credit amount, drawn separately for each Credit risk value.
sns.boxplot(data=data_cleaned, x='Credit risk', y='Credit amount')
plt.show()

### 28.4 Plot the correlation heatmap of numeric features.

In [ ]:
# Heatmap of the numeric-column correlation matrix; annot=True writes each
# coefficient inside its cell.
heatmap_values = data_cleaned[numeric_cols].corr()
sns.heatmap(heatmap_values, annot=True)
plt.show()

## ✅ 8️⃣ Machine Learning Related

### 29.1 How do you split the data into train/test sets?

In [ ]:
# Features are every column except the target.
feature_frame = data_cleaned.drop(columns='Credit risk')

# 'Age Group' is a categorical column with string labels, which LogisticRegression
# cannot convert to numbers. get_dummies one-hot encodes any remaining
# object/category columns and leaves numeric columns untouched; drop_first
# avoids a redundant indicator column per encoded feature.
X = pd.get_dummies(feature_frame, drop_first=True, dtype=int)
y = data_cleaned['Credit risk']

# Hold out 20% for testing. stratify=y keeps the class proportions of the target
# the same in both parts; random_state makes the split reproducible.
# NOTE(review): the scaler was fitted on all rows before this split, so test-set
# statistics leaked into the scaling — acceptable for a demo, not for a real evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### 30.2 How do you handle categorical variables before modeling?

In [ ]:
# Already encoded earlier using LabelEncoder: the columns listed in
# `categorical_columns` were replaced by integer codes, and the fitted encoder
# for each column is kept in `label_encoders`.
# NOTE(review): 'Age Group' was created after that step and is still
# categorical — confirm it is encoded or dropped before fitting a model.

### 31.3 Build a simple logistic regression model to predict Credit risk.

In [ ]:
# Fit a logistic regression on the training split (max_iter=1000) and predict
# class labels for the test split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)  # fit() returns the model
y_pred = model.predict(X_test)

### 32.4 How do you evaluate the model performance?

In [ ]:
# Per-class precision / recall / F1 for the predicted labels.
report_text = classification_report(y_test, y_pred)
print(report_text)

# ROC AUC is computed from the second column of predict_proba.
positive_class_scores = model.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, positive_class_scores))

## ✅ 9️⃣ Advanced Insights

### 33.1 What does the cost matrix imply and how would you use it in model evaluation?

In [ ]:
# Printed explanation of what the cost matrix implies for evaluation.
cost_matrix_note = "Cost matrix penalizes false negatives more than false positives in credit risk assessment because misclassifying bad credit as good is costly."

print(cost_matrix_note)

### 34.2 Why is it more costly to classify a bad credit as good than vice versa?

In [ ]:
# Printed explanation of why the bad-as-good error is the costlier one.
false_negative_note = "False negatives result in financial loss, so the model must minimize them aggressively."
print(false_negative_note)

### 35.3 How would you handle imbalanced classes in this dataset?

In [ ]:
# Example techniques:
# Printed list of common techniques for imbalanced classes.
imbalance_note = "Use SMOTE, class weights, or downsampling majority class."
print(imbalance_note)

## ✅ 1️⃣0️⃣ File Handling

### 36.1 How do you save filtered subsets to a CSV file?

In [ ]:
# Write the filtered subset to a CSV in the working directory; index=False
# leaves the row labels out of the file.
subset.to_csv(path_or_buf='bad_credit_subset.csv', index=False)

### 37.2 How do you read the saved CSV back into a DataFrame?

In [ ]:
# Read the saved subset back into a new DataFrame and preview its first five rows.
loaded_subset = pd.read_csv(filepath_or_buffer='bad_credit_subset.csv')
loaded_subset.head(5)