In [3]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

In [4]:
# Load the German Credit dataset from CSV.
# The location can be overridden with the GERMAN_CREDIT_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset or empty, the original local path is used unchanged.
file_path = os.environ.get('GERMAN_CREDIT_CSV') or 'C:/Users/avi11/Downloads/german_credit.csv'
data = pd.read_csv(file_path)

In [5]:
# Feature Generation: add a column holding credit amount divided by credit
# duration (in months), element-wise per row.
# NOTE(review): a duration of 0 would presumably yield inf/NaN rather than
# raise — confirm the dataset contains no zero durations.
data['Credit_to_Duration_Ratio'] = data['Credit Amount'].div(
    data['Duration of Credit (month)']
)

In [6]:
# Target is the 'Creditability' column; every remaining column of `data`
# (including the newly generated ratio feature) is used as a feature.
y = data['Creditability']
X = data.drop(columns='Creditability')

In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Feature Selection, step 1: fit a 100-tree random forest on the training
# split. Its feature importances are printed next, and the same estimator is
# handed to SelectFromModel afterwards.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

In [9]:
# Pair each feature name with its importance from the fitted forest and
# print the table ordered from most to least important.
feature_importances = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': clf.feature_importances_,
    }
)
print("Feature Importances:")
print(feature_importances.sort_values('Importance', ascending=False))



Feature Importances:
                              Feature  Importance
4                       Credit Amount    0.119195
20           Credit_to_Duration_Ratio    0.115142
0                     Account Balance    0.099202
1          Duration of Credit (month)    0.089488
12                        Age (years)    0.088670
2   Payment Status of Previous Credit    0.062805
3                             Purpose    0.050035
5                Value Savings/Stocks    0.047289
6        Length of current employment    0.046423
11      Most valuable available asset    0.038724
7                 Instalment per cent    0.037042
8                Sex & Marital Status    0.034403
10        Duration in Current address    0.030042
16                         Occupation    0.026176
14                  Type of apartment    0.022344
15         No of Credits at this Bank    0.020527
9                          Guarantors    0.018113
18                          Telephone    0.018042
13                 Concurrent

In [11]:
# Select the features whose importance reaches the 0.05 threshold and print
# their column names.
# NOTE(review): with the default prefit=False, fit() trains its own copy of
# the estimator rather than reusing the already-fitted clf — confirm this
# retraining is intended.
sfm = SelectFromModel(estimator=clf, threshold=0.05)  # Adjust threshold as needed
sfm.fit(X_train, y_train)
support_mask = sfm.get_support()
selected_features = X_train.columns[support_mask]
print("\nSelected Features:")
print(selected_features)




Selected Features:
Index(['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Age (years)', 'Credit_to_Duration_Ratio'],
      dtype='object')


In [10]:
# Train a second random forest on the selected features only.
# Requires the fitted selector `sfm` from the feature-selection cell; both
# splits are reduced with it, train first and then test.
X_train_selected, X_test_selected = (
    sfm.transform(split) for split in (X_train, X_test)
)

clf_selected = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf_selected.fit(X_train_selected, y_train)



In [12]:
# Score the reduced-feature model on the held-out test split and report the
# result rounded to two decimals.
accuracy = clf_selected.score(X=X_test_selected, y=y_test)
print(f"\nAccuracy with selected features: {accuracy:.2f}")


Accuracy with selected features: 0.76
