In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

# Part 3

df = pd.read_csv('/content/sample_data/train_data_with_groups.csv')
x_train, x_test, y_train, y_test = train_test_split(df.drop(['Bankrupt?', 'Group'], axis=1), df['Group'], train_size=0.8, random_state=42)

df = df[df['Group'] == 0]
x_train, x_test, y_train, y_test = train_test_split(df.drop(['Bankrupt?', 'Group'], axis=1), df['Bankrupt?'], train_size=0.8, random_state=42)

# New models
model1 = AdaBoostClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
model2 = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
model3 = KNeighborsClassifier().fit(x_train, y_train)

model1_pred = model1.predict(x_test)
model2_pred = model2.predict(x_test)
model3_pred = model3.predict(x_test)

print('Model 1 (AdaBoost) accuracy:', accuracy_score(y_test, model1_pred))
print('Model 2 (Gradient Boosting) accuracy:', accuracy_score(y_test, model2_pred))
print('Model 3 (KNeighbors) accuracy:', accuracy_score(y_test, model3_pred))

# Printing the 5 most important features for AdaBoostClassifier
print('Top 5 Important Features for AdaBoostClassifier:')
feature_importance_model1 = list(zip(x_train.columns, model1.feature_importances_))
feature_importance_model1.sort(key=lambda x: x[1], reverse=True)
for i, (feature, importance) in enumerate(feature_importance_model1[:5]):
    print(f'{i+1}. Feature: {feature}, Importance: {importance:.4f}')

# Printing the 5 most important features for GradientBoostingClassifier
print('Top 5 Important Features for GradientBoostingClassifier:')
feature_importance_model2 = list(zip(x_train.columns, model2.feature_importances_))
feature_importance_model2.sort(key=lambda x: x[1], reverse=True)
for i, (feature, importance) in enumerate(feature_importance_model2[:5]):
    print(f'{i+1}. Feature: {feature}, Importance: {importance:.4f}')

combined_pred = np.column_stack((model1_pred, model2_pred, model3_pred))
metaModel = LogisticRegression().fit(combined_pred, y_test)
y_pred = metaModel.predict(combined_pred)
print('Meta Model Accuracy:', accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))



Model 1 (AdaBoost) accuracy: 0.9646107178968655
Model 2 (Gradient Boosting) accuracy: 0.967644084934277
Model 3 (KNeighbors) accuracy: 0.9696663296258847
Top 5 Important Features for AdaBoostClassifier:
1. Feature:  Retained Earnings to Total Assets, Importance: 0.0600
2. Feature:  ROA(B) before interest and depreciation after tax, Importance: 0.0500
3. Feature:  Operating profit per person, Importance: 0.0500
4. Feature:  Cash/Total Assets, Importance: 0.0500
5. Feature:  Cash Flow to Total Assets, Importance: 0.0500
Top 5 Important Features for GradientBoostingClassifier:
1. Feature:  Net Income to Stockholder's Equity, Importance: 0.1983
2. Feature:  Borrowing dependency, Importance: 0.1302
3. Feature:  Net Value Per Share (B), Importance: 0.0741
4. Feature:  Cash/Current Liability, Importance: 0.0417
5. Feature:  Net profit before tax/Paid-in capital, Importance: 0.0412
Meta Model Accuracy: 0.9706774519716885
[[955   2]
 [ 27   5]]


In [6]:
# Part 4

df = pd.read_csv('/content/sample_data/train_data_with_groups.csv')

# New stacking classifier with different estimators
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('ab', AdaBoostClassifier(random_state=42)),
        ('knn', KNeighborsClassifier())
    ],
    final_estimator=GradientBoostingClassifier(random_state=43),
    cv=10
)

stacking_clf.fit(x_train, y_train)

y_pred = stacking_clf.predict(x_test)
print('Stacking Classifier Accuracy:', accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Part 4 - Cross-validation with new stacking classifier
cross_val_scores = cross_val_score(stacking_clf, x_train, y_train, cv=10, scoring='accuracy')
print("Cross-Validation Scores:", cross_val_scores)


Stacking Classifier Accuracy: 0.9686552072800809
[[950   7]
 [ 24   8]]
Cross-Validation Scores: [0.96717172 0.96464646 0.95707071 0.97222222 0.97727273 0.97721519
 0.96202532 0.97468354 0.95696203 0.96962025]
