# Necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, StandardScaler, normalize
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 6)
pd.set_option('display.max_columns', None)


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Classifying Loan Status Using Decision Trees
    
    Dataset:
Lending Club Loan Data
    
    Preprocessing Steps:
Handle missing values if any.

Encode categorical variables (e.g., one-hot encoding for loan grade, sub-grade, etc.).

Standardize numerical features.
   
    Task: 1
Implement a decision tree classifier to classify loan status and evaluate the model using accuracy and ROC-AUC.


In [3]:
loan_data = pd.read_csv('/content/drive/MyDrive/Datasets /BWF_Week6_Datasets/LendingClubLoan.csv')
loan_data.head()

Unnamed: 0.1,Unnamed: 0,emp_title,emp_length,state,homeownership,annual_income,verified_income,debt_to_income,annual_income_joint,verification_income_joint,debt_to_income_joint,delinq_2y,months_since_last_delinq,earliest_credit_line,inquiries_last_12m,total_credit_lines,open_credit_lines,total_credit_limit,total_credit_utilized,num_collections_last_12m,num_historical_failed_to_pay,months_since_90d_late,current_accounts_delinq,total_collection_amount_ever,current_installment_accounts,accounts_opened_24m,months_since_last_credit_inquiry,num_satisfactory_accounts,num_accounts_120d_past_due,num_accounts_30d_past_due,num_active_debit_accounts,total_debit_limit,num_total_cc_accounts,num_open_cc_accounts,num_cc_carrying_balance,num_mort_accounts,account_never_delinq_percent,tax_liens,public_record_bankrupt,loan_purpose,application_type,loan_amount,term,interest_rate,installment,grade,sub_grade,issue_month,loan_status,initial_listing_status,disbursement_method,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,1,global config engineer,3.0,NJ,MORTGAGE,90000.0,Verified,18.01,,,,0,38.0,2001,6,28,10,70795,38767,0,0,38.0,0,1250,2,5,5.0,10,0.0,0,2,11100,14,8,6,1,92.9,0,0,moving,individual,28000,60,14.07,652.53,C,C3,Mar-2018,Current,whole,Cash,27015.86,1999.33,984.14,1015.19,0.0
1,2,warehouse office clerk,10.0,HI,RENT,40000.0,Not Verified,5.04,,,,0,,1996,1,30,14,28800,4321,0,1,,0,0,0,11,8.0,14,0.0,0,3,16500,24,14,4,0,100.0,0,1,debt_consolidation,individual,5000,36,12.61,167.54,C,C1,Feb-2018,Current,whole,Cash,4651.37,499.12,348.63,150.49,0.0
2,3,assembly,3.0,WI,RENT,40000.0,Source Verified,21.15,,,,0,28.0,2006,4,31,10,24193,16000,0,0,28.0,0,432,1,13,7.0,10,0.0,0,3,4300,14,8,6,0,93.5,0,0,other,individual,2000,36,17.09,71.4,D,D1,Feb-2018,Current,fractional,Cash,1824.63,281.8,175.37,106.43,0.0
3,4,customer service,1.0,PA,RENT,30000.0,Not Verified,10.16,,,,0,,2007,0,4,4,25400,4997,0,1,,0,0,1,1,15.0,4,0.0,0,2,19400,3,3,2,0,100.0,1,0,debt_consolidation,individual,21600,36,6.72,664.19,A,A3,Jan-2018,Current,whole,Cash,18853.26,3312.89,2746.74,566.15,0.0
4,5,security supervisor,10.0,CA,RENT,35000.0,Verified,57.96,57000.0,Verified,37.66,0,,2008,7,22,16,69839,52722,0,0,,0,0,1,6,4.0,16,0.0,0,10,32700,20,15,13,0,100.0,0,0,credit_card,joint,23000,36,14.07,786.87,C,C3,Mar-2018,Current,whole,Cash,21430.15,2324.65,1569.85,754.8,0.0


In [8]:
# Printing information of dataset
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 56 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        10000 non-null  int64  
 1   emp_title                         9167 non-null   object 
 2   emp_length                        9183 non-null   float64
 3   state                             10000 non-null  object 
 4   homeownership                     10000 non-null  object 
 5   annual_income                     10000 non-null  float64
 6   verified_income                   10000 non-null  object 
 7   debt_to_income                    9976 non-null   float64
 8   annual_income_joint               1495 non-null   float64
 9   verification_income_joint         1455 non-null   object 
 10  debt_to_income_joint              1495 non-null   float64
 11  delinq_2y                         10000 non-null  int64  
 12  month

### Handling Missing Values

In [9]:
loan_data.isnull().sum()

Unnamed: 0                             0
emp_title                            833
emp_length                           817
state                                  0
homeownership                          0
annual_income                          0
verified_income                        0
debt_to_income                        24
annual_income_joint                 8505
verification_income_joint           8545
debt_to_income_joint                8505
delinq_2y                              0
months_since_last_delinq            5658
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
months_since_90d_late               7715
current_accounts_delinq                0
total_collection_amount_ever           0
current_installm

In [4]:
# Removing these columns because it has large amount of missing values.
loan_data.drop(['annual_income_joint', 'verification_income_joint', 'debt_to_income_joint', 'months_since_last_delinq', 'months_since_90d_late'], axis=1, inplace=True)

In [5]:
# mode imputation for 'object' features
loan_data.fillna({'emp_title': loan_data['emp_title'].mode()[0]}, inplace=True)

In [6]:
# Using mean imputation on 'float' features. Since remaining all features are numerical, we will use direct approach.
for feature in loan_data[['emp_length', 'debt_to_income', 'months_since_last_credit_inquiry', 'num_accounts_120d_past_due']]:
    loan_data.fillna({feature: loan_data[feature].mean()}, inplace=True)

In [7]:
# Checking for the missing values in the dataset
print('Missing values in the dataset:', loan_data.isnull().sum())

Missing values in the dataset: Unnamed: 0                          0
emp_title                           0
emp_length                          0
state                               0
homeownership                       0
annual_income                       0
verified_income                     0
debt_to_income                      0
delinq_2y                           0
earliest_credit_line                0
inquiries_last_12m                  0
total_credit_lines                  0
open_credit_lines                   0
total_credit_limit                  0
total_credit_utilized               0
num_collections_last_12m            0
num_historical_failed_to_pay        0
current_accounts_delinq             0
total_collection_amount_ever        0
current_installment_accounts        0
accounts_opened_24m                 0
months_since_last_credit_inquiry    0
num_satisfactory_accounts           0
num_accounts_120d_past_due          0
num_accounts_30d_past_due           0
num_active_debit_ac

**Now there is no missing values in the dataset after imputation now lets proceed with the next step.**

### Encoding Categorical Variables

In [8]:
# First we have to select only categorical columns/features

categorical_features = loan_data.select_dtypes(include=['object']).columns
categorical_features

Index(['emp_title', 'state', 'homeownership', 'verified_income',
       'loan_purpose', 'application_type', 'grade', 'sub_grade', 'issue_month',
       'loan_status', 'initial_listing_status', 'disbursement_method'],
      dtype='object')

In [9]:
numerical_features = loan_data.select_dtypes(include=['float64', 'int64']).columns
numerical_features

Index(['Unnamed: 0', 'emp_length', 'annual_income', 'debt_to_income',
       'delinq_2y', 'earliest_credit_line', 'inquiries_last_12m',
       'total_credit_lines', 'open_credit_lines', 'total_credit_limit',
       'total_credit_utilized', 'num_collections_last_12m',
       'num_historical_failed_to_pay', 'current_accounts_delinq',
       'total_collection_amount_ever', 'current_installment_accounts',
       'accounts_opened_24m', 'months_since_last_credit_inquiry',
       'num_satisfactory_accounts', 'num_accounts_120d_past_due',
       'num_accounts_30d_past_due', 'num_active_debit_accounts',
       'total_debit_limit', 'num_total_cc_accounts', 'num_open_cc_accounts',
       'num_cc_carrying_balance', 'num_mort_accounts',
       'account_never_delinq_percent', 'tax_liens', 'public_record_bankrupt',
       'loan_amount', 'term', 'interest_rate', 'installment', 'balance',
       'paid_total', 'paid_principal', 'paid_interest', 'paid_late_fees'],
      dtype='object')

In [12]:
# dropping column with name Unnamed because it is not making any sense in the data
numerical_features.drop('Unnamed: 0')

Index(['emp_length', 'annual_income', 'debt_to_income', 'delinq_2y',
       'earliest_credit_line', 'inquiries_last_12m', 'total_credit_lines',
       'open_credit_lines', 'total_credit_limit', 'total_credit_utilized',
       'num_collections_last_12m', 'num_historical_failed_to_pay',
       'current_accounts_delinq', 'total_collection_amount_ever',
       'current_installment_accounts', 'accounts_opened_24m',
       'months_since_last_credit_inquiry', 'num_satisfactory_accounts',
       'num_accounts_120d_past_due', 'num_accounts_30d_past_due',
       'num_active_debit_accounts', 'total_debit_limit',
       'num_total_cc_accounts', 'num_open_cc_accounts',
       'num_cc_carrying_balance', 'num_mort_accounts',
       'account_never_delinq_percent', 'tax_liens', 'public_record_bankrupt',
       'loan_amount', 'term', 'interest_rate', 'installment', 'balance',
       'paid_total', 'paid_principal', 'paid_interest', 'paid_late_fees'],
      dtype='object')

In [13]:
# Applying label encoder to the categorical features
encoder = LabelEncoder()

for feature in categorical_features:
    loan_data[feature] = encoder.fit_transform(loan_data[feature])

### Standardizing Numerical Features

In [16]:
scaler = StandardScaler()

for feature in numerical_features:
    loan_data[feature] = scaler.fit_transform(loan_data[feature].values.reshape(-1, 1))

### Model Training

In [17]:
# Separating the features and the target variable
X = loan_data.drop('loan_status', axis=1)
y = loan_data['loan_status']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

In [24]:
# Predicting the target variable
y_pred = decision_tree.predict(X_test)

### Model Evaluation

In [25]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the model:", accuracy)

Accuracy of the model: 0.9685


 Predicting Hospital Readmission Using Logistic Regression
    
    Dataset:
   
   Hospital Readmission Dataset
    
    Preprocessing Steps:
Handle missing values (e.g., fill missing values with mode for categorical variables).


Encode categorical variables (e.g., one-hot encoding for hospital type, region, etc.).


Standardize numerical features.
    
    Task: 2
Implement logistic regression to predict hospital readmission and evaluate the model using precision, recall, and F1-score.


In [5]:
hospital_data = pd.read_csv('/content/drive/MyDrive/Datasets /BWF_Week6_Datasets/HospitalReadmissions.csv')
hospital_data.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


Classifying Digit Images Using Decision Trees
    
    Dataset:
  MNIST Dataset
    
    Preprocessing Steps:
Normalize pixel values.

Reshape data as needed for model input.
    
    Task: 3
Implement a decision tree classifier to classify handwritten digits and evaluate the model using accuracy and confusion matrix.


Predicting Loan Approval Using Logistic Regression

    Dataset:
Loan Prediction Dataset

    Preprocessing Steps:
Handle missing values (e.g., fill missing values with mode for categorical variables).

Encode categorical variables (e.g., one-hot encoding for gender, married status, etc.).

Standardize numerical features.

    Task: 4
Implement logistic regression to predict loan approval and evaluate the model using accuracy and confusion matrix.


 Classifying Wine Quality Using Decision Trees

    Dataset:
Wine Quality Dataset

    Preprocessing Steps:
Handle missing values if any.

Standardize features.

Encode categorical variables if present.

    Task: 5
Implement a decision tree classifier to classify wine quality (good/bad) and evaluate the model using accuracy and ROC-AUC.
