In [221]:
# machine learning (classification)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
#import missingno as mns pip install missingno


In [222]:
df =pd.read_csv('credit_customers.csv')
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [223]:
missing_columns_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum() / len(df)) * 100
total_missing_values = pd.concat([missing_columns_values, missing_columns_per], axis =1, keys = ['missing_values', 'percentage'])
total_missing_values = total_missing_values.sort_values('percentage', ascending=False)
total_missing_values.head(20)

Unnamed: 0,missing_values,percentage
checking_status,0,0.0
duration,0,0.0
credit_history,0,0.0
purpose,0,0.0
credit_amount,0,0.0
savings_status,0,0.0
employment,0,0.0
installment_commitment,0,0.0
personal_status,0,0.0
other_parties,0,0.0


In [224]:
# next is to check the shape of the data
df.shape

(1000, 21)

In [225]:
df['class'].value_counts()

class
good    700
bad     300
Name: count, dtype: int64

In [226]:
# in classification, the margin in the class must not be too much 
# the margine is too much so we must carry out data balancing
from sklearn.utils import resample
df_good = df[df['class'] == 'good']
df_bad = df[df['class'] == 'bad']
df_sam = resample(df_bad, n_samples = 700)
df = pd.concat([df_good, df_sam], ignore_index=True)
df = df.sample(frac = 1)
df['class'].value_counts()



class
good    700
bad     700
Name: count, dtype: int64

In [None]:
# the next stage is preprocessing(convert cathegorical data to numeric)
encoder = LabelEncoder()
cathegorical_data = df.select_dtypes(include=['object', 'category'])
for x in cathegorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
41,3,18.0,3,9,3378.0,4,0,2.0,3,2,...,1,31.0,1,1,1.0,1,1.0,1,1,1
1283,1,18.0,3,6,2600.0,2,0,4.0,3,2,...,2,65.0,1,0,2.0,1,1.0,0,1,0
991,3,18.0,3,3,1533.0,2,2,4.0,2,0,...,1,43.0,1,1,1.0,3,2.0,0,1,0
557,3,9.0,3,3,2301.0,0,2,2.0,0,2,...,1,22.0,1,2,1.0,1,1.0,0,1,1
56,1,12.0,1,9,1526.0,2,3,4.0,3,2,...,2,66.0,1,0,2.0,0,1.0,0,1,1


In [228]:
# split to x and y
# x = all columns except column class, while y = only column class
x = df.drop('class', axis = 1)
y = df['class']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size= 0.2, random_state= 1)

In [None]:
model1 = LogisticRegression(solver = 'newton-cholesky')
model1.fit(xtrain,ytrain)
# if there is error, hover on the logisticregression and pick any solver parameter

In [230]:
# prediction and evaluation
pred1 = model1.predict(xtest)
print('\nClassification Report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n', precision_score(ytest, pred1))
print('\nRecall Report\n', recall_score(ytest,pred1))
print('\nF1 Score Report\n', f1_score(ytest,pred1))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred1))
# the outcome from this model is not good, note that 0 is bad and 1 is good


Classification Report
               precision    recall  f1-score   support

           0       0.65      0.68      0.66       136
           1       0.68      0.65      0.66       144

    accuracy                           0.66       280
   macro avg       0.66      0.66      0.66       280
weighted avg       0.67      0.66      0.66       280


Accuracy Report
 0.6642857142857143

Precision Report
 0.6838235294117647

Recall Report
 0.6458333333333334

F1 Score Report
 0.6642857142857143

Confusion Matrix Report
 [[93 43]
 [51 93]]


In [231]:
# using another model
model2= RandomForestClassifier()
model2.fit(xtrain,ytrain)

In [232]:
pred1 = model2.predict(xtest)
print('\nClassification Report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n', precision_score(ytest, pred1))
print('\nRecall Report\n', recall_score(ytest,pred1))
print('\nF1 Score Report\n', f1_score(ytest,pred1))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred1))
# the out come from this model is good


Classification Report
               precision    recall  f1-score   support

           0       0.88      0.99      0.93       136
           1       0.98      0.87      0.92       144

    accuracy                           0.93       280
   macro avg       0.93      0.93      0.92       280
weighted avg       0.93      0.93      0.92       280


Accuracy Report
 0.925

Precision Report
 0.984251968503937

Recall Report
 0.8680555555555556

F1 Score Report
 0.922509225092251

Confusion Matrix Report
 [[134   2]
 [ 19 125]]


In [233]:
# using another model
model3= KNeighborsClassifier()
model3.fit(xtrain,ytrain)

In [234]:
pred1 = model3.predict(xtest)
print('\nClassification Report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n', precision_score(ytest, pred1))
print('\nRecall Report\n', recall_score(ytest,pred1))
print('\nF1 Score Report\n', f1_score(ytest,pred1))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred1))
# the out come from this model is not good


Classification Report
               precision    recall  f1-score   support

           0       0.66      0.78      0.71       136
           1       0.75      0.62      0.68       144

    accuracy                           0.70       280
   macro avg       0.70      0.70      0.70       280
weighted avg       0.70      0.70      0.69       280


Accuracy Report
 0.6964285714285714

Precision Report
 0.7478991596638656

Recall Report
 0.6180555555555556

F1 Score Report
 0.6768060836501901

Confusion Matrix Report
 [[106  30]
 [ 55  89]]


In [235]:
# the next stage is deployment
# remember not to include the id number
x.iloc[[0]]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
41,3,18.0,3,9,3378.0,4,0,2.0,3,2,1.0,1,31.0,1,1,1.0,1,1.0,1,1


In [236]:
model2.predict([[	0,	12.0,	1,	9,	1860.0,	2,	4,	4.0,	3,	2,	2.0,	0,	34.0,	1,	1,	2.0,	0,	1.0,	1,	1,]])



array([1])

In [237]:
df =pd.read_csv('horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101.0,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300.0,0.0,0,no
1,yes,adult,534817.0,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208.0,0.0,0,no
2,no,adult,530334.0,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0.0,0.0,0,yes
3,yes,young,5290409.0,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208.0,0.0,0,yes
4,no,adult,530255.0,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300.0,0.0,0,no


In [238]:
missing_columns_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum() / len(df)) * 100
total_missing_values = pd.concat([missing_columns_values, missing_columns_per], axis =1, keys = ['missing_values', 'percentage'])
total_missing_values = total_missing_values.sort_values('percentage', ascending=False)
total_missing_values.head(20)

Unnamed: 0,missing_values,percentage
nasogastric_reflux_ph,246,82.274247
abdomo_protein,201,67.22408
abdomo_appearance,168,56.187291
abdomen,124,41.471572
nasogastric_reflux,109,36.454849
nasogastric_tube,107,35.785953
rectal_exam_feces,107,35.785953
peripheral_pulse,73,24.414716
rectal_temp,65,21.73913
respiratory_rate,64,21.404682


In [239]:
# fill in all missing value
numerical_data = df.select_dtypes(include= ['int', 'float'])
cathegorical_data = df.select_dtypes(include= ['object', 'category'])
for x in numerical_data:
    df[x].fillna(np.mean(df[x]), inplace = True)

for x in cathegorical_data:
        df[x].fillna(df[x].mode()[0], inplace = True)

df.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(np.mean(df[x]), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0], inplace = True)


surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [240]:
df['outcome'].value_counts()

outcome
lived         182
died           76
euthanized     41
Name: count, dtype: int64

In [241]:
# in classification, the margin in the class must not be too much 
# the margine is too much so we must carry out data balancing
from sklearn.utils import resample
df_lived = df[df['outcome'] == 'lived']
df_died = df[df['outcome'] == 'died']
df_euthanized = df[df['outcome'] == 'euthanized']
df_sam = resample(df_died, n_samples = 182)
df_sam1 = resample(df_euthanized, n_samples = 182)
df = pd.concat([df_lived, df_sam, df_sam1], ignore_index=True)
df = df.sample(frac = 1)
df['outcome'].value_counts()


outcome
died          182
euthanized    182
lived         182
Name: count, dtype: int64

In [242]:
# preprocessing

encoder = LabelEncoder()
cathegorical_data = df.select_dtypes(include=['object', 'category'])
for x in cathegorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

#run the model but there is an error which you must rectify
# y should be outcome

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
274,1,0,527933.0,36.8,60.0,28.0,1,2,3,1,...,46.291667,23.947126,1,10.0,0,1,3205.0,0.0,0,1
381,0,0,528812.0,38.18547,104.0,24.0,0,3,5,2,...,73.0,8.4,1,3.054082,1,1,7111.0,0.0,0,0
428,0,0,528006.0,38.18547,40.0,16.0,1,2,3,1,...,50.0,7.0,1,3.9,1,1,2208.0,0.0,0,1
279,1,0,529399.0,39.3,71.95539,30.455319,0,3,2,2,...,75.0,23.947126,2,4.3,0,1,2207.0,0.0,0,1
528,1,0,534624.0,38.18547,76.0,30.455319,1,3,3,1,...,46.291667,23.947126,1,3.054082,1,1,11124.0,0.0,0,0


In [243]:
# split to x and y
x = df.drop('outcome', axis = 1)
y = df['outcome']
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size= 0.2, random_state = 1)

In [244]:
# create and test the model
model1 = LogisticRegression()
model1.fit(xtrain,ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [245]:
# prediction and evaluation
pred1 = model1.predict(xtest)
print('\nClassification Report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n', precision_score(ytest, pred1))
print('\nRecall Report\n', recall_score(ytest,pred1))
print('\nF1 Score Report\n', f1_score(ytest,pred1))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred1))


Classification Report
               precision    recall  f1-score   support

           0       0.57      0.30      0.39        40
           1       0.34      0.91      0.50        33
           2       1.00      0.03      0.05        37

    accuracy                           0.39       110
   macro avg       0.64      0.41      0.31       110
weighted avg       0.65      0.39      0.31       110


Accuracy Report
 0.39090909090909093


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [167]:
# using another model
model2= RandomForestClassifier()
model2.fit(xtrain,ytrain)

In [168]:
# prediction and evaluation
pred2 = model2.predict(xtest)
print('\nClassification Report\n', classification_report(ytest,pred2))
print('\nAccuracy Report\n', accuracy_score(ytest,pred2))
print('\nPrecision Report\n', precision_score(ytest, pred2))
print('\nRecall Report\n', recall_score(ytest,pred2))
print('\nF1 Score Report\n', f1_score(ytest,pred2))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred2))


Classification Report
               precision    recall  f1-score   support

           0       0.89      0.97      0.93        34
           1       0.88      1.00      0.94        36
           2       1.00      0.80      0.89        40

    accuracy                           0.92       110
   macro avg       0.92      0.92      0.92       110
weighted avg       0.93      0.92      0.92       110


Accuracy Report
 0.9181818181818182


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [169]:
# using another model
model3= KNeighborsClassifier()
model3.fit(xtrain,ytrain)

In [170]:
# prediction and evaluation
pred3 = model3.predict(xtest)
print('\nClassification Report\n', classification_report(ytest,pred3))
print('\nAccuracy Report\n', accuracy_score(ytest,pred3))
print('\nPrecision Report\n', precision_score(ytest, pred3))
print('\nRecall Report\n', recall_score(ytest,pred3))
print('\nF1 Score Report\n', f1_score(ytest,pred3))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred3))


Classification Report
               precision    recall  f1-score   support

           0       0.78      0.85      0.82        34
           1       0.75      1.00      0.86        36
           2       0.84      0.53      0.65        40

    accuracy                           0.78       110
   macro avg       0.79      0.79      0.77       110
weighted avg       0.79      0.78      0.77       110


Accuracy Report
 0.7818181818181819


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].