In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [49]:
df=pd.read_csv(r'training.csv')
df.head()

Unnamed: 0,Row_ID,Financial_Year,Branch_Code,Sequence_1,Sequence_2,Donation_type,Donor_Age,Donation_Date,Gender,Blood_Group_Code,Donor_Weight,Donor_Temperature,Donor_Pulse,Donor_Hemoglobin,Donor_Blood_Pressure,Test_1,Test_2,Test_3,Test_4,Y
0,202961,15,BRC-2,16281.0,15682,R,23.0,20-May-15,M,5.0,,,,,,0.132,0.192,N,N,0
1,43390,15,BRC-1,1528.0,294,T,20.0,4-Jul-14,M,5.0,82.0,37.0,72.0,14.0,120/80,0.649,0.724,N,N,0
2,102493,16,BRC-1,27502.0,25159,T,38.0,22-Feb-16,M,7.0,80.0,37.0,72.0,13.0,120/80,0.082,0.202,N,N,0
3,122646,17,BRC-1,8252.0,7796,R,25.0,29-Aug-16,M,1.0,60.0,37.0,78.0,13.0,120/80,0.144,0.183,N,N,0
4,58944,15,BRC-1,17715.0,16483,R,22.0,28-Nov-14,M,7.0,75.0,37.0,72.0,14.0,120/80,0.424,0.303,N,N,0


In [50]:
#Let's print the last 5 records of the dataset
df.tail()

Unnamed: 0,Row_ID,Financial_Year,Branch_Code,Sequence_1,Sequence_2,Donation_type,Donor_Age,Donation_Date,Gender,Blood_Group_Code,Donor_Weight,Donor_Temperature,Donor_Pulse,Donor_Hemoglobin,Donor_Blood_Pressure,Test_1,Test_2,Test_3,Test_4,Y
164594,118714,17,BRC-1,4189.0,3734,R,25.0,28-Jul-16,M,7.0,70.0,37.0,72.0,12.0,120/80,0.141,0.254,N,N,0
164595,827,14,BRC-1,78613.0,834,T,22.0,8-Jul-13,M,3.0,76.0,37.0,72.0,13.0,120/80,0.184,0.262,N,N,0
164596,21172,14,BRC-1,99461.0,21682,R,26.0,5-Dec-13,M,5.0,55.0,37.0,72.0,12.0,120/80,0.231,0.333,N,N,0
164597,16110,14,BRC-1,94215.0,16436,R,34.0,27-Oct-13,M,5.0,80.0,37.0,78.0,12.0,120/80,0.152,0.252,N,N,0
164598,202742,15,BRC-2,16059.0,15870,N,,24-May-15,M,8.0,,,,,,0.443,0.233,N,N,0


In [51]:
#Let's print the column name
df.columns

Index(['Row_ID', 'Financial_Year', 'Branch_Code', 'Sequence_1', 'Sequence_2',
       'Donation_type', 'Donor_Age', 'Donation_Date', 'Gender',
       'Blood_Group_Code', 'Donor_Weight', 'Donor_Temperature', 'Donor_Pulse',
       'Donor_Hemoglobin', 'Donor_Blood_Pressure', 'Test_1', 'Test_2',
       'Test_3', 'Test_4', 'Y'],
      dtype='object')

In [52]:
# shape of the dataset
df.shape

(164599, 20)

In [53]:
df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
164594    False
164595    False
164596    False
164597    False
164598    False
Length: 164599, dtype: bool

In [54]:
#Let's print the total number of duplicated value
df.duplicated().sum()

0

In [55]:
#Let's print the unique values in our dataset
df.nunique()

Row_ID                  164599
Financial_Year               5
Branch_Code                  6
Sequence_1               72268
Sequence_2               42989
Donation_type                7
Donor_Age                 2053
Donation_Date             1851
Gender                       2
Blood_Group_Code            13
Donor_Weight               213
Donor_Temperature           62
Donor_Pulse                 95
Donor_Hemoglobin           123
Donor_Blood_Pressure       487
Test_1                     633
Test_2                     920
Test_3                       3
Test_4                       1
Y                            2
dtype: int64

In [56]:
#Let's print the data types in our data
df.dtypes

Row_ID                    int64
Financial_Year            int64
Branch_Code              object
Sequence_1              float64
Sequence_2                int64
Donation_type            object
Donor_Age               float64
Donation_Date            object
Gender                   object
Blood_Group_Code        float64
Donor_Weight            float64
Donor_Temperature       float64
Donor_Pulse             float64
Donor_Hemoglobin        float64
Donor_Blood_Pressure     object
Test_1                  float64
Test_2                  float64
Test_3                   object
Test_4                   object
Y                         int64
dtype: object

In [57]:
#descriptive statistics summary
df.describe

<bound method NDFrame.describe of         Row_ID  Financial_Year Branch_Code  Sequence_1  Sequence_2  \
0       202961              15       BRC-2     16281.0       15682   
1        43390              15       BRC-1      1528.0         294   
2       102493              16       BRC-1     27502.0       25159   
3       122646              17       BRC-1      8252.0        7796   
4        58944              15       BRC-1     17715.0       16483   
...        ...             ...         ...         ...         ...   
164594  118714              17       BRC-1      4189.0        3734   
164595     827              14       BRC-1     78613.0         834   
164596   21172              14       BRC-1     99461.0       21682   
164597   16110              14       BRC-1     94215.0       16436   
164598  202742              15       BRC-2     16059.0       15870   

       Donation_type  Donor_Age Donation_Date Gender  Blood_Group_Code  \
0                  R       23.0     20-May-15      

In [58]:
#Let's get the overall information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164599 entries, 0 to 164598
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Row_ID                164599 non-null  int64  
 1   Financial_Year        164599 non-null  int64  
 2   Branch_Code           164599 non-null  object 
 3   Sequence_1            164599 non-null  float64
 4   Sequence_2            164599 non-null  int64  
 5   Donation_type         164599 non-null  object 
 6   Donor_Age             152553 non-null  float64
 7   Donation_Date         164599 non-null  object 
 8   Gender                164591 non-null  object 
 9   Blood_Group_Code      164242 non-null  float64
 10  Donor_Weight          140303 non-null  float64
 11  Donor_Temperature     140300 non-null  float64
 12  Donor_Pulse           140300 non-null  float64
 13  Donor_Hemoglobin      140300 non-null  float64
 14  Donor_Blood_Pressure  140299 non-null  object 
 15  

In [59]:
#Let's check the total number of null values present in our dataset
df.isnull().sum()

Row_ID                      0
Financial_Year              0
Branch_Code                 0
Sequence_1                  0
Sequence_2                  0
Donation_type               0
Donor_Age               12046
Donation_Date               0
Gender                      8
Blood_Group_Code          357
Donor_Weight            24296
Donor_Temperature       24299
Donor_Pulse             24299
Donor_Hemoglobin        24299
Donor_Blood_Pressure    24300
Test_1                     57
Test_2                     24
Test_3                      8
Test_4                      8
Y                           0
dtype: int64

In [60]:
df.isnull().sum().sum()

134001

In [61]:
for column in df.columns:
    # Check if the column is numeric
    if df[column].dtype in ['float64', 'int64']:
        df[column].fillna(df[column].mean(), inplace=True)
    else:
        # For non-numeric columns, use mode
        df[column].fillna(df[column].mode()[0], inplace=True)

In [62]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
label_encoder = LabelEncoder()

# Applying label encoding to each column of type 'object'
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

df.head()

Unnamed: 0,Row_ID,Financial_Year,Branch_Code,Sequence_1,Sequence_2,Donation_type,Donor_Age,Donation_Date,Gender,Blood_Group_Code,Donor_Weight,Donor_Temperature,Donor_Pulse,Donor_Hemoglobin,Donor_Blood_Pressure,Test_1,Test_2,Test_3,Test_4,Y
0,202961,15,1,16281.0,15682,5,23.0,771,1,5.0,73.724061,38.045429,74.586521,13.332521,184,0.132,0.192,0,0,0
1,43390,15,0,1528.0,294,6,20.0,1508,1,5.0,82.0,37.0,72.0,14.0,184,0.649,0.724,0,0,0
2,102493,16,0,27502.0,25159,6,38.0,867,1,7.0,80.0,37.0,72.0,13.0,184,0.082,0.202,0,0,0
3,122646,17,0,8252.0,7796,5,25.0,1282,1,1.0,60.0,37.0,78.0,13.0,184,0.144,0.183,0,0,0
4,58944,15,0,17715.0,16483,5,22.0,1260,1,7.0,75.0,37.0,72.0,14.0,184,0.424,0.303,0,0,0


In [63]:
df.drop(['Row_ID'],axis=1,inplace=True)

In [64]:
df.isnull().sum().sum()

0

In [65]:
df["Y"].value_counts()

Y
0    161740
1      2859
Name: count, dtype: int64

In [66]:
x=df.drop('Y',axis=1)
x

Unnamed: 0,Financial_Year,Branch_Code,Sequence_1,Sequence_2,Donation_type,Donor_Age,Donation_Date,Gender,Blood_Group_Code,Donor_Weight,Donor_Temperature,Donor_Pulse,Donor_Hemoglobin,Donor_Blood_Pressure,Test_1,Test_2,Test_3,Test_4
0,15,1,16281.0,15682,5,23.000000,771,1,5.0,73.724061,38.045429,74.586521,13.332521,184,0.132,0.192,0,0
1,15,0,1528.0,294,6,20.000000,1508,1,5.0,82.000000,37.000000,72.000000,14.000000,184,0.649,0.724,0,0
2,16,0,27502.0,25159,6,38.000000,867,1,7.0,80.000000,37.000000,72.000000,13.000000,184,0.082,0.202,0,0
3,17,0,8252.0,7796,5,25.000000,1282,1,1.0,60.000000,37.000000,78.000000,13.000000,184,0.144,0.183,0,0
4,15,0,17715.0,16483,5,22.000000,1260,1,7.0,75.000000,37.000000,72.000000,14.000000,184,0.424,0.303,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164594,17,0,4189.0,3734,5,25.000000,1241,1,7.0,70.000000,37.000000,72.000000,12.000000,184,0.141,0.254,0,0
164595,14,0,78613.0,834,6,22.000000,1751,1,3.0,76.000000,37.000000,72.000000,13.000000,184,0.184,0.262,0,0
164596,14,0,99461.0,21682,5,26.000000,1554,1,5.0,55.000000,37.000000,72.000000,12.000000,184,0.231,0.333,0,0
164597,14,0,94215.0,16436,5,34.000000,1203,1,5.0,80.000000,37.000000,78.000000,12.000000,184,0.152,0.252,0,0


In [67]:
y=df['Y']
y

0         0
1         0
2         0
3         0
4         0
         ..
164594    0
164595    0
164596    0
164597    0
164598    0
Name: Y, Length: 164599, dtype: int64

In [68]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

In [69]:
x_train

Unnamed: 0,Financial_Year,Branch_Code,Sequence_1,Sequence_2,Donation_type,Donor_Age,Donation_Date,Gender,Blood_Group_Code,Donor_Weight,Donor_Temperature,Donor_Pulse,Donor_Hemoglobin,Donor_Blood_Pressure,Test_1,Test_2,Test_3,Test_4
75807,16,0,28820.0,26477,6,28.000000,1765,1,5.0,70.000000,37.000000,72.000000,13.000000,184,0.133,0.213,0,0
40896,18,0,26373.0,25586,5,40.000000,1548,1,4.0,80.000000,37.000000,72.000000,12.000000,184,0.262,0.160,0,0
135448,17,0,18763.0,18307,5,30.000000,838,1,3.0,70.000000,37.000000,72.000000,13.000000,184,0.211,0.181,0,0
28048,17,0,20340.0,19884,5,25.000000,1557,1,5.0,76.000000,37.000000,72.000000,13.000000,184,0.114,0.201,0,0
55414,17,0,33624.0,33168,5,31.000000,793,1,7.0,95.000000,37.000000,72.000000,13.000000,184,0.272,0.224,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148429,16,0,24051.0,21708,5,34.000000,387,1,2.0,80.000000,37.000000,72.000000,13.000000,184,0.324,0.214,0,0
151535,16,0,6287.0,3943,6,27.000000,69,1,6.0,93.000000,37.000000,72.000000,14.000000,184,0.450,0.200,0,0
55293,16,0,9190.0,6846,2,32.000000,1602,1,1.0,80.000000,37.000000,72.000000,13.000000,184,0.203,0.252,0,0
49751,14,0,80017.0,2238,5,36.000000,755,1,3.0,55.000000,37.000000,72.000000,13.000000,184,0.093,0.243,0,0


In [70]:
y_train

75807     0
40896     0
135448    0
28048     0
55414     0
         ..
148429    0
151535    0
55293     0
49751     0
136767    0
Name: Y, Length: 131679, dtype: int64

In [71]:
x_test

Unnamed: 0,Financial_Year,Branch_Code,Sequence_1,Sequence_2,Donation_type,Donor_Age,Donation_Date,Gender,Blood_Group_Code,Donor_Weight,Donor_Temperature,Donor_Pulse,Donor_Hemoglobin,Donor_Blood_Pressure,Test_1,Test_2,Test_3,Test_4
9383,17,0,29176.0,28720,5,21.0,1828,1,3.0,75.0,37.0,72.0,14.0,184,0.133,0.214,0,0
3311,16,0,6666.0,4322,5,23.0,252,1,5.0,78.0,37.0,72.0,13.0,184,0.371,0.202,0,0
60034,15,0,19889.0,18657,4,48.0,861,1,3.0,90.0,37.0,75.0,13.0,184,0.303,0.211,0,0
116580,17,0,40072.0,39616,6,36.0,580,1,1.0,75.0,37.0,72.0,12.0,184,0.144,0.223,0,0
141563,16,0,37032.0,34689,6,21.0,1312,1,3.0,68.0,37.0,72.0,13.0,184,0.184,0.212,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6365,17,0,40020.0,39564,5,32.0,580,1,3.0,70.0,37.0,72.0,13.0,184,0.084,0.194,0,0
65664,17,0,39046.0,38590,5,32.0,95,1,5.0,72.0,37.0,72.0,14.0,184,0.101,0.212,0,0
75721,17,0,10718.0,10262,5,35.0,665,1,1.0,70.0,37.0,78.0,13.0,184,0.443,0.212,0,0
123576,18,0,26660.0,25873,6,25.0,1670,1,3.0,70.0,37.0,72.0,13.0,184,0.183,0.100,0,0


In [72]:
y_test

9383      0
3311      0
60034     0
116580    0
141563    0
         ..
6365      0
65664     0
75721     0
123576    1
159021    0
Name: Y, Length: 32920, dtype: int64

In [73]:
x_train.to_csv('final.csv')

In [74]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Initialize the Decision Tree classifier
dtc = DecisionTreeClassifier()
# Fit the classifier to the training data
dtc.fit(x_train, y_train)
# Predict on the test data
y_pred_dtc = dtc.predict(x_test)
# Calculate accuracy, precision, recall, and F1-score
acc_dtc = accuracy_score(y_test, y_pred_dtc)
precision_dtc = precision_score(y_test, y_pred_dtc)
recall_dtc = recall_score(y_test, y_pred_dtc)
f1_score_dtc = f1_score(y_test, y_pred_dtc)
# Print accuracy, precision, recall, and F1-score
print("Decision Tree Classifier Metrics:")
print("Accuracy:", acc_dtc)
print("Precision:", precision_dtc)
print("Recall:", recall_dtc)
print("F1 Score:", f1_score_dtc)
# Generate and print the classification report
classification_rep_dtc = classification_report(y_test, y_pred_dtc)
print("Classification Report:\n", classification_rep_dtc)

Decision Tree Classifier Metrics:
Accuracy: 0.9613304981773998
Precision: 0.021739130434782608
Recall: 0.024469820554649267
F1 Score: 0.023023791250959325
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     32307
           1       0.02      0.02      0.02       613

    accuracy                           0.96     32920
   macro avg       0.50      0.50      0.50     32920
weighted avg       0.96      0.96      0.96     32920



In [75]:
import pickle
filename = 'decision.sav'
pickle.dump(dtc, open(filename, 'wb'))

In [76]:
model = pickle.load(open(filename, 'rb'))

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Initialize the Random Forest classifier
rfc = RandomForestClassifier()
# Fit the classifier to the training data
rfc.fit(x_train, y_train)
# Predict on the test data
y_pred_rfc = rfc.predict(x_test)
# Calculate accuracy, precision, recall, and F1-score
acc_rfc = accuracy_score(y_test, y_pred_rfc)
precision_rfc = precision_score(y_test, y_pred_rfc)
recall_rfc = recall_score(y_test, y_pred_rfc)
f1_score_rfc = f1_score(y_test, y_pred_rfc)
# Print accuracy, precision, recall, and F1-score
print("Random Forest Classifier Metrics:")
print("Accuracy:", acc_rfc)
print("Precision:", precision_rfc)
print("Recall:", recall_rfc)
print("F1 Score:", f1_score_rfc)
# Generate and print the classification report
classification_rep_rfc = classification_report(y_test, y_pred_rfc)
print("Classification Report:\n", classification_rep_rfc)

Random Forest Classifier Metrics:
Accuracy: 0.981318347509113
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     32307
           1       0.00      0.00      0.00       613

    accuracy                           0.98     32920
   macro avg       0.49      0.50      0.50     32920
weighted avg       0.96      0.98      0.97     32920



In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Initialize the Logistic Regression classifier
log_reg = LogisticRegression()
# Fit the classifier to the training data
log_reg.fit(x_train, y_train)
# Predict on the test data
y_pred_log_reg = log_reg.predict(x_test)
# Calculate accuracy, precision, recall, and F1-score
acc_log_reg = accuracy_score(y_test, y_pred_log_reg)
precision_log_reg = precision_score(y_test, y_pred_log_reg)
recall_log_reg = recall_score(y_test, y_pred_log_reg)
f1_score_log_reg = f1_score(y_test, y_pred_log_reg)
# Print accuracy, precision, recall, and F1-score
print("Logistic Regression Metrics:")
print("Accuracy:", acc_log_reg)
print("Precision:", precision_log_reg)
print("Recall:", recall_log_reg)
print("F1 Score:", f1_score_log_reg)
# Generate and print the classification report
classification_rep_log_reg = classification_report(y_test, y_pred_log_reg)
print("Classification Report:\n", classification_rep_log_reg)

Logistic Regression Metrics:
Accuracy: 0.981318347509113
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     32307
           1       0.00      0.00      0.00       613

    accuracy                           0.98     32920
   macro avg       0.49      0.50      0.50     32920
weighted avg       0.96      0.98      0.97     32920



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [79]:
abc=[16,	0,	28820,	26477,	6,	28	,1765,	1,	5	,70	,37	,72	,13,	184,	0.133,	0.213	,0	,0]
result=dtc.predict([abc])
result=result[0]
if result==0:
    print("The Person is not Diseased")
else:
    print("The Person is Diseased")

The Person is not Diseased


