# Loading Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import svm, tree

from sklearn.metrics import confusion_matrix,multilabel_confusion_matrix,classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score,f1_score

# plot_precision_recall_curve / plot_roc_curve were deprecated in scikit-learn 1.0
# and removed in 1.2. Importing them unconditionally makes this cell -- and so the
# whole notebook -- fail with ImportError on a current install. The names stay
# bound either way; on newer versions they are None (use
# PrecisionRecallDisplay / RocCurveDisplay from sklearn.metrics instead).
try:
    from sklearn.metrics import plot_precision_recall_curve,plot_roc_curve
except ImportError:
    plot_precision_recall_curve = plot_roc_curve = None


import warnings
warnings.filterwarnings('ignore')

# Loading Train and Test Dataset

In [2]:
# Remote CSV locations of the training split and the test split.
train_csv_url = 'https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv'
test_csv_url = 'https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Testing_set_begs.csv'

# Download both splits into DataFrames; later cells read these two names.
pharma_data = pd.read_csv(train_csv_url)
test_data = pd.read_csv(test_csv_url)

# Performing EDA on Train and Test Dataset

### EDA on Train Dataset

In [3]:
# Preview the training frame as loaded from the CSV.
pharma_data

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.348400,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,15613,3,1548,DX2 DX4,14,18.643448,NO,RURAL,Stable,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,15503,41,2769,DX6,55,23.684585,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,2167,24,7671,DX6,63,27.500039,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,31646,11,5764,DX3,25,23.719125,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [3]:
# Summarise column dtypes and non-null counts of the training frame.
pharma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Treated_with_drugs         23084 non-null  object 
 4   Patient_Age                23097 non-null  int64  
 5   Patient_Body_Mass_Index    23097 non-null  float64
 6   Patient_Smoker             23097 non-null  object 
 7   Patient_Rural_Urban        23097 non-null  object 
 8   Patient_mental_condition   23097 non-null  object 
 9   A                          21862 non-null  float64
 10  B                          21862 non-null  float64
 11  C                          21862 non-null  float64
 12  D                          21862 non-null  float64
 13  E                          21862 non-null  flo

In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
pharma_data.describe()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
count,23097.0,23097.0,23097.0,23097.0,23097.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,23097.0
mean,16545.712041,26.413127,6261.280772,33.209768,23.45482,0.897905,0.136355,0.18507,0.083615,0.393239,0.0537,0.000595,1.75048,0.632247
std,9532.263503,15.030865,3595.99062,19.549882,3.807661,0.30278,0.343173,0.388363,0.276817,0.48848,0.225431,0.024379,0.770311,0.482204
min,2.0,0.0,1.0,0.0,1.0893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,8280.0,13.0,3181.0,16.0,20.20555,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,16597.0,26.0,6242.0,33.0,23.386199,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
75%,24825.0,39.0,9363.0,50.0,26.788154,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0
max,33014.0,52.0,12515.0,149.0,29.999579,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0


### Changing the data type of 4 columns (object to integer)

In [5]:
# One shared LabelEncoder instance; the cells below re-fit it for every column they encode.
le = LabelEncoder()

In [6]:
# Replace each text-valued column of the training frame with integer codes.
# The shared encoder is re-fitted per column, so once this cell finishes `le`
# only holds the mapping of the last column processed.
for text_col in ('Treated_with_drugs', 'Patient_Smoker',
                 'Patient_Rural_Urban', 'Patient_mental_condition'):
    pharma_data[text_col] = le.fit_transform(pharma_data[text_col])

In [7]:
# Re-check dtypes after encoding: the four former object columns should now be integer.
pharma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Treated_with_drugs         23097 non-null  int64  
 4   Patient_Age                23097 non-null  int64  
 5   Patient_Body_Mass_Index    23097 non-null  float64
 6   Patient_Smoker             23097 non-null  int64  
 7   Patient_Rural_Urban        23097 non-null  int64  
 8   Patient_mental_condition   23097 non-null  int64  
 9   A                          21862 non-null  float64
 10  B                          21862 non-null  float64
 11  C                          21862 non-null  float64
 12  D                          21862 non-null  float64
 13  E                          21862 non-null  flo

### Checking for duplicates

##### We found no duplicates

In [8]:
# Boolean mask flagging rows that repeat an earlier row in every column.
is_repeated_row = pharma_data.duplicated()
# Keep only the flagged rows so their count can be inspected.
phd = pharma_data.loc[is_repeated_row]

In [9]:
# (rows, columns) of the duplicated-rows subset; zero rows means no duplicates.
phd.shape

(0, 18)

### Checking for Null Values

##### We found some null values. In this case, we drop every row that contains a null value

In [10]:
# Count missing values per column in the training frame.
pharma_data.isna().sum()

ID_Patient_Care_Situation       0
Diagnosed_Condition             0
Patient_ID                      0
Treated_with_drugs              0
Patient_Age                     0
Patient_Body_Mass_Index         0
Patient_Smoker                  0
Patient_Rural_Urban             0
Patient_mental_condition        0
A                            1235
B                            1235
C                            1235
D                            1235
E                            1235
F                            1235
Z                            1235
Number_of_prev_cond          1235
Survived_1_year                 0
dtype: int64

In [11]:
# Build the cleaned training frame: discard every row that has a missing value
# in any column (these are pandas' defaults, spelled out for the reader).
phard = pharma_data.dropna(axis=0, how='any')

In [12]:
# Confirm that no missing values remain after dropping incomplete rows.
phard.isna().sum()

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Treated_with_drugs           0
Patient_Age                  0
Patient_Body_Mass_Index      0
Patient_Smoker               0
Patient_Rural_Urban          0
Patient_mental_condition     0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
Survived_1_year              0
dtype: int64

In [13]:
# Preview the cleaned, integer-encoded training frame.
phard

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,31,56,18.479385,2,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,16,36,22.945566,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,31,48,27.510027,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,0,5,19.130976,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,32,128,1.348400,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23092,15613,3,1548,21,14,18.643448,1,0,0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,0
23093,15503,41,2769,31,55,23.684585,1,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
23094,2167,24,7671,31,63,27.500039,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
23095,31646,11,5764,24,25,23.719125,2,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [14]:
# Dtypes and non-null counts of the cleaned training frame (row count reflects the dropped rows).
phard.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21862 entries, 0 to 23096
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  21862 non-null  int64  
 1   Diagnosed_Condition        21862 non-null  int64  
 2   Patient_ID                 21862 non-null  int64  
 3   Treated_with_drugs         21862 non-null  int64  
 4   Patient_Age                21862 non-null  int64  
 5   Patient_Body_Mass_Index    21862 non-null  float64
 6   Patient_Smoker             21862 non-null  int64  
 7   Patient_Rural_Urban        21862 non-null  int64  
 8   Patient_mental_condition   21862 non-null  int64  
 9   A                          21862 non-null  float64
 10  B                          21862 non-null  float64
 11  C                          21862 non-null  float64
 12  D                          21862 non-null  float64
 13  E                          21862 non-null  flo

### EDA on Test Dataset

### Checking for Null Values

##### We found no null values

In [15]:
# Count missing values per column in the test frame.
test_data.isna().sum()

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Treated_with_drugs           0
Patient_Age                  0
Patient_Body_Mass_Index      0
Patient_Smoker               0
Patient_Rural_Urban          0
Patient_mental_condition     0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
dtype: int64

In [16]:
# Dtypes and non-null counts of the test frame before encoding.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9303 non-null   int64  
 1   Diagnosed_Condition        9303 non-null   int64  
 2   Patient_ID                 9303 non-null   int64  
 3   Treated_with_drugs         9303 non-null   object 
 4   Patient_Age                9303 non-null   int64  
 5   Patient_Body_Mass_Index    9303 non-null   float64
 6   Patient_Smoker             9303 non-null   object 
 7   Patient_Rural_Urban        9303 non-null   object 
 8   Patient_mental_condition   9303 non-null   object 
 9   A                          9303 non-null   float64
 10  B                          9303 non-null   float64
 11  C                          9303 non-null   float64
 12  D                          9303 non-null   float64
 13  E                          9303 non-null   float

### Changing the data type of 4 columns (object to integer)

In [17]:
# Encode the test split with the SAME category -> integer mapping that the
# training split received. Fitting a LabelEncoder on the test columns themselves
# numbers the categories by whichever values happen to occur in the test file,
# so one label can end up with a different code than it had during training and
# the model is then fed mislabelled features.
categorical_cols = ['Treated_with_drugs', 'Patient_Smoker',
                    'Patient_Rural_Urban', 'Patient_mental_condition']

# `pharma_data` has already been overwritten with integer codes, so the original
# text labels are re-read from the training CSV to rebuild the training mapping.
train_labels = pd.read_csv(
    'https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv',
    usecols=categorical_cols,
)

for col in categorical_cols:
    # Skip columns that are already numeric so re-running this cell is harmless.
    if pd.api.types.is_numeric_dtype(test_data[col]):
        continue
    # Fitting on the raw training column yields the same classes (and therefore
    # the same codes) as the fit_transform call applied to the training frame.
    col_encoder = LabelEncoder().fit(train_labels[col])
    # transform() raises ValueError for a label that never occurred in training;
    # failing loudly is preferable to silently assigning a misaligned code.
    test_data[col] = col_encoder.transform(test_data[col])

In [18]:
# Re-check dtypes after encoding: the four former object columns should now be integer.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9303 non-null   int64  
 1   Diagnosed_Condition        9303 non-null   int64  
 2   Patient_ID                 9303 non-null   int64  
 3   Treated_with_drugs         9303 non-null   int64  
 4   Patient_Age                9303 non-null   int64  
 5   Patient_Body_Mass_Index    9303 non-null   float64
 6   Patient_Smoker             9303 non-null   int64  
 7   Patient_Rural_Urban        9303 non-null   int64  
 8   Patient_mental_condition   9303 non-null   int64  
 9   A                          9303 non-null   float64
 10  B                          9303 non-null   float64
 11  C                          9303 non-null   float64
 12  D                          9303 non-null   float64
 13  E                          9303 non-null   float

### Checking for duplicates

##### We found no duplicates

In [19]:
# Boolean mask flagging test rows that repeat an earlier row in every column.
is_repeated_test_row = test_data.duplicated()
# Keep only the flagged rows so their count can be inspected.
te = test_data.loc[is_repeated_test_row]

In [20]:
# (rows, columns) of the duplicated-rows subset of the test frame; zero rows means no duplicates.
te.shape

(0, 17)

In [21]:
# Preview the integer-encoded test frame.
test_data

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,19150,40,3709,24,16,29.443894,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
1,23216,52,986,31,24,26.836321,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2,11890,50,11821,29,63,25.523280,0,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
3,7149,32,3292,31,42,27.171155,0,1,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0
4,22845,20,9959,24,50,25.556192,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9298,18245,11,9299,31,28,29.106314,0,0,0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0
9299,15598,7,6273,0,4,20.616673,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
9300,11885,16,11473,11,20,24.727357,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9301,25101,50,5681,28,33,17.517426,0,1,0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,5.0


In [22]:
# Final check of the test frame's dtypes and non-null counts before modelling.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9303 non-null   int64  
 1   Diagnosed_Condition        9303 non-null   int64  
 2   Patient_ID                 9303 non-null   int64  
 3   Treated_with_drugs         9303 non-null   int64  
 4   Patient_Age                9303 non-null   int64  
 5   Patient_Body_Mass_Index    9303 non-null   float64
 6   Patient_Smoker             9303 non-null   int64  
 7   Patient_Rural_Urban        9303 non-null   int64  
 8   Patient_mental_condition   9303 non-null   int64  
 9   A                          9303 non-null   float64
 10  B                          9303 non-null   float64
 11  C                          9303 non-null   float64
 12  D                          9303 non-null   float64
 13  E                          9303 non-null   float

# Model Training Process

### Dropping our Prediction Column 

In [23]:
# Name of the label column the model should learn to predict.
label_col = 'Survived_1_year'

# Features: every column of the cleaned frame except the label.
X = phard.drop(columns=[label_col])
# Target: the label column on its own.
y = phard[label_col]

### Splitting our dataset

In [33]:
# Hold out 30% of the rows for evaluation. The shuffle is seeded so that
# Restart & Run All reproduces the same partition and therefore the same scores;
# without a seed every execution of this cell yields a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [34]:
# (rows, feature columns) of the training partition.
X_train.shape

(15303, 17)

### Scaling our Dataset

In [35]:
# Learn the scaling statistics from the training partition only and
# standardise it in the same step.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Apply the training-derived statistics to the held-out partition (no re-fitting).
X_test = scaler.transform(X_test)

### Creating training model

In [36]:
# Linear support-vector classifier: fixed seed and a tight stopping tolerance.
svc_settings = {'random_state': 0, 'tol': 1e-5}
model = svm.LinearSVC(**svc_settings)

### Performing model training and calculating the score on the held-out test split

In [37]:
# Train the classifier on the standardised training partition.
model.fit(X=X_train, y=y_train)

LinearSVC(random_state=0, tol=1e-05)

In [38]:
# Mean accuracy of the fitted model on the held-out partition.
holdout_score = model.score(X_test, y_test)
print(holdout_score)

0.6981247141332521


### Calculating our Accuracy score

In [39]:
# Predicted labels for the held-out partition; reused by the metric cells below.
y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
holdout_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy Score is: ', holdout_accuracy)

Accuracy Score is:  0.6981247141332521


### Creating Confusion Matrix 

In [40]:
# Confusion matrix of true vs. predicted labels on the held-out partition.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

### Printing Classification Report 

In [41]:
# Per-class precision, recall, F1 and support on the held-out partition.
report_text = classification_report(y_test, y_pred)
print(report_text)


              precision    recall  f1-score   support

           0       0.63      0.46      0.53      2423
           1       0.72      0.84      0.78      4136

    accuracy                           0.70      6559
   macro avg       0.68      0.65      0.65      6559
weighted avg       0.69      0.70      0.69      6559



# Create prediction .csv file

### Putting the target prediction

In [42]:
# The model was fitted on features standardised by `scaler`, so the unseen test
# frame must go through the same transform before prediction. Feeding it the raw,
# unscaled columns puts the inputs on a completely different scale from what the
# model learned and makes the predictions meaningless.
test_features_scaled = scaler.transform(test_data)
target = model.predict(test_features_scaled)

### Creating prediction file

In [44]:
# Wrap the model's predictions for the unseen test data in a one-column frame
# with the column named "prediction".
res = pd.DataFrame(target, columns=["prediction"])

# Write the submission file (without the index) to the current working directory.
res.to_csv("submission.csv", index=False)