In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
    mean_absolute_error
)
import numpy as np
import matplotlib.pyplot as plt

In [29]:
import sklearn
import matplotlib
import pickle

In [2]:
df=pd.read_csv("dataset.csv",encoding="latin1")

In [31]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The numpy version is {}.'.format(np.__version__))
print('The matplotlib version is {}.'.format(matplotlib.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print(pickle.format_version)

The scikit-learn version is 0.22.
The numpy version is 1.17.2.
The matplotlib version is 3.1.1.
The pandas version is 0.25.1.
4.0


In [3]:
df.head()

Unnamed: 0,Patient ID,Patient age quantile,SARS-Cov-2 exam result,"Patient addmited to regular ward (1=yes, 0=no)","Patient addmited to semi-intensive unit (1=yes, 0=no)","Patient addmited to intensive care unit (1=yes, 0=no)",Hematocrit,Hemoglobin,Platelets,Mean platelet volume,...,Hb saturation (arterial blood gases),pCO2 (arterial blood gas analysis),Base excess (arterial blood gas analysis),pH (arterial blood gas analysis),Total CO2 (arterial blood gas analysis),HCO3 (arterial blood gas analysis),pO2 (arterial blood gas analysis),Arteiral Fio2,Phosphor,ctO2 (arterial blood gas analysis)
0,44477f75e8169d2,13,negative,0,0,0,,,,,...,,,,,,,,,,
1,126e9dd13932f68,17,negative,0,0,0,0.236515,-0.02234,-0.517413,0.010677,...,,,,,,,,,,
2,a46b4402a0e5696,8,negative,0,0,0,,,,,...,,,,,,,,,,
3,f7d619a94f97c45,5,negative,0,0,0,,,,,...,,,,,,,,,,
4,d9e41465789c2b5,15,negative,0,0,0,,,,,...,,,,,,,,,,


In [4]:
features = [
    'Leukocytes',
    'Monocytes',
    'Platelets',
    'Patient age quantile',
]

df_clean = df[features + ["SARS-Cov-2 exam result"]]

print(df_clean.shape)

# predicted label as simple integers
df_clean["SARS-Cov-2 exam result"].replace("positive", 1, inplace=True)
df_clean["SARS-Cov-2 exam result"].replace("negative", 0, inplace=True)

(5644, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [5]:
df_clean = df_clean.dropna()
print(df_clean.shape)

(601, 5)


In [6]:
X = df_clean.drop("SARS-Cov-2 exam result", axis=1)
y = df_clean["SARS-Cov-2 exam result"]

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit_transform(X)

array([[0.29434284, 0.37647056, 0.16839917, 0.89473684],
       [0.36452575, 0.33333333, 0.32952185, 0.05263158],
       [0.1735432 , 0.51372545, 0.17567568, 0.47368421],
       ...,
       [0.2245853 , 0.45882351, 0.17047818, 0.78947368],
       [0.04381117, 0.52941174, 0.06444907, 0.89473684],
       [0.11186731, 0.40784312, 0.13617464, 1.        ]])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_train.value_counts())
print(y_test.value_counts())

0    411
1     69
Name: SARS-Cov-2 exam result, dtype: int64
0    107
1     14
Name: SARS-Cov-2 exam result, dtype: int64


In [9]:
def _extract_feature_importance(model, features):
    if not hasattr(model, "coef_") and not hasattr(model, "feature_importances_"):
        raise Exception("Not possible to collect feature importances")

    if hasattr(model, "coef_"):
        model_feature_importances = model.coef_[0]
    elif hasattr(model, "feature_importances_"):
        model_feature_importances = model.feature_importances_

    return [
        (feature, importance)
        for feature, importance in sorted(
            zip(features, model_feature_importances),
            key=lambda pair: pair[1],
            reverse=True,
        )
    ]

In [10]:
def run_single(X_train, y_train, X_test, y_test, model):
    
    model.fit(X_train, y_train)

    # let us just print the performance on the train set
    predictions_train = model.predict(X_train)
    train_results = {
        "prec": precision_score(y_train, predictions_train),
        "rec": recall_score(y_train, predictions_train),
        "roc": roc_auc_score(y_train, predictions_train),
    }
    cm_train = confusion_matrix(y_train, predictions_train)
    
    # in the test set
    predictions_test = model.predict(X_test)
    test_results = {
        "prec": precision_score(y_test, predictions_test),
        "rec": recall_score(y_test, predictions_test),
        "roc": roc_auc_score(y_test, predictions_test),
    }

    cm_test = confusion_matrix(y_test, predictions_test)
    
    # feature importance
    feature_importance = _extract_feature_importance(model, X_train.columns.values)
    
    return {
        "test_results": test_results, 
        "train_results": train_results, 
        "test_cm" : cm_test,
        "train_cm" : cm_train,
        "feature_importance": feature_importance,
        "y_pred": predictions_test,
        "model": model
    }


In [11]:
model= RandomForestClassifier(random_state=42)

In [12]:
run_single(X_train, y_train, X_test, y_test, model)

{'test_results': {'prec': 0.75,
  'rec': 0.42857142857142855,
  'roc': 0.7049399198931909},
 'train_results': {'prec': 1.0, 'rec': 1.0, 'roc': 1.0},
 'test_cm': array([[105,   2],
        [  8,   6]], dtype=int64),
 'train_cm': array([[411,   0],
        [  0,  69]], dtype=int64),
 'feature_importance': [('Leukocytes', 0.3467766769729846),
  ('Platelets', 0.27506059778374203),
  ('Monocytes', 0.24620571848487013),
  ('Patient age quantile', 0.13195700675840338)],
 'y_pred': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 'model': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
             

In [13]:
a=run_single(X_train, y_train, X_test, y_test, model)

In [14]:
a["model"]

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [15]:
X_test.head()

Unnamed: 0,Leukocytes,Monocytes,Platelets,Patient age quantile
693,-0.389586,-0.535402,-0.228491,11
4409,-0.576033,-0.509139,-1.421863,10
5232,0.077923,-0.824298,-0.454604,14
391,-0.230967,-0.193981,-0.07775,7
1280,-0.189225,0.357547,0.06043,4


In [16]:
new_input = [[0.005570,-0.614192,0.977442,13]]
y=model.predict(new_input)
print(y)

[0]


In [23]:
type(new_input[0][3])

int

In [16]:
#Create a pickle file using serialization
import pickle
pickle_out=open("model.pkl","wb")
pickle.dump(model,pickle_out)
pickle_out.close()

In [33]:
y=model.predict(X_test)
print(y)
count=0
for i in y:
    count=count+1
print(count)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0]
121


In [56]:
print(X_test[:10].to_string(index=False))

 Leukocytes  Monocytes  Platelets  Patient age quantile
  -0.389586  -0.535402  -0.228491                    11
  -0.576033  -0.509139  -1.421863                    10
   0.077923  -0.824298  -0.454604                    14
  -0.230967  -0.193981  -0.077750                     7
  -0.189225   0.357547   0.060430                     4
  -0.386803   0.462600  -1.095256                    16
   0.726313   0.068652   1.429667                    14
  -0.550988  -0.272770   0.763892                     9
   1.046335   0.278757  -0.705840                    16
   0.478645   0.147441   1.907016                    14


In [61]:
ans=X_test

In [62]:
ans.to_csv('test_it.csv',index=False)

In [63]:
pd.read_csv("test_it.csv")

Unnamed: 0,Leukocytes,Monocytes,Platelets,Patient age quantile
0,-0.389586,-0.535402,-0.228491,11
1,-0.576033,-0.509139,-1.421863,10
2,0.077923,-0.824298,-0.454604,14
3,-0.230967,-0.193981,-0.077750,7
4,-0.189225,0.357547,0.060430,4
...,...,...,...,...
116,2.162234,-1.349562,1.567847,11
117,1.115904,-0.745508,-0.241053,13
118,0.339505,0.567652,-1.019885,0
119,0.670657,-1.533404,2.422050,12


In [64]:
load

Unnamed: 0,Leukocytes,Monocytes,Platelets,Patient age quantile
0,-0.389586,-0.535402,-0.228491,11
1,-0.576033,-0.509139,-1.421863,10
2,0.077923,-0.824298,-0.454604,14
3,-0.230967,-0.193981,-0.07775,7
4,-0.189225,0.357547,0.06043,4
5,-0.386803,0.4626,-1.095256,16
6,0.726313,0.068652,1.429667,14
7,-0.550988,-0.27277,0.763892,9
8,1.046335,0.278757,-0.70584,16
