# COVID-19 Infection Prediction
#### Predicting whether a person is infected with COVID-19 or not based on the COVID-19 dataset taken from Data World.

By: Anamika Singh

In [1]:
# IMPORTING REQUIRED LIBRARIES

import pandas as pd
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; the label-encoding cell re-fits it on each column in turn.
le = LabelEncoder()

The data I'm working on is a medium-sized dataset with 278,848 rows and 10 columns, including the target variable, i.e., whether a person is infected or not.

In [2]:
# LOADING THE DATASET

# Read every column as plain text and switch off pandas' default NA parsing.
# This dataset marks unknown values with the literal string 'None', and the
# cleaning and label-encoding cells below compare against / encode that string.
# Newer pandas releases list "None" among the default NA tokens, which would
# silently turn those markers into NaN and change the behaviour of later cells.
df = pd.read_csv("risk.csv", dtype=str, keep_default_na=False, low_memory=False)
df.head(3)

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2020-04-30,0,0,0,0,0,negative,,female,Other
1,2020-04-30,1,0,0,0,0,negative,,female,Other
2,2020-04-30,0,1,0,0,0,negative,,male,Other


It is a good practice to always have the target variable as the last column in the dataset. 

In [3]:
# RE-STRUCTURING DATA

# Drop the test date and place the target last, selecting columns by name.
# Building a new frame (instead of pop/insert in place) keeps this cell safe to
# re-run: a second execution no longer fails on the already-removed 'test_date',
# and no hard-coded insert position is needed.
target_col = 'corona_result'
feature_cols = [col for col in df.columns if col not in ('test_date', target_col)]
df = df[feature_cols + [target_col]].copy()

Missing values are usually represented as NaN, null, or None in the dataset.

The df.info() function can be used to get information about the dataset. It lists the column names along with the number of non-null values in each column.

Another way of finding null values in the data is by using the isnull() function.

In [4]:
# CHECKING FOR MISSING VALUES

# Prints, per column, the non-null count and dtype of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278848 entries, 0 to 278847
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   cough                278848 non-null  object
 1   fever                278848 non-null  object
 2   sore_throat          278848 non-null  object
 3   shortness_of_breath  278848 non-null  object
 4   head_ache            278848 non-null  object
 5   age_60_and_above     278848 non-null  object
 6   gender               278848 non-null  object
 7   test_indication      278848 non-null  object
 8   corona_result        278848 non-null  object
dtypes: object(9)
memory usage: 19.1+ MB


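So, df.info() reports no null values in any column. Note, however, that this dataset stores unknown entries as the literal string 'None', which df.info() and isnull() do not count as missing.
Dropping the rows with a 'None' value in cough, fever or sore_throat, and the rows whose test result is 'other', in order to clean the data for modelling.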

In [5]:
# Rows where any of these symptom columns holds the placeholder string 'None'
symptom_unknown = df[['cough', 'fever', 'sore_throat']].eq('None').any(axis=1)
# Rows whose test outcome is recorded as 'other'
result_other = df['corona_result'].eq('other')

# Remove the union of both groups in one pass, by index label
df = df.drop(index=df.index[symptom_unknown | result_other])

In [6]:
# PREVIEWING COMPLETE DATA

# Lift the cap on displayed columns so that every column is shown
pd.options.display.max_columns = None
df.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result
0,0,0,0,0,0,,female,Other,negative
1,1,0,0,0,0,,female,Other,negative
2,0,1,0,0,0,,male,Other,negative
3,1,0,0,0,0,,female,Other,negative
4,1,0,0,0,0,,male,Other,negative


Using Label Encoding, we replace the categorical value with a numeric value between 0 and the number of classes minus 1. If the categorical variable value contains 5 distinct classes, we use (0, 1, 2, 3, and 4).

We use the LabelEncoder class from the preprocessing module of the sklearn package, and also print each column's labels with their corresponding encoded values for quick reference.

In [7]:
# LABEL ENCODING

# Columns are encoded in this fixed order so the printed mappings appear in the
# same sequence as before and the shared encoder `le` ends up fitted on the
# same (last) column.
columns_to_encode = [
    "cough",
    "fever",
    "sore_throat",
    "shortness_of_breath",
    "head_ache",
    "corona_result",
    "age_60_and_above",
    "gender",
    "test_indication",
]

for column in columns_to_encode:
    # Re-fit the shared encoder on this column and replace the column with its codes
    df[column] = le.fit_transform(df[column])
    # classes_ holds the sorted labels and the encoder numbers them 0..n-1 in that
    # order, so enumerating yields the same label -> code mapping as transforming
    # classes_ would, but with plain Python ints (stable printed form).
    le_name_mapping = {label: code for code, label in enumerate(le.classes_)}
    print(f'{column} labels:', le_name_mapping)

cough labels: {'0': 0, '1': 1}
fever labels: {'0': 0, '1': 1}
sore_throat labels: {'0': 0, '1': 1}
shortness_of_breath labels: {'0': 0, '1': 1}
head_ache labels: {'0': 0, '1': 1}
corona_result labels: {'negative': 0, 'positive': 1}
age_60_and_above labels: {'No': 0, 'None': 1, 'Yes': 2}
gender labels: {'None': 0, 'female': 1, 'male': 2}
test_indication labels: {'Abroad': 0, 'Contact with confirmed': 1, 'Other': 2}


The train-test split procedure is used to estimate the performance of machine learning algorithms when they are used to make predictions on data not used to train the model.

The scikit-learn Python machine learning library provides an implementation of the train-test split evaluation procedure via the train_test_split() function.

In [8]:
# SPLITTING THE DATA SET INTO DEPENDENT AND INDEPENDENT COLUMNS

# Select by column name rather than by position, so the split does not silently
# pick the wrong columns if the frame's column order or count ever changes.
x = df.drop(columns='corona_result')   # independent variables (all features)
y = df['corona_result']                # dependent variable (encoded test result)

In [9]:
# TRAIN-TEST SPLITTING

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1700,
)

Here, I'm applying the Logistic Regression algorithm to classify the data according to the target variable.

In [10]:
# APPLYING LOGISTIC REGRESSION MODEL

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Fit on the training split; fit() returns the estimator, which the cell displays
classifier.fit(x_train, y_train)

LogisticRegression()

In [11]:
import numpy as np

# Show the true test labels as a NumPy array, for comparison with the predictions
np.asarray(y_test)

array([0, 0, 0, ..., 0, 0, 0])

Predicting on the test set with the trained model and evaluating model accuracy.

In [12]:
# PREDICTING RESULTS

# Predict labels for the held-out test features with the fitted logistic regression model
y_pred = classifier.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# ACCURACY SCORE & CLASSIFICATION REPORT

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Compute all three summaries first, then print them in order
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(cm)
print("\n\n", acc)
print("\n\n", report)

[[77771   172]
 [ 3406  1062]]


 0.9565834657994685


               precision    recall  f1-score   support

           0       0.96      1.00      0.98     77943
           1       0.86      0.24      0.37      4468

    accuracy                           0.96     82411
   macro avg       0.91      0.62      0.68     82411
weighted avg       0.95      0.96      0.94     82411



In [14]:
# GIVING PREDICTIONS

# The model was fitted on a DataFrame, so the samples are passed as a DataFrame
# carrying the same column names; this avoids scikit-learn's feature-name
# mismatch warning for bare lists and makes the feature order explicit.
# Values follow the training column order and use the label-encoded codes.
samples = pd.DataFrame(
    [
        [0, 1, 0, 0, 0, 0, 1, 2],
        [1, 1, 1, 1, 0, 0, 1, 2],
    ],
    columns=x_train.columns,
)

# Predict one sample at a time so that each result is printed on its own line
for i in range(len(samples)):
    print(classifier.predict(samples.iloc[[i]]))

[0]
[1]


In [15]:
# APPLYING KNN

from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be chained
knn = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)

# Accuracy of the 7-nearest-neighbours model on the test split
pred = knn.predict(x_test)
accuracy_score(y_test, pred)

0.9677348897598621

In [16]:
# SUPPORT VECTOR CLASSIFIER

from sklearn.svm import SVC

# RBF-kernel SVM; fit() returns the estimator, so it can be chained
svcclassifier = SVC(kernel="rbf", random_state=20).fit(x_train, y_train)

# Accuracy on the test split
pred = svcclassifier.predict(x_test)
accuracy_score(y_test, pred)

0.9697734525730788

In [17]:
# DECISION TREE CLASSIFIER

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Gini-impurity tree limited to depth 3, trained on the training split
dtree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    random_state=20,
).fit(x_train, y_train)

# Accuracy on the test split
pred = dtree.predict(x_test)
accuracy_score(y_test, pred)

0.9653444321753164

In [18]:
# RANDOM FOREST CLASSIFIER

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# Forest of 15 trees with a fixed seed, trained on the training split
RF = RandomForestClassifier(n_estimators=15, random_state=20).fit(x_train, y_train)

# Accuracy on the test split
pred = RF.predict(x_test)
accuracy_score(y_test, pred)

0.9697491839681596

In [19]:
# GRADIENT BOOSTING CLASSIFIER

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# 15 boosting stages with a fixed seed, trained on the training split
gb_model = GradientBoostingClassifier(n_estimators=15, random_state=20).fit(x_train, y_train)

# NOTE: this cell reports ROC AUC (from the predicted probability of class 1),
# whereas the other model cells report accuracy, so the numbers are not directly comparable.
positive_proba = gb_model.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, positive_proba)

0.893910193203993