In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams['figure.figsize'] =[8,8]
matplotlib.rcParams.update({'font.size': 15})
matplotlib.rcParams['font.family'] = 'sans-serif'

In [2]:
df = pd.read_csv('Covid Dataset.csv')
df.head()

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,...,Yes,Yes,No,Yes,No,Yes,Yes,No,No,Yes
1,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,No,...,Yes,No,No,No,Yes,Yes,No,No,No,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,...,Yes,Yes,Yes,No,No,No,No,No,No,Yes
3,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Yes,...,No,No,Yes,No,Yes,Yes,No,No,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,...,No,Yes,No,Yes,No,Yes,No,No,No,Yes


In [3]:
df.describe(include = 'all')

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
count,5434,5434,5434,5434,5434,5434,5434,5434,5434,5434,...,5434,5434,5434,5434,5434,5434,5434,5434,5434,5434
unique,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,1,1,2
top,Yes,Yes,Yes,Yes,Yes,No,No,Yes,No,No,...,Yes,No,No,Yes,No,Yes,No,No,No,Yes
freq,3620,4273,4307,3953,2952,2920,2869,2736,2911,2846,...,2821,2883,2983,2726,2924,2820,3172,5434,5434,4383


In [4]:
df.shape

(5434, 21)

In [5]:
from sklearn.preprocessing import LabelEncoder
e=LabelEncoder()

In [6]:
for column in df.columns:
    if df[column].dtype == 'object':  
        df[column] = e.fit_transform(df[column])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5434 entries, 0 to 5433
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   Breathing Problem                        5434 non-null   int64
 1   Fever                                    5434 non-null   int64
 2   Dry Cough                                5434 non-null   int64
 3   Sore throat                              5434 non-null   int64
 4   Running Nose                             5434 non-null   int64
 5   Asthma                                   5434 non-null   int64
 6   Chronic Lung Disease                     5434 non-null   int64
 7   Headache                                 5434 non-null   int64
 8   Heart Disease                            5434 non-null   int64
 9   Diabetes                                 5434 non-null   int64
 10  Hyper Tension                            5434 non-null   int64
 11  Fati

In [8]:
df=df.drop('Running Nose',axis=1)
df=df.drop('Chronic Lung Disease',axis=1)
df=df.drop('Headache',axis=1)
df=df.drop('Heart Disease',axis=1)
df=df.drop('Diabetes',axis=1)
df=df.drop('Gastrointestinal ',axis=1)
df=df.drop('Wearing Masks',axis=1)
df=df.drop('Sanitization from Market',axis=1)
df=df.drop('Asthma',axis=1)

df.head()

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Hyper Tension,Fatigue,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,COVID-19
0,1,1,1,1,1,1,0,1,0,1,1,1
1,1,1,1,1,0,1,0,0,1,1,0,1
2,1,1,1,1,0,1,1,0,0,0,0,1
3,1,1,1,0,0,0,1,0,1,1,0,1
4,1,1,1,1,1,0,0,1,0,1,0,1


In [9]:
print(df[df['COVID-19'] == 1].count())

Breathing Problem                          4383
Fever                                      4383
Dry Cough                                  4383
Sore throat                                4383
Hyper Tension                              4383
Fatigue                                    4383
Abroad travel                              4383
Contact with COVID Patient                 4383
Attended Large Gathering                   4383
Visited Public Exposed Places              4383
Family working in Public Exposed Places    4383
COVID-19                                   4383
dtype: int64


In [10]:
df = df.astype('category')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5434 entries, 0 to 5433
Data columns (total 12 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   Breathing Problem                        5434 non-null   category
 1   Fever                                    5434 non-null   category
 2   Dry Cough                                5434 non-null   category
 3   Sore throat                              5434 non-null   category
 4   Hyper Tension                            5434 non-null   category
 5   Fatigue                                  5434 non-null   category
 6   Abroad travel                            5434 non-null   category
 7   Contact with COVID Patient               5434 non-null   category
 8   Attended Large Gathering                 5434 non-null   category
 9   Visited Public Exposed Places            5434 non-null   category
 10  Family working in Public Exposed Pla

In [11]:
X = df.iloc[:, :11]
y = df['COVID-19']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [13]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
print("Before undersampling: ", Counter(y_train))
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
print("After undersampling: ", Counter(y_train_under))

Before undersampling:  Counter({1: 3083, 0: 720})
After undersampling:  Counter({0: 720, 1: 720})


In [14]:
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler(feature_range=(0,1))
X_scaled = sc.fit_transform(X)

In [15]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC()
model.fit(X_train, y_train)

In [16]:
covid_model = model.predict(X_test)
accuracy_score(y_test, covid_model)

0.976701410177805