## Importing Necessary Libraries

In [None]:
!pip install mtranslate
!pip install xlsxwriter



In [None]:
import pandas as pd
import mtranslate
from mtranslate import translate

## Loading the Data

In [4]:
# Load the raw test records into a DataFrame.
# NOTE(review): the path is absolute and looks like a Google Drive mount inside
# Colab; no mount step is visible in this notebook, so a fresh kernel presumably
# needs the drive mounted first - confirm.
# NOTE(review): the saved output of this cell ends with the tail of a warning
# traceback; it is presumably pandas' mixed-type DtypeWarning, in which case an
# explicit dtype= (or low_memory=False) would silence it - confirm before changing.
data = pd.read_csv("/content/drive/MyDrive/Sem 7 Project/corona_tested_individuals_ver_00198.csv")
# Preview the first rows.
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2021-11-14,0,0,0,0,0,שלילי,,נקבה,Other
1,2021-11-14,0,0,0,0,0,שלילי,,זכר,Other
2,2021-11-14,0,0,0,0,0,שלילי,,נקבה,Other
3,2021-11-14,0,0,0,0,0,שלילי,,זכר,Other
4,2021-11-14,0,0,0,0,0,שלילי,,נקבה,Other


In [5]:
# Summarise the raw frame: row count, column dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7810748 entries, 0 to 7810747
Data columns (total 10 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   test_date            object
 1   cough                int64 
 2   fever                int64 
 3   sore_throat          int64 
 4   shortness_of_breath  int64 
 5   head_ache            int64 
 6   corona_result        object
 7   age_60_and_above     object
 8   gender               object
 9   test_indication      object
dtypes: int64(5), object(5)
memory usage: 595.9+ MB


In [6]:
# Count the missing values in every column of the raw frame.
data.isna().sum()

test_date                    0
cough                        0
fever                        0
sore_throat                  0
shortness_of_breath          0
head_ache                    0
corona_result                0
age_60_and_above       1728213
gender                  591235
test_indication              0
dtype: int64

## Translating the Features from Hebrew to English

In [7]:
def trans(x):
  """Translate one text value to English with mtranslate.

  Args:
    x: The text to translate (here, a Hebrew category label).

  Returns:
    Whatever ``mtranslate.translate`` returns for the request: target
    language ``"en"``, source language auto-detected.
  """
  # Go through the module instead of the bare imported `translate` name: a
  # later cell defines its own `translate(data)` helper, which rebinds that
  # name and would make this function fail if it is called after that cell ran.
  return mtranslate.translate(x, "en", "auto")

In [8]:
# Corona Result Feature
l = list(data['corona_result'].unique())
print("Target Column Levels:")
for i in l:
  print(i,trans(i))

# Gender Feature
l = list(data['gender'].unique())
print("\nGender Levels:")
for i in l:
  if i is not l[2]:
    print(i,trans(i))

Target Column Levels:
שלילי Negative
אחר Other
חיובי Positive

Gender Levels:
נקבה female
זכר male


In [9]:
def translate(data):
  """Return a copy of ``data`` with the Hebrew category labels in English.

  Every cell that exactly equals one of the known Hebrew labels is replaced,
  whichever column it is in; all other values are left untouched. The input
  object is not modified.

  NOTE: this function has the same name as the ``translate`` imported from
  ``mtranslate`` at the top of the notebook, so once this cell has run the
  bare name refers to this helper.

  Args:
    data: DataFrame (or Series) holding the raw labels.

  Returns:
    A new object of the same type with the labels replaced.
  """
  # One mapping and one replace call: a single pass over the frame instead of
  # five, each of which produced a full intermediate copy.
  hebrew_to_english = {
      "שלילי": "Negative",
      "אחר": "Other",
      "חיובי": "Positive",
      "נקבה": "female",
      "זכר": "male",
  }
  return data.replace(hebrew_to_english)

# Rebind `data` to the translated frame returned by the helper defined above it.
data = translate(data)

## Checking Translated Data

In [10]:
def show(data):
  """Print the distinct values of every column except the first one.

  Args:
    data: DataFrame to inspect; its first column is skipped.

  Returns:
    None. One line per remaining column is written to stdout.
  """
  feature_names = data.columns[1:]
  for feature in feature_names:
    levels = data[feature].unique()
    print("Feature: {} with {} Levels".format(feature, levels))

# List the levels of each column after translation (the first column is skipped by show).
show(data)

Feature: cough with [0 1] Levels
Feature: fever with [0 1] Levels
Feature: sore_throat with [0 1] Levels
Feature: shortness_of_breath with [0 1] Levels
Feature: head_ache with [0 1] Levels
Feature: corona_result with ['Negative' 'Other' 'Positive'] Levels
Feature: age_60_and_above with [nan 'Yes' 'No'] Levels
Feature: gender with ['female' 'male' nan] Levels
Feature: test_indication with ['Other' 'Contact with confirmed' 'Abroad'] Levels


## Data Preprocessing

In [11]:
# Omitting all rows with missing values
data.dropna(inplace = True)

# Dropping all rows where corona_result = Other
data.drop(data[data['corona_result']=="Other"].index,inplace = True)

## Dividing the data Year Wise

In [12]:
# Convert the test_date column to datetime values so rows can be ordered and
# compared against date boundaries.
data['test_date'] = pd.to_datetime(data['test_date'])
# Order the rows by test date; the original index labels are kept, not reset.
data = data.sort_values(by='test_date')
# Preview the earliest rows.
data.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
7799798,2020-03-20,0,0,0,0,0,Negative,No,male,Other
7798498,2020-03-20,0,0,0,0,0,Negative,No,male,Other
7798499,2020-03-20,0,0,0,0,0,Negative,No,female,Other
7798500,2020-03-20,0,0,0,0,0,Negative,No,female,Other
7798501,2020-03-20,0,0,0,0,0,Negative,No,male,Other


In [13]:
# Preview the last rows of the date-sorted frame, i.e. the latest test dates.
data.tail()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
280113,2021-10-11,0,0,0,0,0,Negative,No,female,Other
280112,2021-10-11,0,0,0,0,0,Negative,No,female,Other
280111,2021-10-11,0,0,0,0,0,Negative,No,male,Other
280118,2021-10-11,0,0,0,0,0,Negative,No,female,Other
271754,2021-10-11,0,0,0,0,0,Negative,Yes,female,Other


In [14]:
# Disabled alternative: a finer split into six date ranges.
#df1 = data.loc[data['test_date'] < pd.to_datetime('2020-7-1')]
#df2 = data.loc[(data['test_date'] >= pd.to_datetime('2020-7-1')) & (data['test_date'] < pd.to_datetime('2020-9-1'))]
#df3 = data.loc[(data['test_date'] >= pd.to_datetime('2020-9-1')) & (data['test_date'] < pd.to_datetime('2021-1-1'))]
#df4 = data.loc[(data['test_date'] >= pd.to_datetime('2021-1-1')) & (data['test_date'] < pd.to_datetime('2021-5-1'))]
#df5 = data.loc[(data['test_date'] >= pd.to_datetime('2021-5-1')) & (data['test_date'] < pd.to_datetime('2021-8-1'))]
#df6 = data.loc[data['test_date'] >= pd.to_datetime('2021-8-1')]

# Active split: df1 holds the rows dated before 1 Jan 2021, df2 the rows from
# that date onwards.
year_boundary = pd.to_datetime('2021-1-1')
test_dates = data['test_date']
df1 = data.loc[test_dates < year_boundary]
df2 = data.loc[test_dates >= year_boundary]

## Checking Both DataFrames

In [15]:
# Row count, dtypes and memory footprint of the pre-2021 subset.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3238373 entries, 7799798 to 3963486
Data columns (total 10 columns):
 #   Column               Dtype         
---  ------               -----         
 0   test_date            datetime64[ns]
 1   cough                int64         
 2   fever                int64         
 3   sore_throat          int64         
 4   shortness_of_breath  int64         
 5   head_ache            int64         
 6   corona_result        object        
 7   age_60_and_above     object        
 8   gender               object        
 9   test_indication      object        
dtypes: datetime64[ns](1), int64(5), object(4)
memory usage: 271.8+ MB


In [16]:
# Row count, dtypes and memory footprint of the subset dated 2021-01-01 onwards.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2623107 entries, 3926173 to 271754
Data columns (total 10 columns):
 #   Column               Dtype         
---  ------               -----         
 0   test_date            datetime64[ns]
 1   cough                int64         
 2   fever                int64         
 3   sore_throat          int64         
 4   shortness_of_breath  int64         
 5   head_ache            int64         
 6   corona_result        object        
 7   age_60_and_above     object        
 8   gender               object        
 9   test_indication      object        
dtypes: datetime64[ns](1), int64(5), object(4)
memory usage: 220.1+ MB


In [18]:
#df1.isnull().sum()
#df2.isnull().sum()
# List the remaining levels of each df1 column (show skips the first column).
show(df1)

Feature: cough with [0 1] Levels
Feature: fever with [0 1] Levels
Feature: sore_throat with [0 1] Levels
Feature: shortness_of_breath with [0 1] Levels
Feature: head_ache with [0 1] Levels
Feature: corona_result with ['Negative' 'Positive'] Levels
Feature: age_60_and_above with ['No' 'Yes'] Levels
Feature: gender with ['male' 'female'] Levels
Feature: test_indication with ['Other' 'Abroad' 'Contact with confirmed'] Levels


In [19]:
# Count the rows per corona_result label in df1.
df1['corona_result'].value_counts()

Negative    2979142
Positive     259231
Name: corona_result, dtype: int64

In [20]:
# Count the rows per corona_result label in df2.
df2['corona_result'].value_counts()

Negative    2376665
Positive     246442
Name: corona_result, dtype: int64

## Saving the data

In [22]:
# Write each split to its own CSV file; index=False leaves the row index out
# of the files.
for split_frame, csv_path in (
    (df1, "/content/drive/MyDrive/Sem 7 Project/covid_data_2020.csv"),
    (df2, "/content/drive/MyDrive/Sem 7 Project/covid_data_2021.csv"),
):
  split_frame.to_csv(csv_path, index=False)

## Conclusion 

1. The data comes from Israel.
2. The raw data consists of 7,810,748 records (Mar 2020 to Nov 2021).
3. `age_60_and_above` has 1,728,213 null values and `gender` has 591,235 null values.
4. The Hebrew category values were translated into English.
5. Rows with null values and rows where `corona_result` = "Other" were dropped to keep the task a binary classification problem.
6. The data was divided year-wise (2020 and 2021).

  ![Big Data Split](https://raw.githubusercontent.com/mykeysid10/Sem-7-Covid/main/Phase-1-ML/Big%20Data%20Split.png)
7. References
  * [Dataset Link](https://data.gov.il/dataset/covid-19/resource/d337959a-020a-4ed3-84f7-fca182292308?filters=)
  * [ReadMe Pdf](https://github.com/mykeysid10/Sem-7-Covid/blob/main/Phase-1-ML/Dataset%20Description.pdf)
