#### Downloading Dataset

In [None]:
from urllib.request import urlretrieve
urlretrieve('https://hub.jovian.ml/wp-content/uploads/2020/09/italy-covid-daywise.csv', 
            'italy-covid-daywise.csv')


#### Reading CSV file in Pandas

In [1]:
import pandas as pd

In [2]:
covid_df = pd.read_csv("italy-covid-daywise.csv") #it reads the file as Dataframe Format

In [3]:
type(covid_df) 

pandas.core.frame.DataFrame

In [4]:
covid_df

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
...,...,...,...,...
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,


In [5]:
covid_df.info() #used getting information of number counts of features , data type of features

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        248 non-null    object 
 1   new_cases   248 non-null    float64
 2   new_deaths  248 non-null    float64
 3   new_tests   135 non-null    float64
dtypes: float64(3), object(1)
memory usage: 7.9+ KB


In [6]:
covid_df.describe() # used for getting mean , std , min , max of features

Unnamed: 0,new_cases,new_deaths,new_tests
count,248.0,248.0,135.0
mean,1094.818548,143.133065,31699.674074
std,1554.508002,227.105538,11622.209757
min,-148.0,-31.0,7841.0
25%,123.0,3.0,25259.0
50%,342.0,17.0,29545.0
75%,1371.75,175.25,37711.0
max,6557.0,971.0,95273.0


In [7]:
covid_df.columns # the column property will retrieve columns in the dataset

Index(['date', 'new_cases', 'new_deaths', 'new_tests'], dtype='object')

In [8]:
covid_df.shape # it returns the shape of dataset 

(248, 4)

In [9]:
import jovian

<IPython.core.display.Javascript object>

[jovian] Update Available: 0.2.26 --> 0.2.28
[jovian] Run `!pip install jovian --upgrade` to upgrade


In [None]:
jovian.commit(project='python-pandas-data-analysis')


#### Retreiving Data From a Dataset
Let's first understand inner workings of dataframe.

In [10]:
covid_data_dict = {
    'date':['2020-08-30', '2020-08-31', '2020-09-01', '2020-09-02', '2020-09-03'],
    'new_cased':[1444, 1365, 996, 975, 1326],
    'new_deaths':[1, 4, 6, 8, 6],
    'new_tests':[53541, 42583, 54395, None, None]
}

In [11]:
covid_data_dict['new_cased']

[1444, 1365, 996, 975, 1326]

In [12]:
covid_df['new_cases'] # each column is stored as a series

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
243    1444.0
244    1365.0
245     996.0
246     975.0
247    1326.0
Name: new_cases, Length: 248, dtype: float64

In [13]:
type(covid_df['new_cases'])

pandas.core.series.Series

##### like arrays we can retreive values using index of array

In [14]:
covid_df['new_cases'][240]

1366.0

#### pandas provide at() method for retreiving array values with index

In [17]:
covid_df.at[240,'new_tests']

57640.0

##### we can access columns(if not spaced) using . property

In [18]:
covid_df.new_tests

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
243    53541.0
244    42583.0
245    54395.0
246        NaN
247        NaN
Name: new_tests, Length: 248, dtype: float64

##### we can also retreive set of columns

In [20]:
cases_df = covid_df[['date','new_cases']]
cases_df

Unnamed: 0,date,new_cases
0,2019-12-31,0.0
1,2020-01-01,0.0
2,2020-01-02,0.0
3,2020-01-03,0.0
4,2020-01-04,0.0
...,...,...
243,2020-08-30,1444.0
244,2020-08-31,1365.0
245,2020-09-01,996.0
246,2020-09-02,975.0


##### cases_df is a subset of covid_df , if one's data is changed it will reflect in the other one. For this pandas provide copy() to copy the dataframe to another and by changing one does not affect other.

In [21]:
copy_covid_df = covid_df.copy()

##### for retreving specific row data , pandas provide .loc[] method

In [22]:
covid_df.loc[243]

date          2020-08-30
new_cases           1444
new_deaths             1
new_tests          53541
Name: 243, dtype: object

In [23]:
covid_df.head(5)

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,


In [24]:
covid_df.tail(5)

Unnamed: 0,date,new_cases,new_deaths,new_tests
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,
247,2020-09-03,1326.0,6.0,


In [25]:
covid_df.at[0,'new_tests']

nan

In [26]:
type(covid_df.at[0,'new_tests'])

numpy.float64

##### pandas provide first_valid_index() function to retreive where the valid number comes in the column

In [27]:
covid_df.new_tests.first_valid_index()

111

In [29]:
covid_df.loc[108:113]

Unnamed: 0,date,new_cases,new_deaths,new_tests
108,2020-04-17,3786.0,525.0,
109,2020-04-18,3493.0,575.0,
110,2020-04-19,3491.0,480.0,
111,2020-04-20,3047.0,433.0,7841.0
112,2020-04-21,2256.0,454.0,28095.0
113,2020-04-22,2729.0,534.0,44248.0


##### we can also use .sampe() method to retreive random samples from data.

In [30]:
covid_df.sample(10)

Unnamed: 0,date,new_cases,new_deaths,new_tests
73,2020-03-13,2651.0,189.0,
173,2020-06-21,264.0,49.0,24581.0
38,2020-02-07,0.0,0.0,
55,2020-02-24,53.0,0.0,
28,2020-01-28,0.0,0.0,
99,2020-04-08,3039.0,604.0,
208,2020-07-26,274.0,5.0,25177.0
106,2020-04-15,2972.0,604.0,
138,2020-05-17,875.0,153.0,33505.0
200,2020-07-18,231.0,11.0,27569.0


# Analyzing data from dataframes

Q: What are the total number of reported cases and deaths related to Covid-19 in Italy? <br>
Series object in pandas support numpy operations

In [32]:
total_cases = covid_df.new_cases.sum()
total_cases

271515.0

In [34]:
total_deaths = covid_df.new_deaths.sum()
total_deaths

35497.0

In [35]:
print('The number of reported cases is {} and the number of reported deaths is {}.'.format(int(total_cases), int(total_deaths)))


The number of reported cases is 271515 and the number of reported deaths is 35497.


Q: What is the overall death rate (ratio of reported deaths to reported cases)?



In [36]:
death_rate = covid_df.new_tests.sum() / covid_df.new_cases.sum()

In [37]:
print("The overall reported death rate in Italy is {:.2f} %.".format(death_rate*100))


The overall reported death rate in Italy is 1576.14 %.


Q: What is the overall number of tests conducted? A total of 935310 tests were conducted before daily test numbers were reported.

In [39]:
intial_tests = 935310
total_tests = covid_df.new_tests.sum() + intial_tests

In [40]:
total_tests

5214766.0

Q: What fraction of tests returned a positive result?



In [44]:
positive_rate= total_cases / total_tests

In [45]:
positive_rate

0.05206657403227681

In [46]:
print('{:.2f}% of tests in Italy led to a positive diagnosis.'.format(positive_rate*100))


5.21% of tests in Italy led to a positive diagnosis.
