# COVID-19 Dataset Exploration and Analysis

## Internship Task 1 – Data Exploration  

**Name:** Abid Ali  
**Company:** Code Sentinel  

## Importing Required Libraries

In [1]:
import pandas as pd

## 📊 Data Exploration using Pandas

In [2]:
# Took the data set from:
# https://catalog.data.gov/dataset/covid-19-cases-and-deaths-by-age-group

In [3]:
# 1. Load the raw data
df = pd.read_csv("COVID-19-Dataset.csv")

In [4]:
# Check number of rows and columns
print("Shape:", df.shape)   # (rows, columns)
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

Shape: (5337, 9)
Number of rows: 5337
Number of columns: 9


In [5]:
# Checking missing values
df.isnull()

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
5332,False,False,False,False,False,False,False,False,False
5333,False,False,False,False,False,False,False,False,False
5334,False,False,False,False,False,False,False,False,False
5335,False,False,False,False,False,False,False,False,False


In [6]:
df.head()

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
0,05/27/2021,0-9,21266,19931.0,1335.0,5649.0,1,1.0,0.0
1,05/27/2021,Oct-19,41363,37591.0,3772.0,9128.0,3,3.0,0.0
2,05/27/2021,20-29,60808,54548.0,6260.0,13066.0,10,10.0,0.0
3,05/27/2021,30-39,53926,48970.0,4956.0,12189.0,41,39.0,2.0
4,05/27/2021,40-49,48453,44467.0,3986.0,11196.0,150,119.0,31.0


In [7]:
df.tail()

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
5332,05/26/2021,40-49,48434,44453.0,3981.0,11191.0,149,119.0,30.0
5333,05/26/2021,50-59,51834,48000.0,3834.0,9980.0,452,383.0,69.0
5334,05/26/2021,60-69,34348,32177.0,2171.0,7796.0,1084,934.0,150.0
5335,05/26/2021,70-79,17626,16584.0,1042.0,6528.0,1845,1561.0,284.0
5336,05/26/2021,80 and older,16524,15074.0,1450.0,10006.0,4642,3720.0,922.0


In [8]:
# Checking Data Types
df.dtypes

DateUpdated          object
AgeGroups            object
Total cases           int64
Confirmed cases     float64
Probable cases      float64
Total case rate     float64
Total deaths          int64
Confirmed deaths    float64
Probable deaths     float64
dtype: object

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5337 entries, 0 to 5336
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   DateUpdated       5337 non-null   object 
 1   AgeGroups         5337 non-null   object 
 2   Total cases       5337 non-null   int64  
 3   Confirmed cases   4833 non-null   float64
 4   Probable cases    4833 non-null   float64
 5   Total case rate   5336 non-null   float64
 6   Total deaths      5337 non-null   int64  
 7   Confirmed deaths  4833 non-null   float64
 8   Probable deaths   4833 non-null   float64
dtypes: float64(5), int64(2), object(2)
memory usage: 375.4+ KB


In [10]:
df.columns

Index(['DateUpdated', 'AgeGroups', 'Total cases', 'Confirmed cases',
       'Probable cases', 'Total case rate', 'Total deaths', 'Confirmed deaths',
       'Probable deaths'],
      dtype='object')

In [11]:
df.describe()

Unnamed: 0,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
count,5337.0,4833.0,4833.0,5336.0,5337.0,4833.0,4833.0
mean,35974.38917,35819.734533,3588.389613,8950.797414,793.446693,693.751707,156.010759
std,33755.812122,29918.278975,3809.839755,7451.492038,1351.859386,1120.720904,281.171572
min,49.0,494.0,21.0,13.0,0.0,1.0,0.0
25%,7363.0,10386.0,668.0,1839.75,4.0,5.0,0.0
50%,25392.0,28507.0,2186.0,7839.0,112.0,119.0,29.0
75%,54229.0,52034.0,5083.0,12779.75,989.0,934.0,163.0
max,137619.0,120827.0,16792.0,39027.0,5808.0,4613.0,1195.0


In [12]:
df.isnull().sum() # getting the sum of mssing values in each column

DateUpdated           0
AgeGroups             0
Total cases           0
Confirmed cases     504
Probable cases      504
Total case rate       1
Total deaths          0
Confirmed deaths    504
Probable deaths     504
dtype: int64

In [13]:
age_summary = df.groupby('AgeGroups', observed=True)['Total cases'].sum()
age_summary

AgeGroups
0-9             14125578
20-29           32793554
30-39           30480631
40-49           26714643
50-59           27718477
60-69           18677354
70-79            9670716
80 and older     8916300
Oct-19          22898062
Name: Total cases, dtype: int64

In [14]:
# Babies (0-9)
babies = df[df['AgeGroups'] == '0-9']
babies

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
0,05/27/2021,0-9,21266,19931.0,1335.0,5649.0,1,1.0,0.0
9,05/28/2021,0-9,21298,19963.0,1335.0,5658.0,1,1.0,0.0
18,06/01/2021,0-9,21335,19988.0,1347.0,5667.0,1,1.0,0.0
27,06/02/2021,0-9,21343,19992.0,1351.0,5670.0,1,1.0,0.0
36,06/03/2021,0-9,21359,20006.0,1353.0,5674.0,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...
5292,05/20/2021,0-9,21148,19829.0,1319.0,5618.0,1,1.0,0.0
5301,05/21/2021,0-9,21174,19852.0,1322.0,5625.0,1,1.0,0.0
5310,05/24/2021,0-9,21216,19890.0,1326.0,5636.0,1,1.0,0.0
5319,05/25/2021,0-9,21229,19898.0,1331.0,5639.0,1,1.0,0.0


In [15]:
# Young Adults (20-29)
young_adults = df[df['AgeGroups'] == '20-29']
young_adults

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
2,05/27/2021,20-29,60808,54548.0,6260.0,13066.0,10,10.0,0.0
11,05/28/2021,20-29,60837,54570.0,6267.0,13072.0,10,10.0,0.0
20,06/01/2021,20-29,60872,54599.0,6273.0,13080.0,10,10.0,0.0
29,06/02/2021,20-29,60883,54602.0,6281.0,13082.0,10,10.0,0.0
38,06/03/2021,20-29,60899,54619.0,6280.0,13086.0,10,10.0,0.0
...,...,...,...,...,...,...,...,...,...
5294,05/20/2021,20-29,60614,54392.0,6222.0,13024.0,10,10.0,0.0
5303,05/21/2021,20-29,60643,54406.0,6237.0,13031.0,10,10.0,0.0
5312,05/24/2021,20-29,60736,54492.0,6244.0,13051.0,10,10.0,0.0
5321,05/25/2021,20-29,60766,54512.0,6254.0,13057.0,10,10.0,0.0


In [16]:
# Adults (30-39, 40-49, 50-59)
adults = df[df['AgeGroups'].isin(['30-39', '40-49', '50-59'])]
adults

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
3,05/27/2021,30-39,53926,48970.0,4956.0,12189.0,41,39.0,2.0
4,05/27/2021,40-49,48453,44467.0,3986.0,11196.0,150,119.0,31.0
5,05/27/2021,50-59,51855,48017.0,3838.0,9984.0,453,383.0,70.0
12,05/28/2021,30-39,53964,48995.0,4969.0,12197.0,42,40.0,2.0
13,05/28/2021,40-49,48486,44498.0,3988.0,11203.0,150,119.0,31.0
...,...,...,...,...,...,...,...,...,...
5323,05/25/2021,40-49,48430,44449.0,3981.0,11190.0,148,118.0,30.0
5324,05/25/2021,50-59,51826,47991.0,3835.0,9978.0,452,383.0,69.0
5331,05/26/2021,30-39,53900,48949.0,4951.0,12183.0,41,39.0,2.0
5332,05/26/2021,40-49,48434,44453.0,3981.0,11191.0,149,119.0,30.0


In [17]:
# Old (60-69, 70-79, 80+)
old = df[df['AgeGroups'].isin(['60-69', '70-79', '80+'])]
old

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
6,05/27/2021,60-69,34355,32182.0,2173.0,7798.0,1084,934.0,150.0
7,05/27/2021,70-79,17628,16587.0,1041.0,6528.0,1846,1562.0,284.0
15,05/28/2021,60-69,34361,32185.0,2176.0,7799.0,1087,936.0,151.0
16,05/28/2021,70-79,17631,16588.0,1043.0,6530.0,1847,1563.0,284.0
24,06/01/2021,60-69,34375,32198.0,2177.0,7802.0,1088,937.0,151.0
...,...,...,...,...,...,...,...,...,...
5317,05/24/2021,70-79,17615,16578.0,1037.0,6524.0,1842,1558.0,284.0
5325,05/25/2021,60-69,34339,32167.0,2172.0,7794.0,1082,932.0,150.0
5326,05/25/2021,70-79,17620,16582.0,1038.0,6525.0,1843,1559.0,284.0
5334,05/26/2021,60-69,34348,32177.0,2171.0,7796.0,1084,934.0,150.0


In [18]:
# Highest Cases
top_days = df.sort_values('Total cases', ascending=False).head(10)
top_days

Unnamed: 0,DateUpdated,AgeGroups,Total cases,Confirmed cases,Probable cases,Total case rate,Total deaths,Confirmed deaths,Probable deaths
2414,06/24/2022,20-29,137619,120827.0,16792.0,29085.0,28,25.0,3.0
2405,06/23/2022,20-29,137520,120740.0,16780.0,29064.0,28,25.0,3.0
2396,06/22/2022,20-29,137414,120654.0,16760.0,29041.0,28,25.0,3.0
2387,06/21/2022,20-29,137324,120584.0,16740.0,29022.0,28,25.0,3.0
2378,06/20/2022,20-29,137267,120540.0,16727.0,29010.0,28,25.0,3.0
2369,06/17/2022,20-29,137090,120369.0,16721.0,28973.0,28,25.0,3.0
2216,06/16/2022,20-29,137001,120298.0,16703.0,29438.0,28,25.0,3.0
2207,06/15/2022,20-29,136935,120240.0,16695.0,28940.0,28,25.0,3.0
2198,06/14/2022,20-29,136834,120154.0,16680.0,28919.0,28,25.0,3.0
2081,06/13/2022,20-29,136758,120092.0,16666.0,28903.0,28,25.0,3.0
