Pandas Python Library for Data Science

In [2]:
import pandas as pd

In [3]:
# Record the installed pandas version alongside the outputs below.
print(pd.__version__)

2.3.1


In [4]:
# A labelled one-dimensional Series: four integers keyed by the letters 'a'-'d'.
A = pd.Series(
    data=[1, 2, 3, 4],
    index=list('abcd'),
)

In [5]:
A.values  # the Series' data as a NumPy array, without the index labels

array([1, 2, 3, 4])

In [6]:
# Check the concrete type that .values returns.
type(A.values)

numpy.ndarray

In [7]:
# Look up a single element by its index label.
A['a']

np.int64(1)

In [8]:
# Slicing by label includes the end label ('c').
A['a': 'c']

a    1
b    2
c    3
dtype: int64

In [9]:
# Letter grade -> grade points. A Series holds one-dimensional labelled data,
# so the dict keys become the index.
grades_dict = dict(A=4, B=3.5, C=3, D=2.5)
grades = pd.Series(data=grades_dict)

In [10]:
# The dict mixed ints and floats; the resulting array holds floats throughout.
grades.values

array([4. , 3.5, 3. , 2.5])

In [11]:
# Letter grade -> marks, turned into a Series indexed by the grade letter.
marks_dict = dict(zip('ABCD', (85, 75, 65, 55)))
marks = pd.Series(data=marks_dict)

In [12]:
# Display the Series: index labels on the left, values on the right.
marks

A    85
B    75
C    65
D    55
dtype: int64

In [13]:
# Label-based lookup of a single value.
marks['A']

np.int64(85)

In [14]:
# An integer slice on this string-labelled Series selects by position;
# .iloc states that explicitly (first two entries, end position excluded).
marks.iloc[0:2]

A    85
B    75
dtype: int64

In [15]:
# Build a DataFrame from the two Series: each Series becomes one column,
# with rows matched on the Series' index labels.
B = pd.DataFrame(
    {
        'Marks': marks,
        'Grades': grades,
    }
)

In [16]:
# Display the frame: one row per grade letter, one column per source Series.
B

Unnamed: 0,Marks,Grades
A,85,4.0
B,75,3.5
C,65,3.0
D,55,2.5


In [17]:
# Transpose: the original columns become rows and the grade letters become columns.
B.T

Unnamed: 0,A,B,C,D
Marks,85.0,75.0,65.0,55.0
Grades,4.0,3.5,3.0,2.5


In [18]:
# The frame's data as a 2-D NumPy array (one row per frame row, no labels).
B.values

array([[85. ,  4. ],
       [75. ,  3.5],
       [65. ,  3. ],
       [55. ,  2.5]])

In [19]:
# Element at row position 2, column position 0 of that array.
B.values[2, 0]

np.float64(65.0)

In [20]:
# Column labels of the frame.
B.columns

Index(['Marks', 'Grades'], dtype='object')

In [22]:
# Row labels of the frame.
B.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [23]:
# Add a derived column: each mark divided by 90, expressed as a percentage.
B['Scaled for 90 Marks'] = 100 * (B['Marks'] / 90)

In [24]:
# Confirm the 'Scaled for 90 Marks' column was added.
B

Unnamed: 0,Marks,Grades,Scaled for 90 Marks
A,85,4.0,94.444444
B,75,3.5,83.333333
C,65,3.0,72.222222
D,55,2.5,61.111111


In [25]:
# Remove the derived column again, modifying B in place.
B.drop(columns='Scaled for 90 Marks', inplace=True)

In [26]:
# Confirm the frame is back to its two original columns.
B

Unnamed: 0,Marks,Grades
A,85,4.0
B,75,3.5
C,65,3.0
D,55,2.5


In [28]:
# Boolean mask: keep only the rows whose Marks exceed 60, then show them.
C = B.loc[B['Marks'] > 60]
C

Unnamed: 0,Marks,Grades
A,85,4.0
B,75,3.5
C,65,3.0


In [42]:
# Two records with different keys: the columns are the union of the keys (a, b, c),
# and a record lacking a key gets NaN in that column.
D = pd.DataFrame(
    [
        {'a': 1, 'b': 2},
        {'b': -3, 'c': 1},
    ]
)

In [43]:
# Display D; the gaps are the keys each record did not provide.
D

Unnamed: 0,a,b,c
0,1.0,2,
1,,-3,1.0


In [44]:
# Show a copy of D with every missing value replaced by 0 (the result is not assigned back).
D.fillna(0)

Unnamed: 0,a,b,c
0,1.0,2,0.0
1,0.0,-3,1.0


In [45]:
D.dropna

<bound method DataFrame.dropna of      a  b    c
0  1.0  2  NaN
1  NaN -3  1.0>

In [62]:
# Indexing
# .loc  -> selects by the explicit index labels
# .iloc -> selects by implicit integer position (0, 1, 2, ...)
E = pd.Series(list('abc'), index=[1, 3, 5])

In [63]:
# Label slice: index labels 1 through 3, end label included.
E.loc[1:3]

1    a
3    b
dtype: object

In [64]:
# Positional slice: positions 1 and 2, end position excluded.
E.iloc[1:3]

3    b
5    c
dtype: object

In [67]:
# Show B again for reference before indexing it by position.
B

Unnamed: 0,Marks,Grades
A,85,4.0
B,75,3.5
C,65,3.0
D,55,2.5


In [81]:
# Row at position 2, returned as a Series keyed by column name.
B.iloc[2]

Marks     65.0
Grades     3.0
Name: C, dtype: float64

In [82]:
# All rows of the column at position 1.
B.iloc[:,1]

A    4.0
B    3.5
C    3.0
D    2.5
Name: Grades, dtype: float64

In [83]:
from sklearn.impute import SimpleImputer

In [84]:
# Load the dataset; the path is relative, so covid_19_data.csv must be in the working directory.
df = pd.read_csv('covid_19_data.csv')

In [87]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [89]:
# Remove the two columns not used below, modifying df in place.
df.drop(columns=['Last Update', 'SNo'], inplace=True)

In [90]:
# Confirm 'Last Update' and 'SNo' are gone.
df.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0


In [92]:
# Shorten the column names used from here on (renamed in place).
df.rename(
    columns={
        'ObservationDate': 'Date',
        'Province/State': 'State',
        'Country/Region': 'Country',
    },
    inplace=True,
)

In [97]:
# Parse the month/day/four-digit-year strings into datetime values.
df['Date'] = pd.to_datetime(
    df['Date'],
    format='%m/%d/%Y',
)

In [99]:
# Confirm the renamed columns and the parsed Date values.
df.head()

Unnamed: 0,Date,State,Country,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0


In [100]:
# Summary statistics for the date and numeric columns.
# NOTE(review): the recorded output shows negative minimums for Confirmed, Deaths and
# Recovered, which look like data errors for count columns — confirm before relying on totals.
df.describe()

Unnamed: 0,Date,Confirmed,Deaths,Recovered
count,306429,306429.0,306429.0,306429.0
mean,2020-11-06 01:54:54.146441728,85670.91,2036.403268,50420.29
min,2020-01-22 00:00:00,-302844.0,-178.0,-854405.0
25%,2020-07-30 00:00:00,1042.0,13.0,11.0
50%,2020-11-10 00:00:00,10375.0,192.0,1751.0
75%,2021-02-18 00:00:00,50752.0,1322.0,20270.0
max,2021-05-29 00:00:00,5863138.0,112385.0,6399531.0
std,,277551.6,6410.938048,201512.4


In [101]:
# Column dtypes and non-null counts; used here to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       306429 non-null  datetime64[ns]
 1   State      228326 non-null  object        
 2   Country    306429 non-null  object        
 3   Confirmed  306429 non-null  float64       
 4   Deaths     306429 non-null  float64       
 5   Recovered  306429 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 14.0+ MB


In [102]:
# Only State has missing values (see the df.info() output above), so fill just that column.
# A frame-wide fillna('N/A') would also write the string into any numeric column that
# ever contained NaN, turning it into an object column that can no longer be summed.
df['State'] = df['State'].fillna('N/A')

In [103]:
# Re-check the non-null counts after filling the missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       306429 non-null  datetime64[ns]
 1   State      306429 non-null  object        
 2   Country    306429 non-null  object        
 3   Confirmed  306429 non-null  float64       
 4   Deaths     306429 non-null  float64       
 5   Recovered  306429 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 14.0+ MB


In [104]:
# Preview the first ten rows of the cleaned frame.
df.head(10)

Unnamed: 0,Date,State,Country,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0
5,2020-01-22,Guangdong,Mainland China,26.0,0.0,0.0
6,2020-01-22,Guangxi,Mainland China,2.0,0.0,0.0
7,2020-01-22,Guizhou,Mainland China,1.0,0.0,0.0
8,2020-01-22,Hainan,Mainland China,4.0,0.0,0.0
9,2020-01-22,Hebei,Mainland China,1.0,0.0,0.0


In [108]:
# Total the three count columns per country over every row in df, then turn the
# group key back into an ordinary column.
# NOTE(review): the per-date table further down shows Afghanistan's Confirmed only
# growing from day to day, so these columns look like running totals; if so, summing
# across all dates overstates each country — confirm, and consider using each
# country's latest date instead.
df2 = df.groupby('Country')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

In [111]:
# Preview the per-country totals.
# NOTE(review): the first two rows of the recorded output (an 'Azerbaijan' entry with a
# total of 1 and "('St. Martin',)") look like malformed country labels in the raw
# data — confirm and normalise the Country values before grouping.
df2.head(10)

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,17026442.0,669075.0,13464399.0
3,Albania,19768869.0,375955.0,13945256.0
4,Algeria,27684358.0,834464.0,18959299.0
5,Andorra,2379802.0,32100.0,2162473.0
6,Angola,4764863.0,116489.0,3683041.0
7,Antigua and Barbuda,143868.0,4059.0,109958.0
8,Argentina,504802880.0,12112441.0,438750295.0
9,Armenia,42536277.0,770759.0,37101575.0


In [114]:
# One row per (Country, Date): add up the three count columns over all rows sharing
# that pair, then turn the group keys back into ordinary columns.
df3 = (
    df.groupby(['Country', 'Date'])[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)

In [115]:
# Preview the per-country, per-date totals.
df3.head(10)

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
0,Azerbaijan,2020-02-28,1.0,0.0,0.0
1,"('St. Martin',)",2020-03-10,2.0,0.0,0.0
2,Afghanistan,2020-02-24,1.0,0.0,0.0
3,Afghanistan,2020-02-25,1.0,0.0,0.0
4,Afghanistan,2020-02-26,1.0,0.0,0.0
5,Afghanistan,2020-02-27,1.0,0.0,0.0
6,Afghanistan,2020-02-28,1.0,0.0,0.0
7,Afghanistan,2020-02-29,1.0,0.0,0.0
8,Afghanistan,2020-03-01,1.0,0.0,0.0
9,Afghanistan,2020-03-02,1.0,0.0,0.0


In [121]:
# Keep only the (Country, Date) rows with at least 100 confirmed cases.
df4 = df3.loc[df3['Confirmed'] >= 100]

In [122]:
# Preview the filtered rows; the original row labels from df3 are kept.
df4.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
35,Afghanistan,2020-03-28,107.0,4.0,2.0
36,Afghanistan,2020-03-29,118.0,4.0,2.0
37,Afghanistan,2020-03-30,146.0,4.0,2.0
38,Afghanistan,2020-03-31,175.0,4.0,5.0
39,Afghanistan,2020-04-01,197.0,4.0,5.0
