# Pandas: Data manipulation

In [1]:
import pandas as pd

In [2]:
data = {"students":["Adam", "Monica", "John"], #Create a simple data via dictionary
     "born":[1994, 1989, 2011],
     "academic degree":[None,"Bc.","MSc."],
     "active":[True,False,False]}


In [3]:
data

{'students': ['Adam', 'Monica', 'John'],
 'born': [1994, 1989, 2011],
 'academic degree': [None, 'Bc.', 'MSc.'],
 'active': [True, False, False]}

In [4]:
df = pd.DataFrame(data) #transform data to pandas DataFrame

In [5]:
df

Unnamed: 0,students,born,academic degree,active
0,Adam,1994,,True
1,Monica,1989,Bc.,False
2,John,2011,MSc.,False


In [6]:
type(df)

pandas.core.frame.DataFrame

In [7]:
df["children"] = [0,1,2] #add a new column "children"

In [8]:
df

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,0
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,2


In [9]:
#create new data, which we want to append to DataFrame
new_students =  {
                "students":["Clara", "Johny", "Michael"],
                "born":[1984, 1989, 1920],
                "academic degree":["PhD.","Bc.","MSc."],
                "active":[True,False,False],
                "children":[2,0,4]
                }


In [10]:
df = df.append(pd.DataFrame(new_students), sort = False) #append the data

In [11]:
df #notice the index values

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,0
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,2
0,Clara,1984,PhD.,True,2
1,Johny,1989,Bc.,False,0
2,Michael,1920,MSc.,False,4


In [12]:
df.reset_index(inplace = True, drop = True) #Reset index values. Inplace rewrites df in place...
                                            #...without creating a copy as a new object
                                            #drop = false would insert a column "index"

In [13]:
df.drop(columns=["active", "children"]) #delete columns

Unnamed: 0,students,born,academic degree
0,Adam,1994,
1,Monica,1989,Bc.
2,John,2011,MSc.
3,Clara,1984,PhD.
4,Johny,1989,Bc.
5,Michael,1920,MSc.


In [14]:
df.drop([1]) #delete second row

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,0
2,John,2011,MSc.,False,2
3,Clara,1984,PhD.,True,2
4,Johny,1989,Bc.,False,0
5,Michael,1920,MSc.,False,4


### Accesing one column

In [15]:
born = df["born"].values #.values returns a numpy array
print(born)
type(born)

[1994 1989 2011 1984 1989 1920]


numpy.ndarray

In [16]:
degrees = df["academic degree"].values
print(degrees)
type(degrees)

[None 'Bc.' 'MSc.' 'PhD.' 'Bc.' 'MSc.']


numpy.ndarray

### Accesing one row

In [17]:
monica = df.iloc[1] #iloc means integer location
print(monica)
type(monica)
list(monica) #make a list of monica data

students           Monica
born                 1989
academic degree       Bc.
active              False
children                1
Name: 1, dtype: object


['Monica', 1989, 'Bc.', False, 1]

In [18]:
monica = df.loc[df["students"]=="Monica"] #acces a row via boolean expression
monica


Unnamed: 0,students,born,academic degree,active,children
1,Monica,1989,Bc.,False,1


### Accesing one element

In [19]:
df.iloc[1,2] #integer location again

'Bc.'

In [20]:
df.loc[df["students"]=="Monica","born"] #boolean location again


1    1989
Name: born, dtype: int64

In [21]:
df.loc[df["students"]=="Monica","born"].values #get a value of an element

array([1989], dtype=int64)

### Operations with DataFrame

In [22]:
df.describe()#returns basic statistics of numerical data in DataFrame

Unnamed: 0,born,children
count,6.0,6.0
mean,1981.166667,1.5
std,31.390551,1.516575
min,1920.0,0.0
25%,1985.25,0.25
50%,1989.0,1.5
75%,1992.75,2.0
max,2011.0,4.0


In [23]:
df.mean()

born        1981.166667
active         0.333333
children       1.500000
dtype: float64

In [24]:
df.std()

born        31.390551
active       0.516398
children     1.516575
dtype: float64

In [25]:
df.max()

students    Monica
born          2011
active        True
children         4
dtype: object

In [26]:
df.children.mean()

1.5

In [27]:
#Accesing specific data via boolean expression - mask
mask = df["born"] > df["born"].mean()
mask

0     True
1     True
2     True
3     True
4     True
5    False
Name: born, dtype: bool

In [28]:
df[mask]

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,0
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,2
3,Clara,1984,PhD.,True,2
4,Johny,1989,Bc.,False,0


In [29]:
df.sort_values(["children"]) #sorting data by number of children

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,0
4,Johny,1989,Bc.,False,0
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,2
3,Clara,1984,PhD.,True,2
5,Michael,1920,MSc.,False,4


In [30]:
df.sort_values(["children"], ascending = False) #descending

Unnamed: 0,students,born,academic degree,active,children
5,Michael,1920,MSc.,False,4
2,John,2011,MSc.,False,2
3,Clara,1984,PhD.,True,2
1,Monica,1989,Bc.,False,1
0,Adam,1994,,True,0
4,Johny,1989,Bc.,False,0


In [31]:
df.sort_values(["children","born"], ascending = [True, False]) #sorting by 2 categories

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,0
4,Johny,1989,Bc.,False,0
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,2
3,Clara,1984,PhD.,True,2
5,Michael,1920,MSc.,False,4


### Reading CSV

In [32]:
#there are many options in read_csv, study the documentation
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
data = pd.read_csv(url) # coronavirus data

In [33]:
data.head(10) # if we have large data, watch only few rows with .head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,10/30/20,10/31/20,11/1/20,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,41334,41425,41501,41633,41728,41814,41935,41975,42033,42092
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,20634,20875,21202,21523,21904,22300,22721,23210,23705,24206
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,57651,57942,58272,58574,58979,59527,60169,60800,61381,62051
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,4665,4756,4825,4888,4910,5045,5135,5135,5319,5383
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,10558,10805,11035,11228,11577,11813,12102,12223,12335,12433
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,127,128,128,128,128,130,130,130,131,131
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,1157179,1166924,1173533,1183131,1195276,1205928,1217028,1228814,1236851,1242182
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,87432,89813,92254,93448,94776,97150,99563,101773,104249,106424
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,114,114,114,114,114,114,114,114,114,114
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,4421,4425,4432,4435,4443,4445,4454,4459,4462,4469


In [34]:
mask = data["11/8/20"] == data["11/8/20"].max() #who has the biggest number on 8th of November?
data[mask]

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,10/30/20,10/31/20,11/1/20,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20
244,,US,40.0,-100.0,1,1,2,2,5,5,...,9044278,9133404,9206998,9291087,9382617,9485448,9607336,9733816,9862228,9968155


## Interactive visualization using plotly

In [35]:
import plotly
import plotly.express as px

In [36]:
# world map visualization
fig = px.scatter_geo(data,                  # pandas dataframe
                     lat="Lat", lon="Long", # latitude and longitude 
                     size="11/8/20",        # size of markers are proportional to the number from 8th of November
                     projection="natural earth") # type of a world map projection - study other options

plotly.offline.plot(fig, filename='covid_worldmap.html') #save the interactive plot to html.
                                                         #you can open it in a web browser

'covid_worldmap.html'

In [37]:
# time series of USA (index 244 in dataframe)
data_time_series = data.iloc[244,4:-1]
data_time_series

1/22/20          1
1/23/20          1
1/24/20          2
1/25/20          2
1/26/20          5
            ...   
11/3/20    9382617
11/4/20    9485448
11/5/20    9607336
11/6/20    9733816
11/7/20    9862228
Name: 244, Length: 291, dtype: object

In [38]:
# interactive line visualization
fig2 = px.line(x=data_time_series.index, y=data_time_series.values)
plotly.offline.plot(fig2, filename='covid_time_series.html') #save the interactive plot to html.
                                                             #you can open it in a web browser

'covid_time_series.html'

### Subplots

In [39]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [40]:
# make instance of subplots
fig3 = make_subplots(rows=2, cols=1)

# add trace to row 1, col 1
fig3.add_trace(go.Scatter(x=data_time_series.index, y=data_time_series.values,
                          name="Line infections", showlegend=True),
               row=1, col=1)

# add trace to row 2, col 1
fig3.add_trace(go.Bar(x=data_time_series.index, y=data_time_series.values,
                     name="Bar infections", showlegend=True),
               row=2, col=1)

# you can manage figure parameters here
fig3.update_layout(width=800, title_text="Subplots")

# export to html
plotly.offline.plot(fig3, filename='subplots.html')

'subplots.html'