# Pandas

### helps to store data with index just like a book has index page using this we can navigate to the required page.

### Creating a Series.
#### Series is somewhat similar to numPy array. Only difference is Series has indexing

In [1]:
# creating pandas series with list

In [2]:
import pandas as pd

In [3]:
my_list = [10,200,58]

In [4]:
series = pd.Series(my_list)

In [5]:
series

0     10
1    200
2     58
dtype: int64

In [6]:
series.index

RangeIndex(start=0, stop=3, step=1)

In [7]:
series.values

array([ 10, 200,  58])

In [8]:
# Creating pandas Series with numPy Array

In [9]:
import numpy as np
import pandas as pd

index = ['a','b','c']
arr = np.array([10,39,122])

pd.Series(data=arr, index = index)

a     10
b     39
c    122
dtype: int64

In [10]:
#Creating pandas Series with dictionary

In [11]:
import pandas as pd

dict = {'a':45,'b':83,'c':78}
pd.Series(dict)

a    45
b    83
c    78
dtype: int64

In [12]:
# main puporse:1. in real scenarios data can come in diff. formats converting them to single pandas Series format. 2.Then transformation logic can be applied

# Imprtance of Index in Series
### Key to using Series is understanding its Index
### Pandas makes use of these index names or numbers by allowing for 𝒇𝒂𝒔𝒕 𝒍𝒐𝒐𝒌𝒖𝒑𝒔 of information (𝕨𝕠𝕣𝕜𝕤 𝕝𝕚𝕜𝕖 𝕙𝕒𝕤𝕙 𝕠𝕣 𝕕𝕚𝕔𝕥𝕚𝕠𝕟𝕒𝕣𝕪)

In [13]:
# custom index

In [14]:
import pandas as pd
ser1 = pd.Series(['rank1','rank2','rank3','rank4'],index=['India','Poland','Russia','Spain'])
ser2 = pd.Series(['rank5','rank6','rank7','rank8'],index=['Italy','Bhutan','Japan','Korea'])

print(ser1)

India     rank1
Poland    rank2
Russia    rank3
Spain     rank4
dtype: object


In [15]:
print(ser1+ser2)

Bhutan    NaN
India     NaN
Italy     NaN
Japan     NaN
Korea     NaN
Poland    NaN
Russia    NaN
Spain     NaN
dtype: object


# DataFrames

### Topics to touch upon:
#### How to Create a DataFrame, the primary data structure in pandas
#### How to find shape and rank of the created or existing DataFrames
#### How to read DataFrame from a file
#### What are indexes, and how do they work in domain of Pandas DataFrames

In [16]:
#creating a DataFrame manually
import pandas as pd
import numpy as np

df = pd.DataFrame([[1,2,3,4],
                  [4,5,2,5],
                  [6,7,8,4],
                  [9,8,1,5]])
df

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,4,5,2,5
2,6,7,8,4
3,9,8,1,5


In [17]:
type(df)

pandas.core.frame.DataFrame

In [18]:
print("Shape :",df.shape)
print("Index :",df.index)

Shape : (4, 4)
Index : RangeIndex(start=0, stop=4, step=1)


### 𝑼𝒔𝒊𝒏𝒈 𝒊𝒏𝒅𝒆𝒙 𝒊𝒏 𝑫𝒂𝒕𝒂𝑭𝒓𝒂𝒎𝒆𝒔

In [19]:
df2 = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9],[9,8,7]],
                  index=['a','b','c','d'], columns=['x','y','z'])

df2


Unnamed: 0,x,y,z
a,1,2,3
b,4,5,6
c,7,8,9
d,9,8,7


# Let's feel the coding rather than just typing
## The Weather Dataset: Reading DataFrames from Files

In [20]:
# Read the data in DataFrame

weather_df = pd.read_csv("weather_2012.csv")
print("Shape :",weather_df.shape)
print("Index :",weather_df.index)

Shape : (8784, 8)
Index : RangeIndex(start=0, stop=8784, step=1)


In [21]:
weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog


In [22]:
# single column is called Series whereas multiple columns is called an DataFrame
weather_df['Date/Time'].head()
#type(weather_df['Date/Time'])

0    2012-01-01 00:00:00
1    2012-01-01 01:00:00
2    2012-01-01 02:00:00
3    2012-01-01 03:00:00
4    2012-01-01 04:00:00
Name: Date/Time, dtype: object

In [23]:
#Just by looking at data pandas is not able to determine datatype of data.
#Therefore let's convert the Data/Time column datatype from object to timestamp or datetime so that we can access
#the month directly using attribute dt.month.

In [24]:
weather_df['Date/Time'] = pd.to_datetime(weather_df['Date/Time']) #Here telling pandas that this is a datetime column. To perform basic datetime operation on Date/Time coloumn

In [25]:
weather_df['Date/Time'].head()

0   2012-01-01 00:00:00
1   2012-01-01 01:00:00
2   2012-01-01 02:00:00
3   2012-01-01 03:00:00
4   2012-01-01 04:00:00
Name: Date/Time, dtype: datetime64[ns]

# How to Analyze DataFrames?

### The following functions help us to understand and explore summaries of data without having to view the whole DataFrame.

# .info()
### Provides a summary of a DataFrame

In [26]:
weather_df.info() #for docstring of any python function just move your coursor inside paranthesis and do shift+tab


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date/Time           8784 non-null   datetime64[ns]
 1   Temp (C)            8784 non-null   float64       
 2   Dew Point Temp (C)  8784 non-null   float64       
 3   Rel Hum (%)         8784 non-null   int64         
 4   Wind Spd (km/h)     8784 non-null   int64         
 5   Visibility (km)     8784 non-null   float64       
 6   Stn Press (kPa)     8784 non-null   float64       
 7   Weather             8784 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 549.1+ KB


# .head()
____________________________________________________________________________________

#### It is used to preview a part of a large DataFrame, similar to linux head command.This reduces time and resources required if whole DataFrame was to be fetched instead.Shows the first N rows in the Data(by default, N=5).

In [27]:
weather_df.head(5)

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog


In [28]:
weather_df.tail() #gives last 5 contents of DataFrame

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
8779,2012-12-31 19:00:00,0.1,-2.7,81,30,9.7,100.13,Snow
8780,2012-12-31 20:00:00,0.2,-2.4,83,24,9.7,100.03,Snow
8781,2012-12-31 21:00:00,-0.5,-1.5,93,28,4.8,99.95,Snow
8782,2012-12-31 22:00:00,-0.2,-1.8,89,28,9.7,99.91,Snow
8783,2012-12-31 23:00:00,0.0,-2.1,86,30,11.3,99.89,Snow


# .index
_____________________________________________________________________________________

##### This attribute proved the index of the dataframe.

##### Indexing identifies data using know indicators that allows intuitive getting and setting of subsets of the data set.

##### A major advantage of Pandas over NumPy is that each of the columns and rows has a label. Wroking with column positions is possible, but it can be hard to keep track of which number corresponds to which column.

##### We can work with labels using the 𝒑𝒂𝒏𝒅𝒂𝒔.𝑫𝒂𝒕𝒂𝑭𝒓𝒂𝒎𝒆.𝒍𝒐𝒄 method, which allows us to index using labels intead of positions.

In [29]:
weather_df.index

RangeIndex(start=0, stop=8784, step=1)

# .unique()
_______________________________________________________________________________________

##### This method, which belongs to the Series object, can be useful when trying to identify unique values in a column.

##### Uniques are returned inorder of appearance
##### It is significantly faster than numpy.unique and includes n/a values

In [30]:
weather_df['Weather']

0                        Fog
1                        Fog
2       Freezing Drizzle,Fog
3       Freezing Drizzle,Fog
4                        Fog
                ...         
8779                    Snow
8780                    Snow
8781                    Snow
8782                    Snow
8783                    Snow
Name: Weather, Length: 8784, dtype: object

In [31]:
weather_df['Weather'].unique()

array(['Fog', 'Freezing Drizzle,Fog', 'Mostly Cloudy', 'Cloudy', 'Rain',
       'Rain Showers', 'Mainly Clear', 'Snow Showers', 'Snow', 'Clear',
       'Freezing Rain,Fog', 'Freezing Rain', 'Freezing Drizzle',
       'Rain,Snow', 'Moderate Snow', 'Freezing Drizzle,Snow',
       'Freezing Rain,Snow Grains', 'Snow,Blowing Snow', 'Freezing Fog',
       'Haze', 'Rain,Fog', 'Drizzle,Fog', 'Drizzle',
       'Freezing Drizzle,Haze', 'Freezing Rain,Haze', 'Snow,Haze',
       'Snow,Fog', 'Snow,Ice Pellets', 'Rain,Haze', 'Thunderstorms,Rain',
       'Thunderstorms,Rain Showers', 'Thunderstorms,Heavy Rain Showers',
       'Thunderstorms,Rain Showers,Fog', 'Thunderstorms',
       'Thunderstorms,Rain,Fog',
       'Thunderstorms,Moderate Rain Showers,Fog', 'Rain Showers,Fog',
       'Rain Showers,Snow Showers', 'Snow Pellets', 'Rain,Snow,Fog',
       'Moderate Rain,Fog', 'Freezing Rain,Ice Pellets,Fog',
       'Drizzle,Ice Pellets,Fog', 'Drizzle,Snow', 'Rain,Ice Pellets',
       'Drizzle,Snow,Fog', 

In [32]:
weather_df['Wind Spd (km/h)'].unique()

array([ 4,  7,  6,  9, 15, 13, 20, 22, 19, 24, 30, 35, 39, 32, 33, 26, 44,
       43, 48, 37, 28, 17, 11,  0, 83, 70, 57, 46, 41, 52, 50, 63, 54,  2])

# .nunique()
________________________________________________________________________________________
#### This method belongs to the Series object and can be useful when tryting to identify the number of unique values in a column.
#### Exuldes NA vlaues by default
#### Always returns an integer value

In [33]:
weather_df['Weather'].nunique()

50

In [34]:
len(weather_df['Weather'].unique())

50

# .value_counts()
________________________________________________________________________________________

#### This method also belongs to Series object, can be useful when trying to idenfity unique values and their counts in a coloumn

The sefulting object will be in descending order so that the first element is the most frequently-occurring element.

Exludes NA vlaues by default.

In [35]:
weather_df['Weather'].value_counts()

Weather
Mainly Clear                               2106
Mostly Cloudy                              2069
Cloudy                                     1728
Clear                                      1326
Snow                                        390
Rain                                        306
Rain Showers                                188
Fog                                         150
Rain,Fog                                    116
Drizzle,Fog                                  80
Snow Showers                                 60
Drizzle                                      41
Snow,Fog                                     37
Snow,Blowing Snow                            19
Rain,Snow                                    18
Thunderstorms,Rain Showers                   16
Haze                                         16
Drizzle,Snow,Fog                             15
Freezing Rain                                14
Freezing Drizzle,Snow                        11
Freezing Drizzle                

In [36]:
weather_df['Weather'].value_counts()[3]

  weather_df['Weather'].value_counts()[3]


np.int64(1326)

In [37]:
weather_df['Wind Spd (km/h)'].value_counts()

Wind Spd (km/h)
9     830
11    791
13    735
15    719
7     677
17    666
19    616
6     609
20    496
4     474
22    439
24    374
0     309
26    242
28    205
30    161
32    139
33     85
35     53
37     45
39     24
41     22
44     14
43     13
48     13
46     11
52      7
57      5
50      4
2       2
70      1
83      1
63      1
54      1
Name: count, dtype: int64

In [38]:
weather_df['Wind Spd (km/h)'].value_counts()[4]

np.int64(474)

# Data Manipulation: Gets you desired results

# Selection(Part 1)

### How do you select particular rows/columns from the DataFrame?

The DataFrame object supports indexing operations just like the Python list and class and
the Pandas Series object, but is much faster and more powerful.

Note: When we extract a single row or column, we get a one-dimensional object as output. That is called a pandas Series. The valuees on the left are just labels taken from the dataframe index.

𝓞𝓷 𝓽𝓱𝓮 𝓸𝓽𝓱𝓮𝓻 𝓱𝓪𝓷𝓭, 𝔀𝓱𝓮𝓷 𝔀𝓮 𝓮𝔁𝓽𝓻𝓪𝓬𝓽 𝓹𝓸𝓻𝓽𝓲𝓸𝓷𝓼 𝓸𝓯 𝓪 𝓹𝓪𝓷𝓭𝓪𝓼 𝓭𝓪𝓽𝓪𝓻𝓪𝓶𝓮, 𝔀𝓮 𝓰𝓮𝓽 𝓪 𝟐𝓓 𝓓𝓪𝓽𝓪𝓕𝓻𝓪𝓶𝓮 𝓽𝔂𝓹𝓮 𝓸𝓯 𝓸𝓫𝓳𝓮𝓬𝓽. 𝓢𝓸𝓶𝓮𝓽𝓱𝓲𝓷𝓰 𝓽𝓸 𝓴𝓮𝓮𝓹 𝓲𝓷 𝓶𝓲𝓷𝓭 𝓯𝓸𝓻 𝓵𝓪𝓽𝓮𝓻.

_______________________________________________________________________________________
### How to get the Weather from the "weather_df" dataframe

In [39]:
col = weather_df['Weather']

print(type(col))
col.head()

<class 'pandas.core.series.Series'>


0                     Fog
1                     Fog
2    Freezing Drizzle,Fog
3    Freezing Drizzle,Fog
4                     Fog
Name: Weather, dtype: object

### How to get the Weather and Temperature Columns from the "weather_df" dataframe
_______________________________________________________________________________________

In [40]:
# Since we now want to extract two columns from dataframe then we need to pass a list
#This needs to be done whenever we want to extract more than one column
two_cols = weather_df[['Weather','Temp (C)']]

print(type(two_cols))
two_cols.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Weather,Temp (C)
0,Fog,-1.8
1,Fog,-1.8
2,"Freezing Drizzle,Fog",-1.8
3,"Freezing Drizzle,Fog",-1.5
4,Fog,-1.5


## Get the first 25 rows from the "weather_df" dataframe
____________________________________________________________________________________
### Important :
This slicing would work even if the row index had non-numeric labels, because slicing works here the same way as a list

In [41]:
weather_df[:25]

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog
5,2012-01-01 05:00:00,-1.4,-3.3,87,9,6.4,101.27,Fog
6,2012-01-01 06:00:00,-1.5,-3.1,89,7,6.4,101.29,Fog
7,2012-01-01 07:00:00,-1.4,-3.6,85,7,8.0,101.26,Fog
8,2012-01-01 08:00:00,-1.4,-3.6,85,9,8.0,101.23,Fog
9,2012-01-01 09:00:00,-1.3,-3.1,88,15,4.0,101.2,Fog


## How to get the first 3 alternating rows from the "weather_df" dataframe, but only the Visibility and Relative Humidity columns

In [42]:
weather_df[0:6:2][['Rel Hum (%)','Visibility (km)']]

Unnamed: 0,Rel Hum (%),Visibility (km)
0,86,8.0
2,89,4.0
4,88,4.8


In [43]:
weather_df[['Rel Hum (%)','Visibility (km)']][0:6:2]
#This ensures slicing part can be written as above two ways

Unnamed: 0,Rel Hum (%),Visibility (km)
0,86,8.0
2,89,4.0
4,88,4.8


# The main learning Part starts:

### So which of the two solutions should you use?
_________________________________________________________________________________________
Answer: Neither. Because we're indexing more than once (Chained Indexing). When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice.

Let's analyse (break down) one of the above solutions.

In [44]:
# first indexing
df1 = weather_df[:6:2]

# second indexing
df2 = df1[['Rel Hum (%)', 'Visibility (km)']]

# Selection (Part 2)
_____________________________________________________________________________________
Pandas provides a powerful way to work with both rows and columns together, optionally using their label indices or numeric indices.

.loc :

.iloc:


In [45]:
weather_df.loc[0:5,['Visibility (km)','Rel Hum (%)']]

Unnamed: 0,Visibility (km),Rel Hum (%)
0,8.0,86
1,8.0,87
2,4.0,89
3,4.0,88
4,4.8,88
5,6.4,87


In [46]:
weather_df[['Visibility (km)','Rel Hum (%)']].iloc[0:5]

Unnamed: 0,Visibility (km),Rel Hum (%)
0,8.0,86
1,8.0,87
2,4.0,89
3,4.0,88
4,4.8,88


In [47]:
weather_df.loc[:5:2 ,['Visibility (km)','Rel Hum (%)']]

Unnamed: 0,Visibility (km),Rel Hum (%)
0,8.0,86
2,4.0,89
4,4.8,88


# Filtering

In [48]:
# finding all instances when snow was recoreded
weather_df['Weather'].unique()

array(['Fog', 'Freezing Drizzle,Fog', 'Mostly Cloudy', 'Cloudy', 'Rain',
       'Rain Showers', 'Mainly Clear', 'Snow Showers', 'Snow', 'Clear',
       'Freezing Rain,Fog', 'Freezing Rain', 'Freezing Drizzle',
       'Rain,Snow', 'Moderate Snow', 'Freezing Drizzle,Snow',
       'Freezing Rain,Snow Grains', 'Snow,Blowing Snow', 'Freezing Fog',
       'Haze', 'Rain,Fog', 'Drizzle,Fog', 'Drizzle',
       'Freezing Drizzle,Haze', 'Freezing Rain,Haze', 'Snow,Haze',
       'Snow,Fog', 'Snow,Ice Pellets', 'Rain,Haze', 'Thunderstorms,Rain',
       'Thunderstorms,Rain Showers', 'Thunderstorms,Heavy Rain Showers',
       'Thunderstorms,Rain Showers,Fog', 'Thunderstorms',
       'Thunderstorms,Rain,Fog',
       'Thunderstorms,Moderate Rain Showers,Fog', 'Rain Showers,Fog',
       'Rain Showers,Snow Showers', 'Snow Pellets', 'Rain,Snow,Fog',
       'Moderate Rain,Fog', 'Freezing Rain,Ice Pellets,Fog',
       'Drizzle,Ice Pellets,Fog', 'Drizzle,Snow', 'Rain,Ice Pellets',
       'Drizzle,Snow,Fog', 

In [49]:
snowed_filter = weather_df['Weather'].str.lower().str.contains('snow')

In [50]:
snowed_filter  # now our snowed_filter is an boolean array



0       False
1       False
2       False
3       False
4       False
        ...  
8779     True
8780     True
8781     True
8782     True
8783     True
Name: Weather, Length: 8784, dtype: bool

In [51]:
snowed_filter[8779]

np.True_

In [52]:
weather_df[snowed_filter]

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
41,2012-01-02 17:00:00,-2.1,-9.5,57,22,25.0,99.66,Snow Showers
44,2012-01-02 20:00:00,-5.6,-13.4,54,24,25.0,100.07,Snow Showers
45,2012-01-02 21:00:00,-5.8,-12.8,58,26,25.0,100.15,Snow Showers
47,2012-01-02 23:00:00,-7.4,-14.1,59,17,19.3,100.27,Snow Showers
48,2012-01-03 00:00:00,-9.0,-16.0,57,28,25.0,100.35,Snow Showers
...,...,...,...,...,...,...,...,...
8779,2012-12-31 19:00:00,0.1,-2.7,81,30,9.7,100.13,Snow
8780,2012-12-31 20:00:00,0.2,-2.4,83,24,9.7,100.03,Snow
8781,2012-12-31 21:00:00,-0.5,-1.5,93,28,4.8,99.95,Snow
8782,2012-12-31 22:00:00,-0.2,-1.8,89,28,9.7,99.91,Snow


In [53]:
#finding all instances when wind speed was above 24 and visibility was 25
df = weather_df[(weather_df['Wind Spd (km/h)']>24) & (weather_df['Visibility (km)'] == 25)]
df.head(20)

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
23,2012-01-01 23:00:00,5.3,2.0,79,30,25.0,99.31,Cloudy
24,2012-01-02 00:00:00,5.2,1.5,77,35,25.0,99.26,Rain Showers
25,2012-01-02 01:00:00,4.6,0.0,72,39,25.0,99.26,Cloudy
26,2012-01-02 02:00:00,3.9,-0.9,71,32,25.0,99.26,Mostly Cloudy
27,2012-01-02 03:00:00,3.7,-1.5,69,33,25.0,99.3,Mostly Cloudy
28,2012-01-02 04:00:00,2.9,-2.3,69,32,25.0,99.26,Mostly Cloudy
29,2012-01-02 05:00:00,2.6,-2.3,70,32,25.0,99.21,Mostly Cloudy
30,2012-01-02 06:00:00,2.3,-2.6,70,26,25.0,99.18,Mostly Cloudy
31,2012-01-02 07:00:00,2.0,-2.9,70,33,25.0,99.14,Mostly Cloudy
42,2012-01-02 18:00:00,-4.1,-11.4,57,28,25.0,99.86,Mostly Cloudy


# Working with columns
______________________________________________________________________________
learning:
1. How to carry out Series opertations on DataFrame Columns
2. How to add or update columns within a DataFrame
3. How to rename specific columns
4. How to delete or drop a column that is no longer required for analysis

### Series Operations
A series is a one-dimensional ndarry with axis labels(including time series)

In [54]:
weather_df['Wind Spd (km/h)'].head() /2

0    2.0
1    2.0
2    3.5
3    3.0
4    3.5
Name: Wind Spd (km/h), dtype: float64

In [55]:
add_10 = weather_df['Wind Spd (km/h)'] +100
add_10.head(8000)

0       104
1       104
2       107
3       106
4       107
       ... 
7995    117
7996    109
7997    106
7998    113
7999    111
Name: Wind Spd (km/h), Length: 8000, dtype: int64

In [56]:
mult_2 = weather_df['Visibility (km)'] *2
mult_2.head()

0    16.0
1    16.0
2     8.0
3     8.0
4     9.6
Name: Visibility (km), dtype: float64

In [57]:
weather_df['Temp (C)']

0      -1.8
1      -1.8
2      -1.8
3      -1.5
4      -1.5
       ... 
8779    0.1
8780    0.2
8781   -0.5
8782   -0.2
8783    0.0
Name: Temp (C), Length: 8784, dtype: float64

In [58]:
weather_df['Dew Point Temp (C)']

0      -3.9
1      -3.7
2      -3.4
3      -3.2
4      -3.3
       ... 
8779   -2.7
8780   -2.4
8781   -1.5
8782   -1.8
8783   -2.1
Name: Dew Point Temp (C), Length: 8784, dtype: float64

In [59]:
weather_df['new_temp-col'] = weather_df['Temp (C)'] + weather_df['Dew Point Temp (C)']

In [60]:
weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather,new_temp-col
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog,-5.7
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog,-5.5
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",-5.2
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",-4.7
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog,-4.8


### Some built-in functions of pandas

### Apply/Call Functions

#### .apply()
.apply() function will help us apply logic(function) on entire column of dataframe.

.apply() passes the entire column to the logic(function) that we want to apply on column/ or whenever we want to do bulk operation on a column.

In [61]:
def times2(value):
    return value * 2


In [62]:
weather_df['Visibility (km)'].head()

0    8.0
1    8.0
2    4.0
3    4.0
4    4.8
Name: Visibility (km), dtype: float64

In [63]:
weather_df['Visibility (km)'].apply(times2).head()

0    16.0
1    16.0
2     8.0
3     8.0
4     9.6
Name: Visibility (km), dtype: float64

### .describe()
Is used to summarize the central tendency, dispersion and shape of a dataset's distribution, excluding Nan values.

In [64]:
weather_df['Weather'].describe()

count             8784
unique              50
top       Mainly Clear
freq              2106
Name: Weather, dtype: object

weather_df['Visibility (km)'].describe()

In [65]:
weather_df['Date/Time'].describe()

count                   8784
mean     2012-07-01 23:30:00
min      2012-01-01 00:00:00
25%      2012-04-01 11:45:00
50%      2012-07-01 23:30:00
75%      2012-10-01 11:15:00
max      2012-12-31 23:00:00
Name: Date/Time, dtype: object

In [66]:
weather_df[['Temp (C)','Dew Point Temp (C)']].describe()

Unnamed: 0,Temp (C),Dew Point Temp (C)
count,8784.0,8784.0
mean,8.798144,2.555294
std,11.687883,10.883072
min,-23.3,-28.5
25%,0.1,-5.9
50%,9.3,3.3
75%,18.8,11.8
max,33.0,24.4


### Adding/Updating Columns

In [67]:
visibility_in_meters = weather_df['Visibility (km)'] * 1000
weather_df['Visibilitiy (m)'] = visibility_in_meters

weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather,new_temp-col,Visibilitiy (m)
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog,-5.7,8000.0
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog,-5.5,8000.0
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",-5.2,4000.0
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",-4.7,4000.0
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog,-4.8,4800.0


### Renaming Columns

#### .rename()
Alter Series index labels or name. It will replace the existing names with the names you provide, in the order you provide .
You can also assighn the names by index.
The rename() method allows you to relabel an axis based on some mapping (a dict or Series) or an arbitary function.

In [68]:
# The "inplace=True" parameter. This means the renaming has been assigned in the old DataFrame itself
# and if parameter not used then we need to assign renaming to new variable then change the name

weather_df.rename(columns={'Visibility (m)': 'Visibility(meters)'}, inplace=True)
weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather,new_temp-col,Visibilitiy (m)
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog,-5.7,8000.0
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog,-5.5,8000.0
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",-5.2,4000.0
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",-4.7,4000.0
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog,-4.8,4800.0


### Deleting Columns
#### .drop()
Return new object with labels in requested axis removed.
Note that Pandas uses zero based numbering, so 0 is the first row, 1 is the second row, etc. You can select ranges relative to the top or drop relative to the bottom of the DataFrame as well.

Note: Specifiying both labels and index or columns will raise a ValueError

In [69]:
# Since we have not mentioned inplace=True, it returns a new dataframe.
weather_df = weather_df.drop(labels=['Visibility (meters)'], axis=1) # 1 = column, 0 = row

KeyError: "['Visibility (meters)'] not found in axis"

In [None]:
weather_df.head()

## Sorting

### .sort_values()

Sort by the values along either axis, in a user specified order. The order can be specified by selecting true or flase for the 'ascending' parameter

In [None]:
sorted_by_temp = weather_df.sort_values('Temp (C)',ascending=True) # can be inplace as well
sorted_by_temp.head()

In [None]:
weather_df.sort_values('Wind Spd (km/h)', ascending=True, inplace=True)
weather_df.head()

# Which were the top 10 hottest values and their counts?

In [None]:
top10_hot_weather = weather_df['Temp (C)'].value_counts().sort_values(ascending=False)

In [None]:
top10_hot_weather.iloc[:10]

#____________________________________________________________________________________

# Pivot Tables : Excellent way to Summarize your Data!

A pivot table is a tool that allows you to reorganize and summarize selected columns and rows of data in a dataframe.

Pivot tables provide an easy way to subset by one column and then apply a calculation like a sum or a mean

Pivot tables first groups and only then applies a calculation.

In [None]:
import pandas as pd
import numpy as np
data = {
    'A':['foo','foo','foo','bar','bar','bar'],
    'B':['one','one','two','two','one','one'],
    'C':['x','y','x','y','x','y'],
    'D':[1,3,2,5,4,1]
}

df = pd.DataFrame(data)
df

In [None]:
pivot_df = df.pivot_table(
                values = 'D',     #We want to aggregate the values of which column?
                index = 'A',      # We want to use which clumn as the new index?
                columns = ['C'],  #We want to use the values of which column as the new Columns? (optional)
                aggfunc = np.sum)
pivot_df
                     

In [None]:
#converting it back to a simple index

pivot_df.reset_index()

# What is the mean temperature recorded by month?
Use case of pivot table

In [None]:
weather_df[['Temp (C)','Date/Time']]

In [None]:
weather_df['Date/Time'].dt.month.loc[1000:2190]

In [None]:
mean_temp_df = weather_df.pivot_table(
                            values = 'Temp (C)',
                            index = weather_df['Date/Time'].dt.month,
                            aggfunc = np.mean)
mean_temp_df

# Group By
_______________________________________________________________________________________

In [70]:
import pandas as pd
import numpy as np

#weather_df.groupby(weather_df['Date/Time'].dt.month).agg(np.mean)['Temp (C)']
weather_df.groupby(weather_df['Date/Time'].dt.month)['Temp (C)'].mean()


Date/Time
1     -7.371505
2     -4.225000
3      3.121237
4      7.009306
5     16.237769
6     20.134028
7     22.790054
8     22.279301
9     16.484444
10    10.954973
11     0.931389
12    -3.306317
Name: Temp (C), dtype: float64

In [71]:
mean_temp_df2 = weather_df.groupby(weather_df['Date/Time'].dt.month).mean()
mean_temp_df2.head()                                    

TypeError: agg function failed [how->mean,dtype->object]

# Concat, Merge and Join

### Concatenaton : 
Concatenation glues together DataFrames.Keep in mind that dimensions should match along the axis you are concatenating on.

### Merging: 
The merge function allows you to mrge DataFrames together using a similar logic as merging SWL Tables together

### Join: 
Join is a convenient method for combining the columns of two potentially differtly-indexed DataFrames into a single result DataFrame.

In [3]:
import pandas as pd
df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
}, index=[0, 1, 2, 3])

df2 = pd.DataFrame({
    'A': ['A4', 'A5', 'A6', 'A7'],
    'B': ['B4', 'B5', 'B6', 'B7'],
    'C': ['C4', 'C5', 'C6', 'C7'],
    'D': ['D4', 'D5', 'D6', 'D7']
}, index=[4, 5, 6, 7])

df3 = pd.DataFrame({
    'A': ['A8', 'A9', 'A10', 'A11'],
    'B': ['B8', 'B9', 'B10', 'B11'],
    'C': ['C8', 'C9', 'C10', 'C11'],
    'E': ['D8', 'D9', 'D10', 'D11']
}, index=[8, 9, 10, 11])

In [73]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [74]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [75]:
# Since i didn't specify an axis, it defaults to axis = 0, which means it appends to rows
pd.concat([df1,df2,df3])

Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,A4,B4,C4,D4,
5,A5,B5,C5,D5,
6,A6,B6,C6,D6,
7,A7,B7,C7,D7,
8,A8,B8,C8,,D8
9,A9,B9,C9,,D9


In [4]:
# Now if i specifiy axis = 1 means concat along columns
pd.concat([df1,df2,df3],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,E
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


# Join 

In [7]:
left_df = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
}, index = ['K0','K1','K2'])

right_df = pd.DataFrame({
    'C': ['C0', 'C2', 'C3'],
    'D': ['D0', 'D2', 'D3']
}, index = ['K0','K2','K3'])

In [8]:
left_df

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2


In [9]:
right_df

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In [11]:
left_df.join(right_df) # default is outer join

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [12]:
left_df.join(right_df, how='inner')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K2,A2,B2,C2,D2


# Merge

Many a times you will be working with multiple dataframes all at once.

The merge function allows them to be combined into a single data frame

In [13]:
# Merging on multiple keys
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
    
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})

In [14]:
left

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3


In [16]:
right

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [20]:
pd.merge(left,right,how='left',on=['key1','key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,
