[Reference](https://blog.devgenius.io/pandas-a-complete-guide-for-data-science-and-machine-learning-projects-8d042751ef53)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a list of values and an explicit list of index labels.
list_1 = list('abcde')
labels = list(range(1, 6))
series_1 = pd.Series(list_1, labels)
series_1

1    a
2    b
3    c
4    d
5    e
dtype: object

In [3]:
# Build a Series from a dict: the keys become the index labels.
dict_1 = dict(first_name='Sourav', last_name='Shrivas', age=25)
series_2 = pd.Series(data=dict_1)
series_2

first_name     Sourav
last_name     Shrivas
age                25
dtype: object

In [4]:
# Same Series as before, this time with a name attached to it.
dict_1 = dict(first_name='Sourav', last_name='Shrivas', age=25)
series_2 = pd.Series(data=dict_1, name='details')
series_2

first_name     Sourav
last_name     Shrivas
age                25
Name: details, dtype: object

In [5]:
# 2x3 array of random integers drawn from [10, 50).
# No seed is set, so the values differ on every run.
arr_1 = np.random.randint(low=10, high=50, size=(2, 3))
arr_1

array([[20, 13, 47],
       [13, 33, 37]])

In [6]:
# Wrap the random array in a DataFrame with explicit row and column labels.
df_1 = pd.DataFrame(
    data=arr_1,
    index=['First Row', 'Second Row'],
    columns=['First Column', 'Second Column', 'Third Column'],
)
df_1

Unnamed: 0,First Column,Second Column,Third Column
First Row,20,13,47
Second Row,13,33,37


In [7]:
# Columns built from Series of unequal length are aligned on the index;
# the shorter column ('one') gets NaN where it has no value.
dict_2 = {
    'one': pd.Series([1.0, 2.0, 3.0]),
    'two': pd.Series([1.0, 2.0, 3.0, 4.0]),
}
df_2 = pd.DataFrame(data=dict_2)
df_2

Unnamed: 0,one,two
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,,4.0


In [8]:
# from_dict with the default orientation: each key becomes a column.
df_3 = pd.DataFrame.from_dict({'A': [1, 2, 3], 'B': [4, 5, 6]})
df_3

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [9]:
# orient='index' turns each key into a row label; the column labels
# are then supplied explicitly.
df_4 = pd.DataFrame.from_dict(
    {'A': [1, 2, 3], 'B': [4, 5, 6]},
    orient='index',
    columns=['one', 'two', 'three'],
)
df_4

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [10]:
# Show df_1 again before the selection examples in the following cells.
df_1

Unnamed: 0,First Column,Second Column,Third Column
First Row,20,13,47
Second Row,13,33,37


In [11]:
# Select a single column by label; the result is a Series.
df_1['First Column']

First Row     20
Second Row    13
Name: First Column, dtype: int64

In [12]:
# Select several columns by passing a list of labels; the result is a DataFrame.
df_1[['First Column', 'Second Column']]

Unnamed: 0,First Column,Second Column
First Row,20,13
Second Row,13,33


In [13]:
# Select a row by its index label with .loc; the result is a Series.
df_1.loc['First Row']

First Column     20
Second Column    13
Third Column     47
Name: First Row, dtype: int64

In [14]:
# Select a row by integer position with .iloc (position 1 is the second row).
df_1.iloc[1]

First Column     13
Second Column    33
Third Column     37
Name: Second Row, dtype: int64

In [15]:
# Select a single cell: row label first, then column label.
df_1.loc['First Row', 'Second Column']

13

In [16]:
# Select a sub-frame by passing a list of row labels and a list of column labels.
df_1.loc[['First Row', 'Second Row'], ['First Column', 'Third Column']]

Unnamed: 0,First Column,Third Column
First Row,20,47
Second Row,13,37


In [17]:
# Add a 'Total' column holding the element-wise sum of the three data columns.
df_1['Total'] = (
    df_1['First Column']
    .add(df_1['Second Column'])
    .add(df_1['Third Column'])
)
df_1

Unnamed: 0,First Column,Second Column,Third Column,Total
First Row,20,13,47,80
Second Row,13,33,37,83


In [18]:
# Add a labelled row to df_1. The new row has no 'Total' value, so that
# cell becomes NaN in the result.
dict_3 = {'First Column': 78, 'Second Column': 88, 'Third Column': 98}
new_row = pd.Series(dict_3, name='Third Row')
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the supported replacement. to_frame().T turns the Series
# into a one-row frame whose row label is the Series name. The row is cast
# to float so the result is all-float, as in the output recorded for the
# original append() call.
df_1 = pd.concat([df_1, new_row.astype(float).to_frame().T])
df_1

Unnamed: 0,First Column,Second Column,Third Column,Total
First Row,20.0,13.0,47.0,80.0
Second Row,13.0,33.0,37.0,83.0
Third Row,78.0,88.0,98.0,


In [19]:
# Remove the 'Total' column. The result is rebound to df_1 instead of
# using inplace=True, which keeps the data flow explicit and chainable.
df_1 = df_1.drop(columns='Total')
df_1

Unnamed: 0,First Column,Second Column,Third Column
First Row,20.0,13.0,47.0
Second Row,13.0,33.0,37.0
Third Row,78.0,88.0,98.0


In [20]:
# Remove the row labelled 'Third Row'. The result is rebound to df_1
# instead of using inplace=True, which keeps the data flow explicit.
df_1 = df_1.drop(index='Third Row')
df_1

Unnamed: 0,First Column,Second Column,Third Column
First Row,20.0,13.0,47.0
Second Row,13.0,33.0,37.0


In [21]:
# Add a 'Sex' column (one value per row) and promote it to the index.
# Done as one chained expression that is rebound to df_1, rather than a
# column assignment followed by set_index(..., inplace=True).
df_1 = df_1.assign(Sex=['Male', 'Female']).set_index('Sex')

In [22]:
# Show df_1, which is now indexed by the 'Sex' column set in the previous cell.
df_1

Unnamed: 0_level_0,First Column,Second Column,Third Column
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,20.0,13.0,47.0
Female,13.0,33.0,37.0


In [23]:
# combine_first keeps the caller's values and fills its NaN entries with
# the values found at the same position in the other frame.
df_5 = pd.DataFrame(data={'A': [1.0, np.nan, 2.0, np.nan]})
df_6 = pd.DataFrame(data={'A': [10.0, 11.0, 12.0, 13.0]})
df_5.combine_first(other=df_6)

Unnamed: 0,A
0,1.0
1,11.0
2,2.0
3,13.0


In [24]:
# Distinct values found in the column, returned as a NumPy array.
df_1['First Column'].unique()

array([20., 13.])

In [25]:
# Number of distinct values in the column.
df_1['First Column'].nunique()

2

In [26]:
# How many times each distinct value occurs in the column.
df_1['First Column'].value_counts()

20.0    1
13.0    1
Name: First Column, dtype: int64

In [27]:
# Column labels of the frame, as an Index object.
df_1.columns

Index(['First Column', 'Second Column', 'Third Column'], dtype='object')

In [28]:
# Ice-cream sales records: one list per column, all of the same length.
dict_3 = {
    'Store': [1, 2, 1, 2],
    'Flavor': ['Choc', 'Van', 'Straw', 'Choc'],
    'Sales': [26, 12, 18, 22],
}

In [29]:
# Build a frame from the dict of lists; each key becomes a column.
# Note: this rebinds df_5, which an earlier cell used for the
# combine_first example.
df_5 = pd.DataFrame(dict_3)
df_5

Unnamed: 0,Store,Flavor,Sales
0,1,Choc,26
1,2,Van,12
2,1,Straw,18
3,2,Choc,22


In [30]:
# Group the rows by store and average the numeric columns of each group.
by_store = df_5.groupby('Store')
# numeric_only=True is needed on pandas >= 2.0: the default there no longer
# drops non-numeric columns, so the string column 'Flavor' would make
# mean() raise a TypeError.
by_store.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Store,Unnamed: 1_level_1
1,22.0
2,17.0


In [31]:
# Per-store totals, then pick the row for store 1.
# numeric_only=True keeps the result limited to 'Sales'; without it,
# pandas >= 2.0 would also "sum" the 'Flavor' strings by concatenating them.
by_store.sum(numeric_only=True).loc[1]

Sales    44
Name: 1, dtype: int64

In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for each store.
by_store.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Store,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,2.0,22.0,5.656854,18.0,20.0,22.0,24.0,26.0
2,2.0,17.0,7.071068,12.0,14.5,17.0,19.5,22.0


In [33]:
# Stack two frames that share the same columns and have disjoint row labels.
df_6 = pd.DataFrame(data={'A': [1, 2, 3], 'B': [4, 5, 6]}, index=[1, 2, 3])
df_7 = pd.DataFrame(data={'A': [7, 8, 9], 'B': [10, 11, 12]}, index=[4, 5, 6])

# axis=0 (the default) appends the rows of df_7 below those of df_6.
cancatenated = pd.concat(objs=[df_6, df_7], axis=0)
cancatenated

Unnamed: 0,A,B
1,1,4
2,2,5
3,3,6
4,7,10
5,8,11
6,9,12


In [34]:
# Merge two frames on the 'key' column they have in common.
# Columns that exist in both frames ('A', 'B') get the suffixes _x and _y.
df_8 = pd.DataFrame(data={
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'key': [1, 2, 3],
})
df_9 = pd.DataFrame(data={
    'A': [7, 8, 9],
    'B': [10, 11, 12],
    'key': [1, 2, 3],
})
# Options for `how`:
#   'inner'          : keep only keys present in both frames
#   'left' / 'right' : keep the keys of the left or the right frame
#   'outer'          : keep the union of the keys
inner_merge = df_8.merge(df_9, how='inner', on='key')
inner_merge

Unnamed: 0,A_x,B_x,key,A_y,B_y
0,1,4,1,7,10
1,2,5,2,8,11
2,3,6,3,9,12


In [35]:
# join() combines frames on their index. With how='outer' every label from
# either index is kept, and cells with no data are NaN.
df_8 = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=[1, 2, 3],
)
df_9 = pd.DataFrame(
    data={'C': [7, 8, 9], 'D': [10, 11, 12]},
    index=[1, 4, 5],
)
df_8.join(other=df_9, how='outer')

Unnamed: 0,A,B,C,D
1,1.0,4.0,7.0,10.0
2,2.0,5.0,,
3,3.0,6.0,,
4,,,8.0,11.0
5,,,9.0,12.0


In [36]:
# # Get ice cream sales data
# ics_df = pd.read_csv('icecreamsales.csv')
# ics_df
# ics_df.count()
# ics_df.sum()
# ics_df.sum(skipna=True)
# ics_df['Sales'].mean()
# ics_df['Sales'].median()
# ics_df['Sales'].max()
# ics_df['Sales'].min()
# ics_df['Sales'].std()
# ics_df['Sales'].skew()
# ics_df['Sales'].kurt()
# ics_df['Sales'].cumsum()
# ics_df.describe()

In [37]:
# import sys

# cs_df = pd.read_csv('ComputerSales.csv')
# cs_df.head()

# # You can pass DataFrames and Series into functions
# def get_profit_total(df):
#     prof_ser = df['Profit']
#     print(f"Total Profit : {prof_ser.sum()}")

# get_profit_total(cs_df)

# # Receives a DataFrame, splits the contact into new columns
# # being first and last name
# def split_name(df):
#     def get_names(full_name):
#         # Split contact at space
#         f_name, l_name = full_name.split()
#         # Create a series with first & last names in columns
#         # with those labels
#         return pd.Series(
#         (f_name, l_name),
#         index=['First Name', 'Last Name']
#         )
#     # apply() executes the function on all names in Contact column
#     names = df['Contact'].apply(get_names)
#     df[names.columns] = names
#     return df

# # Run function and display top 5 results
# split_name(cs_df).head()

# def create_age_groups(df):
#     # bins must contain one more edge than there are labels
#     bins = [0, 30, 50, sys.maxsize]
#     # Group labels
#     labels = ['<30', '30-50', '>50']
    
#     # cut puts values into certain groups based on intervals
#     # The group assigned to <30 has an age between 0 and 30
#     # between 30 & 50 is assigned 30-50 and so on
#     age_group = pd.cut(df['Age'], bins=bins, labels=labels)
#     # Create new column and return new dataframe info
#     df['Age Group'] = age_group
#     return df

# create_age_groups(cs_df)

# # You can use a pipe to pass a dataframe to multiple functions
# cs_df.pipe(split_name).pipe(create_age_groups).head()

In [38]:
# Series holding 0..4, labelled 'a'..'e'.
ser_1 = pd.Series(data=range(5), index=list('abcde'))

In [39]:
# Two overlapping positional slices of ser_1: the first four elements
# and everything after the first element.
s_1 = ser_1.iloc[:4]
s_2 = ser_1.iloc[1:]
print(s_1)
print(s_2)

a    0
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
e    4
dtype: int64


In [40]:
# Align both series by the union of their indexes (the default join).
# The result is a tuple of the two re-indexed series; labels missing
# from one side are filled with NaN.
s_1.align(s_2)

(a    0.0
 b    1.0
 c    2.0
 d    3.0
 e    NaN
 dtype: float64, a    NaN
 b    1.0
 c    2.0
 d    3.0
 e    4.0
 dtype: float64)

In [41]:
# Align on the index of the calling series (s_1); s_2 is re-indexed to it.
s_1.align(s_2, join='left')

(a    0
 b    1
 c    2
 d    3
 dtype: int64, a    NaN
 b    1.0
 c    2.0
 d    3.0
 dtype: float64)

In [42]:
# Align on the index of the passed series (s_2); s_1 is re-indexed to it.
s_1.align(s_2, join='right')

(b    1.0
 c    2.0
 d    3.0
 e    NaN
 dtype: float64, b    1
 c    2
 d    3
 e    4
 dtype: int64)

In [43]:
# Keep only the labels that are present in both indexes.
s_1.align(s_2, join='inner')

(b    1
 c    2
 d    3
 dtype: int64, b    1
 c    2
 d    3
 dtype: int64)

In [44]:
# align can be used with DataFrames as well. This cell only builds two
# 2x3 frames of random integers in [10, 50) that share their columns and
# overlap on row label 'B'; align() itself is not called here.
# No seed is set, so the values differ on every run.
arr_3 = np.random.randint(low=10, high=50, size=(2, 3))
df_6 = pd.DataFrame(data=arr_3, index=['A', 'B'], columns=['C', 'D', 'E'])
arr_3 = np.random.randint(low=10, high=50, size=(2, 3))
df_7 = pd.DataFrame(data=arr_3, index=['B', 'C'], columns=['C', 'D', 'E'])
df_6

Unnamed: 0,C,D,E
A,12,14,45
B,11,25,32


In [45]:
# reindex returns the data arranged in the order of the given labels.
ser_1.reindex(['c','b','a'])

c    2
b    1
a    0
dtype: int64

In [46]:
# Same for a DataFrame: the rows come back in the given label order.
df_6.reindex(['B','A'])

Unnamed: 0,C,D,E
B,11,25,32
A,12,14,45


In [47]:
# Relabel columns and rows; each mapping goes from old label to new label.
df_6.rename(columns={'C': 'Men', 'D': 'Women', 'E': 'Pets'},
           index={'A': 1, 'B': 2})

Unnamed: 0,Men,Women,Pets
1,12,14,45
2,11,25,32


In [48]:
# A multi-level index lets a frame store data along several dimensions.
# Here every row will be identified by a (day, meal) pair.
days = ['Day 1'] * 3 + ['Day 2'] * 3
meals = [1, 2, 3] * 2
# Pair each day with its meal number and collect the pairs in a list.
hier_index = [(day, meal) for day, meal in zip(days, meals)]
print(hier_index)

[('Day 1', 1), ('Day 1', 2), ('Day 1', 3), ('Day 2', 1), ('Day 2', 2), ('Day 2', 3)]


In [49]:
# Convert the list of (day, meal) tuples into a two-level MultiIndex.
# Note: this rebinds hier_index from a list to a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [50]:
# Random calories eaten per meal, drawn from [500, 700): one row per
# (day, meal) pair and one column each for 'M' and 'F'.
# No seed is set, so the values differ on every run.
arr_5 = np.random.randint(low=500, high=700, size=(6, 2))
df_9 = pd.DataFrame(data=arr_5, index=hier_index, columns=['M', 'F'])
print(df_9)

           M    F
Day 1 1  585  546
      2  645  519
      3  500  603
Day 2 1  617  599
      2  559  580
      3  566  581


In [51]:
# Grab the rows for 'Day 1'; the result is indexed by the remaining
# (meal) level only.
df_9.loc['Day 1']

Unnamed: 0,M,F
1,585,546
2,645,519
3,500,603


In [52]:
# Grab calories eaten by the female on day 2 for the 2nd meal.
# A single .loc call with a (day, meal) tuple for the row and the column
# label replaces the chained .loc[...].loc[...][...] lookups, which built
# two intermediate objects to reach one cell.
df_9.loc[('Day 2', 2), 'F']

580

In [53]:
# Give names to the two index levels (day and meal); they appear as the
# index header when the frame is displayed.
df_9.index.names = ['Day', 'Meal']
df_9

Unnamed: 0_level_0,Unnamed: 1_level_0,M,F
Day,Meal,Unnamed: 2_level_1,Unnamed: 3_level_1
Day 1,1,585,546
Day 1,2,645,519
Day 1,3,500,603
Day 2,1,617,599
Day 2,2,559,580
Day 2,3,566,581


In [54]:
# Small frame with missing values, used by the dropna / fillna examples.
dict_4 = {
    'A': [1, 2, np.nan],
    'B': [4, np.nan, np.nan],
    'C': [7.0, 8.0, 9.0],
}
df_10 = pd.DataFrame(data=dict_4)
print(df_10)

     A    B    C
0  1.0  4.0  7.0
1  2.0  NaN  8.0
2  NaN  NaN  9.0


In [55]:
# Drop every row that contains at least one missing value.
# The result is not assigned, so df_10 itself is left unchanged.
df_10.dropna()

Unnamed: 0,A,B,C
0,1.0,4.0,7.0


In [56]:
# Drop every column that contains at least one missing value (axis=1).
df_10.dropna(axis=1)

Unnamed: 0,C
0,7.0
1,8.0
2,9.0


In [57]:
# Keep only the rows that have at least 2 non-NaN values.
df_10.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0


In [58]:
# Replace every NaN with 0.0.
# The result is not assigned, so df_10 itself is left unchanged.
df_10.fillna(value=0.0)

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,0.0,8.0
2,0.0,0.0,9.0


In [59]:
# Fill the missing value in column A with the column mean.
# mean() skips NaN, so the fill value is (1 + 2) / 2 = 1.5.
df_10['A'].fillna(value=df_10['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [60]:
# Fill the missing value in column A with the column mean.
# NOTE(review): this cell repeats the previous cell unchanged.
df_10['A'].fillna(value=df_10['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [61]:
# Back-fill: replace each NaN with the next valid value below it in the
# same column. A NaN with no valid value after it stays NaN.
# bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated since pandas 2.1.
df_10.bfill()

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0
2,,,9.0
