In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# To get multiple outputs in the same cell:
# with "all", IPython displays the value of every top-level expression in a
# cell instead of only the last one. Later cells rely on this to show several
# DataFrames from a single cell.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# A.) Data Input/Output
## 1. Dataframe creation

### There are multiple ways to create a dataframe. 
1. Through lists, which I have demonstrated here.
2. Through dictionaries
3. Using pd.DataFrame()
4. Using pd.DataFrame.from_records()

In [4]:
# Column labels for the employee table.
columns_name = ['Company', 'Job', 'Salary']

# One inner list per employee row, values in the same order as columns_name.
salary = [
    ['Google', 'Machine Learning Engineer', 121000],
    ['Google', 'Data Scientist', 109000],
    ['Google', 'Tech Lead', 129000],
    ['Facebook', 'Data Scientist', 103000],
]

# Build the DataFrame from the list of rows; the index defaults to 0..3.
emp_df = pd.DataFrame(data=salary, columns=columns_name)
emp_df

Unnamed: 0,Company,Job,Salary
0,Google,Machine Learning Engineer,121000
1,Google,Data Scientist,109000
2,Google,Tech Lead,129000
3,Facebook,Data Scientist,103000


## 2. Import - Creating a dataframe from an external file, here a CSV

In [5]:
# Load the COVID-19 dataset from a local CSV file and preview the first rows.
# The path is written as a raw string so the Windows backslashes are taken
# literally; in a normal string, sequences such as "\V" or "\d" are invalid
# escapes that Python only tolerates with a warning.
# NOTE(review): this is an absolute, machine-specific path - point it at
# wherever covid19.csv lives on your machine.
cov_df = pd.read_csv(r'E:\VCS\GitHub\DataScienceAtWork\data\Data USA Cart\covid19.csv')
cov_df.head()

Unnamed: 0,Year,ID Geography,Geography,Date,Confirmed,ConfirmedGrowth,ConfirmedPC,deathIncrease,Deaths,DeathsPC,...,pending,Population,Positive,positiveIncrease,PositivePC,posNeg,recovered,total,totalTestResults,totalTestResultsIncrease
0,2018.0,04000US02,Alaska,2020/03/06,0,,,,0.0,,...,1.0,737438.0,0,,0.0,8,,9,8,
1,2018.0,04000US02,Alaska,2020/03/07,0,,,0.0,0.0,,...,2.0,737438.0,0,0.0,0.0,12,,14,12,4.0
2,2018.0,04000US02,Alaska,2020/03/08,0,,,0.0,0.0,,...,6.0,737438.0,0,0.0,0.0,14,,20,14,2.0
3,2018.0,04000US02,Alaska,2020/03/09,0,,,0.0,0.0,,...,9.0,737438.0,0,0.0,0.0,23,,32,23,9.0
4,2018.0,04000US02,Alaska,2020/03/10,0,,,0.0,0.0,,...,9.0,737438.0,0,0.0,0.0,23,,32,23,0.0


## 3. Export - Writing a dataframe to an external file, here a CSV

In [6]:
# df.to_csv(r'path\df.csv')  # to_csv is a DataFrame method; pd.to_csv does not exist

# B.) Data Operations

## 1. Copying - Creating a new dF from existing dF
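1. Copying through variables - Plain assignment creates no new data: the new variable 'emp_df_0' is just another reference to the same object as 'emp_df', so changes made through the new variable are also reflected in the old dataframe.
2. .copy() - df.copy() creates an independent copy of the dataset, so changes made to the copy do not affect the original.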

In [7]:
# Copying through variables
# Plain assignment does not copy any data: emp_df_0 becomes a second name
# bound to the very same DataFrame object as emp_df.

emp_df_0 = emp_df
emp_df_0

Unnamed: 0,Company,Job,Salary
0,Google,Machine Learning Engineer,121000
1,Google,Data Scientist,109000
2,Google,Tech Lead,129000
3,Facebook,Data Scientist,103000


In [8]:
# Raise every salary by 1000 through the alias emp_df_0.
emp_df_0['Salary'] = emp_df_0['Salary'].add(1000)

# Element-wise comparison: every cell is True because both names
# refer to one and the same DataFrame.
emp_df_0 == emp_df

# Displaying either name therefore shows the updated salaries.
emp_df_0
emp_df

Unnamed: 0,Company,Job,Salary
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True


Unnamed: 0,Company,Job,Salary
0,Google,Machine Learning Engineer,122000
1,Google,Data Scientist,110000
2,Google,Tech Lead,130000
3,Facebook,Data Scientist,104000


Unnamed: 0,Company,Job,Salary
0,Google,Machine Learning Engineer,122000
1,Google,Data Scientist,110000
2,Google,Tech Lead,130000
3,Facebook,Data Scientist,104000


In [9]:
# .copy() (deep by default) gives emp_df_1 its own data, independent of emp_df.
# Passing deep=False would make a shallow copy instead, which is not an
# independent copy.
emp_df_1 = emp_df.copy()

# Raise the salaries in the copy only.
emp_df_1['Salary'] = emp_df_1['Salary'].add(1000)

# Element-wise comparison: the Salary column now differs,
# while Company and Job still match.
emp_df_1 == emp_df

# The copy carries the raise; the original is left unchanged.
emp_df_1
emp_df

Unnamed: 0,Company,Job,Salary
0,True,True,False
1,True,True,False
2,True,True,False
3,True,True,False


Unnamed: 0,Company,Job,Salary
0,Google,Machine Learning Engineer,123000
1,Google,Data Scientist,111000
2,Google,Tech Lead,131000
3,Facebook,Data Scientist,105000


Unnamed: 0,Company,Job,Salary
0,Google,Machine Learning Engineer,122000
1,Google,Data Scientist,110000
2,Google,Tech Lead,130000
3,Facebook,Data Scientist,104000


###  Analysis of dataFrame Comparison 

In [10]:
# Comparing DataFrames with == : element-wise, returns a boolean DataFrame.
# It does not treat missing values as equal: a NaN compared with a NaN gives False.
cmp_1 = emp_df_1 == emp_df
cmp_1

# Comparison analysis
# .all() reduces each column to a single bool (True only if every row matched);
# .sum() then counts how many columns matched completely.
cmp_1.all()
cmp_1.all().sum()

# Comparing two DataFrames with .eq() : the method form of ==, with the same
# NaN behaviour (a NaN compared with a NaN gives False).
# emp_df_0 is the alias of emp_df; emp_df_1 is the modified copy.
emp_df.eq(emp_df_0).all()
emp_df.eq(emp_df_1).all().sum()

# Comparing two DataFrames with .equals() : returns one bool for the whole
# frame and treats NaNs in the same location as equal.
emp_df.equals(emp_df_0)
emp_df.equals(emp_df_1)

Unnamed: 0,Company,Job,Salary
0,True,True,False
1,True,True,False
2,True,True,False
3,True,True,False


Company     True
Job         True
Salary     False
dtype: bool

2

Company    True
Job        True
Salary     True
dtype: bool

2

True

False

## 2. Creation of new columns - No condition
1. In existing dataFrame
2. In new dataFrame

In [11]:
# Add a derived column directly to the existing DataFrame:
# Hike_amt is 10% of each employee's Salary.

emp_df_1['Hike_amt'] = emp_df_1['Salary'].mul(0.1)
emp_df_1

Unnamed: 0,Company,Job,Salary,Hike_amt
0,Google,Machine Learning Engineer,123000,12300.0
1,Google,Data Scientist,111000,11100.0
2,Google,Tech Lead,131000,13100.0
3,Facebook,Data Scientist,105000,10500.0


In [33]:
# Create a new DataFrame as an independent copy, so emp_df_1 is left untouched.
emp_df_2 = emp_df_1.copy()

# Overwrite the hike in the new DataFrame only: a flat 15% for all employees.
# It is derived from emp_df_2's own Salary column, so the assignment does not
# depend on index alignment with another frame.

emp_df_2['Hike_amt'] = emp_df_2['Salary'] * 0.15
emp_df_2
emp_df_1  # shown for comparison: the source frame is not modified here


# A scalar on the right-hand side is broadcast to every row.
emp_df_2['WFH_Status'] = 1
emp_df_2['Gender'] = 'M'
emp_df_2

Unnamed: 0,Company,Job,Salary,Hike_amt
0,Google,Machine Learning Engineer,123000,18450.0
1,Google,Data Scientist,111000,16650.0
2,Google,Tech Lead,131000,19650.0
3,Facebook,Data Scientist,105000,15750.0


Unnamed: 0,Company,Job,Salary,Hike_amt
0,Google,Machine Learning Engineer,123000,12300.0
1,Google,Data Scientist,111000,11100.0
2,Google,Tech Lead,131000,13100.0
3,Facebook,Data Scientist,105000,10500.0


Unnamed: 0,Company,Job,Salary,Hike_amt,WFH_Status,Gender
0,Google,Machine Learning Engineer,123000,18450.0,1,M
1,Google,Data Scientist,111000,16650.0,1,M
2,Google,Tech Lead,131000,19650.0,1,M
3,Facebook,Data Scientist,105000,15750.0,1,M


## 3. Creation of new columns - Based on If-then/else condition

In [34]:
def bonus(row):
    """Return the bonus amount for one employee row, based on the job role.

    Bonus rates by role:
        Data Scientist            - 20% of Salary
        Machine Learning Engineer - 15% of Salary
        Tech Lead                 - 10% of Salary

    Parameters:
        row: a mapping-like object (e.g. a DataFrame row) providing
            'Job' and 'Salary'.

    Returns:
        Salary multiplied by the role's rate, or None when the job
        is not one of the roles listed above.
    """
    rate_by_job = (
        ('Data Scientist', 0.2),
        ('Machine Learning Engineer', 0.15),
        ('Tech Lead', 0.1),
    )
    job = row['Job']
    for title, rate in rate_by_job:
        if job == title:
            return row['Salary'] * rate
    # No matching role: no bonus is defined.
    return None

# assign() returns a new DataFrame, so emp_df_2 itself is not modified.
# bonus() is applied row by row (axis=1) to compute each column.
# NOTE(review): bonus_amt and Bonus_amt hold the same values and differ only
# in the capitalisation of the name - confirm both columns are wanted.
emp_df_2a = (
    emp_df_2
    .assign(bonus_amt=emp_df_2.apply(bonus, axis=1))
    .assign(Bonus_amt=emp_df_2.apply(bonus, axis=1))
)

emp_df_2a

Unnamed: 0,Company,Job,Salary,Hike_amt,WFH_Status,Gender,bonus_amt,Bonus_amt
0,Google,Machine Learning Engineer,123000,18450.0,1,M,18450.0,18450.0
1,Google,Data Scientist,111000,16650.0,1,M,22200.0,22200.0
2,Google,Tech Lead,131000,19650.0,1,M,13100.0,13100.0
3,Facebook,Data Scientist,105000,15750.0,1,M,21000.0,21000.0


In [39]:
# Work on a copy so emp_df_2 keeps its original columns.
emp_df_2b = emp_df_2.copy()

# Data Scientist rows get 20% of Salary. This first .loc assignment creates
# the Bonus_amt column; rows not selected by the mask are left as NaN.
emp_df_2b.loc[emp_df_2b['Job'].eq('Data Scientist'), 'Bonus_amt'] = emp_df_2b['Salary'].mul(0.2)
emp_df_2b
# Machine Learning Engineer rows get 15% of Salary.
emp_df_2b.loc[emp_df_2b['Job'].eq('Machine Learning Engineer'), 'Bonus_amt'] = emp_df_2b['Salary'].mul(0.15)
emp_df_2b
# Tech Lead rows get 10% of Salary.
emp_df_2b.loc[emp_df_2b['Job'].eq('Tech Lead'), 'Bonus_amt'] = emp_df_2b['Salary'].mul(0.1)
emp_df_2b

Unnamed: 0,Company,Job,Salary,Hike_amt,WFH_Status,Gender,Bonus_amt
0,Google,Machine Learning Engineer,123000,18450.0,1,M,
1,Google,Data Scientist,111000,16650.0,1,M,22200.0
2,Google,Tech Lead,131000,19650.0,1,M,
3,Facebook,Data Scientist,105000,15750.0,1,M,21000.0


Unnamed: 0,Company,Job,Salary,Hike_amt,WFH_Status,Gender,Bonus_amt
0,Google,Machine Learning Engineer,123000,18450.0,1,M,18450.0
1,Google,Data Scientist,111000,16650.0,1,M,22200.0
2,Google,Tech Lead,131000,19650.0,1,M,
3,Facebook,Data Scientist,105000,15750.0,1,M,21000.0


Unnamed: 0,Company,Job,Salary,Hike_amt,WFH_Status,Gender,Bonus_amt
0,Google,Machine Learning Engineer,123000,18450.0,1,M,18450.0
1,Google,Data Scientist,111000,16650.0,1,M,22200.0
2,Google,Tech Lead,131000,19650.0,1,M,13100.0
3,Facebook,Data Scientist,105000,15750.0,1,M,21000.0


In [40]:
# Updating the very column the condition is based on:
# bonuses of 20000 or more are reduced by 100.
# Re-running this cell applies the reduction again, because the mask and the
# new value are both read from the current Bonus_amt.

emp_df_2b.loc[emp_df_2b['Bonus_amt'].ge(20000), 'Bonus_amt'] = emp_df_2b['Bonus_amt'].sub(100)
emp_df_2b

Unnamed: 0,Company,Job,Salary,Hike_amt,WFH_Status,Gender,Bonus_amt
0,Google,Machine Learning Engineer,123000,18450.0,1,M,18450.0
1,Google,Data Scientist,111000,16650.0,1,M,22100.0
2,Google,Tech Lead,131000,19650.0,1,M,13100.0
3,Facebook,Data Scientist,105000,15750.0,1,M,20900.0
