# LinkedIn Learning
# Advanced Pandas

## 1. From Beginner to Advanced Pandas

### A. Intro to Dataframes using Pandas

In [1]:
import pandas as pd

In [2]:
scores = {
    "name" : ['Ray','Japhy','Zosa'],
    "city" : ['San Francisco', 'San Francisco', 'Denver'],
    "score" : [75,92,94]
}

In [3]:
df = pd.DataFrame(scores)

In [4]:
df

Unnamed: 0,name,city,score
0,Ray,San Francisco,75
1,Japhy,San Francisco,92
2,Zosa,Denver,94


In [6]:
df['score']

0    75
1    92
2    94
Name: score, dtype: int64

In [7]:
df['name_city'] = df['name'] + '_' + df['city']

In [8]:
df

Unnamed: 0,name,city,score,name_city
0,Ray,San Francisco,75,Ray_San Francisco
1,Japhy,San Francisco,92,Japhy_San Francisco
2,Zosa,Denver,94,Zosa_Denver


In [10]:
df[df['score'] > 90]

Unnamed: 0,name,city,score,name_city
1,Japhy,San Francisco,92,Japhy_San Francisco
2,Zosa,Denver,94,Zosa_Denver


### B. Top Functions using Pandas

In [12]:
iris = pd.read_csv('iris.csv')

In [13]:
iris.shape

(150, 5)

In [14]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [15]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [16]:
iris.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [17]:
iris.loc[3:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa


In [18]:
iris.loc[3, 'sepal_length']

4.6

In [19]:
iris.iloc[3,0]

4.6

In [20]:
iris.to_csv('iris-output.csv', index = False)

### C. Configuring options using Pandas

In [26]:
emissions = pd.DataFrame({
    "country" : ['China', 'United States', 'India'],
    "year" : ['2018', '2018', '2018'],
    "co2_emissions" : [10060000000.0, 5410000000.0, 2650000000.0]
})

In [27]:
emissions

Unnamed: 0,country,year,co2_emissions
0,China,2018,10060000000.0
1,United States,2018,5410000000.0
2,India,2018,2650000000.0


In [32]:
pd.options.display.float_format = '{:,.2f}'.format

In [33]:
emissions

Unnamed: 0,country,year,co2_emissions
0,China,2018,10060000000.0
1,United States,2018,5410000000.0
2,India,2018,2650000000.0


## 2. Advanced Calculations

### A. Data type conversions using pandas

In [34]:
planets = pd.read_csv('planets.csv')

In [35]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.77,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [36]:
planets.tail()

Unnamed: 0,method,number,orbital_period,mass,distance,year
1030,Transit,1,3.94,,172.0,2006
1031,Transit,1,2.62,,148.0,2007
1032,Transit,1,3.19,,174.0,2007
1033,Transit,1,4.13,,293.0,2008
1034,Transit,1,4.19,,260.0,2008


In [37]:
planets.dtypes

method             object
number              int64
orbital_period    float64
mass              float64
distance          float64
year                int64
dtype: object

In [40]:
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.79,2002.92,2.64,264.07,2009.07
std,1.24,26014.73,3.82,733.12,3.97
min,1.0,0.09,0.0,1.35,1989.0
25%,1.0,5.44,0.23,32.56,2007.0
50%,1.0,39.98,1.26,55.25,2010.0
75%,2.0,526.0,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [41]:
planets['number'][0] / planets['mass'][0] 

0.14084507042253522

In [42]:
planets['number'][0].astype(float)

1.0

In [43]:
planets['mass'][0] .astype(int)

7

In [44]:
planets['year'][0].astype(str)

'2006'

In [45]:
planets['year_dt'] = pd.to_datetime(planets['year'], format="%Y")
planets['year_dt']

0      2006-01-01
1      2008-01-01
2      2011-01-01
3      2007-01-01
4      2009-01-01
          ...    
1030   2006-01-01
1031   2007-01-01
1032   2007-01-01
1033   2008-01-01
1034   2008-01-01
Name: year_dt, Length: 1035, dtype: datetime64[ns]

### B. Working with strings using pandas

In [46]:
names = pd.Series(['Pomray, CODY ',' Wagner: Jarry','smith, Ray'])

In [47]:
names = names.str.replace(';',',')
names

0     Pomray, CODY 
1     Wagner: Jarry
2        smith, Ray
dtype: object

In [48]:
names.str.len()

0    13
1    14
2    10
dtype: int64

In [49]:
names = names.str.strip()
names.str.len()

0    12
1    13
2    10
dtype: int64

In [50]:
names = names.str.lower()
names

0     pomray, cody
1    wagner: jarry
2       smith, ray
dtype: object

In [51]:
names = names.str.split(', ')
names

0     [pomray, cody]
1    [wagner: jarry]
2       [smith, ray]
dtype: object

In [52]:
names = pd.Series([i[::-1] for i in names])
names

0     [cody, pomray]
1    [wagner: jarry]
2       [ray, smith]
dtype: object

In [53]:
names = [' '.join(i) for i in names]
names

['cody pomray', 'wagner: jarry', 'ray smith']

### C: Working with dates using pandas

In [65]:
daterange = pd.period_range('1/1/2025', freq = '30d', periods = 4)

In [66]:
date_df = pd.DataFrame(data = daterange, columns = ['sample date'])
date_df

Unnamed: 0,sample date
0,2025-01-01
1,2025-01-31
2,2025-03-02
3,2025-04-01


In [67]:
date_df['date difference'] = date_df['sample date'].diff(periods = 1)
date_df

  new_data = np.array([self.freq.base * x for x in new_i8_data])


Unnamed: 0,sample date,date difference
0,2025-01-01,NaT
1,2025-01-31,<30 * Days>
2,2025-03-02,<30 * Days>
3,2025-04-01,<30 * Days>


In [68]:
date_df['First of Month'] = date_df['sample date'].values.astype('datetime64[M]')
date_df

Unnamed: 0,sample date,date difference,First of Month
0,2025-01-01,NaT,2025-01-01
1,2025-01-31,<30 * Days>,2025-01-01
2,2025-03-02,<30 * Days>,2025-03-01
3,2025-04-01,<30 * Days>,2025-04-01


In [69]:
date_df.dtypes

sample date          period[30D]
date difference           object
First of Month     datetime64[s]
dtype: object

In [70]:
date_df['sample date'] = date_df['sample date'].dt.to_timestamp()

In [71]:
date_df.dtypes

sample date        datetime64[ns]
date difference            object
First of Month      datetime64[s]
dtype: object

In [72]:
date_df['sample date'] - date_df['First of Month']

0    0 days
1   30 days
2    1 days
3    0 days
dtype: timedelta64[ns]

In [73]:
date_df['sample date'] - date_df['date difference']

  date_df['sample date'] - date_df['date difference']


0                    NaT
1    2025-01-01 00:00:00
2    2025-01-31 00:00:00
3    2025-03-02 00:00:00
dtype: object

In [74]:
date_df['sample date'] - pd.Timedelta('30 d')

0   2024-12-02
1   2025-01-01
2   2025-01-31
3   2025-03-02
Name: sample date, dtype: datetime64[ns]

In [75]:
date_df['sample date'].dt.day_name()

0    Wednesday
1       Friday
2       Sunday
3      Tuesday
Name: sample date, dtype: object

### D. Dealing with missing data using Pandas

In [76]:
temps = pd.DataFrame({
    "sequence" : [1,2,3,4,5],
    "measurement_type" : ['actual','actual','actual',None,'estimated'],
    "temperature_f" : [67.24, 84.56, 91.61, None, 49.64]
})
temps

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67.24
1,2,actual,84.56
2,3,actual,91.61
3,4,,
4,5,estimated,49.64


In [77]:
temps.isna()

Unnamed: 0,sequence,measurement_type,temperature_f
0,False,False,False
1,False,False,False
2,False,False,False
3,False,True,True
4,False,False,False


In [78]:
temps['temperature_f'].cumsum()

0    67.24
1   151.80
2   243.41
3      NaN
4   293.05
Name: temperature_f, dtype: float64

In [79]:
temps['temperature_f'].cumsum(skipna = False)

0    67.24
1   151.80
2   243.41
3      NaN
4      NaN
Name: temperature_f, dtype: float64

In [80]:
temps.groupby(by=['measurement_type']).max()

Unnamed: 0_level_0,sequence,temperature_f
measurement_type,Unnamed: 1_level_1,Unnamed: 2_level_1
actual,3,91.61
estimated,5,49.64


In [83]:
temps.groupby(by=['measurement_type'], dropna = False).max()

Unnamed: 0_level_0,sequence,temperature_f
measurement_type,Unnamed: 1_level_1,Unnamed: 2_level_1
actual,3,91.61
estimated,5,49.64
,4,


In [81]:
temps.dropna()

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67.24
1,2,actual,84.56
2,3,actual,91.61
4,5,estimated,49.64


In [82]:
temps.dropna(axis=1)

Unnamed: 0,sequence
0,1
1,2
2,3
3,4
4,5


In [84]:
temps.dropna()

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67.24
1,2,actual,84.56
2,3,actual,91.61
4,5,estimated,49.64


In [85]:
temps.dropna(axis=1)

Unnamed: 0,sequence
0,1
1,2
2,3
3,4
4,5


In [86]:
temps.fillna(0)

Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67.24
1,2,actual,84.56
2,3,actual,91.61
3,4,0,0.0
4,5,estimated,49.64


In [87]:
temps.fillna(method = 'pad')

  temps.fillna(method = 'pad')


Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67.24
1,2,actual,84.56
2,3,actual,91.61
3,4,actual,91.61
4,5,estimated,49.64


In [89]:
temps.interpolate()

  temps.interpolate()


Unnamed: 0,sequence,measurement_type,temperature_f
0,1,actual,67.24
1,2,actual,84.56
2,3,actual,91.61
3,4,,70.62
4,5,estimated,49.64


### E. Apply/Map/Applymap

In [90]:
df = pd.DataFrame({
    "Region" : ['North', 'West', 'East', 'South', 'North', 'West', 'East', 'South'],
    "Team" : ['One','One','One','One','Two','Two','Two','Two'],
    "Squad" : ['A','B','C','D','E','F','G','H'],
    "Revenue" : [7500, 5500, 2750, 6400, 2300, 3750, 1900, 575],
    "Cost" : [5200, 5100, 4400, 5300, 1250, 1300, 2100, 50]
})

In [91]:
df['Profit'] = df.apply(lambda x: 'Profit' if x['Revenue']>x['Cost'] else 'Loss', axis = 1)
df

Unnamed: 0,Region,Team,Squad,Revenue,Cost,Profit
0,North,One,A,7500,5200,Profit
1,West,One,B,5500,5100,Profit
2,East,One,C,2750,4400,Loss
3,South,One,D,6400,5300,Profit
4,North,Two,E,2300,1250,Profit
5,West,Two,F,3750,1300,Profit
6,East,Two,G,1900,2100,Loss
7,South,Two,H,575,50,Profit


In [92]:
team_map = {"One" : "Red", "Two" : "Blue"}

In [93]:
df['Team Color'] = df['Team'].map(team_map)
df

Unnamed: 0,Region,Team,Squad,Revenue,Cost,Profit,Team Color
0,North,One,A,7500,5200,Profit,Red
1,West,One,B,5500,5100,Profit,Red
2,East,One,C,2750,4400,Loss,Red
3,South,One,D,6400,5300,Profit,Red
4,North,Two,E,2300,1250,Profit,Blue
5,West,Two,F,3750,1300,Profit,Blue
6,East,Two,G,1900,2100,Loss,Blue
7,South,Two,H,575,50,Profit,Blue


In [94]:
df.applymap(lambda x: len(str(x)))

  df.applymap(lambda x: len(str(x)))


Unnamed: 0,Region,Team,Squad,Revenue,Cost,Profit,Team Color
0,5,3,1,4,4,6,3
1,4,3,1,4,4,6,3
2,4,3,1,4,4,4,3
3,5,3,1,4,4,6,3
4,5,3,1,4,4,6,4
5,4,3,1,4,4,6,4
6,4,3,1,4,4,4,4
7,5,3,1,3,2,6,4


In [95]:
new_col = []

for i in range(0, len(df)):
    rev = df['Revenue'][i]/df[df['Region'] == df.loc[i,'Region']]['Revenue'].sum()
    new_col.append(rev)

In [96]:
df['Revenue Share of Region'] = new_col

In [97]:
df.sort_values(by = 'Region')

Unnamed: 0,Region,Team,Squad,Revenue,Cost,Profit,Team Color,Revenue Share of Region
2,East,One,C,2750,4400,Loss,Red,0.59
6,East,Two,G,1900,2100,Loss,Blue,0.41
0,North,One,A,7500,5200,Profit,Red,0.77
4,North,Two,E,2300,1250,Profit,Blue,0.23
3,South,One,D,6400,5300,Profit,Red,0.92
7,South,Two,H,575,50,Profit,Blue,0.08
1,West,One,B,5500,5100,Profit,Red,0.59
5,West,Two,F,3750,1300,Profit,Blue,0.41
