# Five Simple Pandas Tricks

In [1]:
import pandas as pd
import numpy as np

### Unique Values in Each Column

In [2]:
# Load the housing data from 'house.csv' (relative path, resolved against the
# current working directory) into a DataFrame.
df = pd.read_csv('house.csv')

In [3]:
# Count the distinct (non-null) values of every column in one pass, then
# print one "<column name> <count>" line per column.
for name, n_unique in df.nunique().items():
    print(name, n_unique)

bedrooms 9
bathrooms 24
sqft_living 568
sqft_lot 3778
yr_built 116
zipcode 70
lat 3372
long 623
price 1764


### Distance Formula

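This formula is the equirectangular approximation of the shortest distance between two points, given their latitudes and longitudes. It treats the small patch of the Earth's surface between the two points as flat, so it is accurate only over short distances.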

In [4]:
# calculating distance
# Seattle = 47.608013, -122.335167
# approx one degree of latitude ellipsoidal earth at 47N: 69.08 miles
SEATTLE_LAT = 47.608013
SEATTLE_LONG = -122.335167
MILES_PER_DEGREE_LAT = 69.08


def distance(Lat2, Long2, Lat1=SEATTLE_LAT, Long1=SEATTLE_LONG,
             miles_per_degree=MILES_PER_DEGREE_LAT):
    """Equirectangular approximation of the distance between two points.

    Parameters
    ----------
    Lat2, Long2 : float or array-like
        Latitude and longitude of the destination, in decimal degrees.
        Scalars, NumPy arrays and pandas Series all work because only
        arithmetic and NumPy ufuncs are applied.
    Lat1, Long1 : float, optional
        Latitude and longitude of the origin, in decimal degrees.
        Defaults to Seattle.
    miles_per_degree : float, optional
        Length of one degree of latitude. The default (69.08 miles) is the
        value near 47N; pass a different figure for other latitudes or units.

    Returns
    -------
    float or array-like
        Approximate distance, in the unit implied by ``miles_per_degree``.
    """
    x = Lat2 - Lat1
    # A degree of longitude shrinks with the cosine of the latitude; use the
    # mean latitude of the two points, converted from degrees to radians.
    y = (Long2 - Long1) * np.cos((Lat2 + Lat1) * (0.5 * np.pi / 180))
    return miles_per_degree * np.sqrt(x*x + y*y)


distance(47.5112, -122.257)

7.616113362022828

In [5]:
# distance() uses only arithmetic and NumPy ufuncs, so it accepts whole
# columns at once. Passing the Series directly computes every row in a single
# vectorized call instead of one Python-level call per row via df.apply.
df['Miles_Seattle'] = distance(df['lat'], df['long'])
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price,Miles_Seattle
0,3,1.0,1180,5650,1955,98178,47.5112,-122.257,221900,7.616113
1,3,2.25,2570,7242,1951,98125,47.721,-122.319,538000,7.841298
2,2,1.0,770,10000,1933,98028,47.7379,-122.233,180000,10.153451
3,4,3.0,1960,5000,1965,98136,47.5208,-122.393,604000,6.600281
4,3,2.0,1680,8080,1987,98074,47.6168,-122.045,510000,13.526635


### Height in Inches

In [6]:
# Load the height/weight data from 'height_weight.csv' into a DataFrame.
df_height_weight = pd.read_csv('height_weight.csv')

In [7]:
# Preview the first rows; 'Height' is stored as a 'feet-inches' string.
df_height_weight.head()

Unnamed: 0,Height,Weight
0,5-9,251
1,5-9,260
2,6-8,153
3,5-6,283
4,6-0,241


In [8]:
def _height_to_inches(height):
    """Convert a 'feet-inches' string such as '5-9' into total inches."""
    parts = height.split('-')
    return 12 * int(parts[0]) + int(parts[1])


# Apply the conversion element-wise to build the numeric height column.
df_height_weight['Height_in'] = df_height_weight['Height'].map(_height_to_inches)

In [9]:
# Preview the first rows again to check the new 'Height_in' column.
df_height_weight.head()

Unnamed: 0,Height,Weight,Height_in
0,5-9,251,69
1,5-9,260,69
2,6-8,153,80
3,5-6,283,66
4,6-0,241,72


### Finding Year in a Column

In [10]:
# Load the movie titles from 'titles.csv' into a DataFrame.
df_title = pd.read_csv('titles.csv')

In [11]:
# Preview the first rows; the year appears inside 'Title' in varying formats.
df_title.head()

Unnamed: 0,Title
0,A Scanner Darkly - 2006
1,Last Vegas (2013)
2,Baraka 1992
3,Slumdog Millionaire (2008)
4,The Motorcycle Diaries [2004]


In [12]:
# Raw string: '\d' is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on recent Python versions. The r-prefix
# passes the backslash through to the regex engine unchanged.
year_digits = r'(\d{4})'
# Capture the first run of four consecutive digits in each title.
# expand=False returns a Series for a single capture group. The extracted
# values are strings, and titles without a match get NaN.
df_title['year'] = df_title['Title'].str.extract(year_digits, expand=False)

In [13]:
# Preview the first rows again to check the extracted 'year' column.
df_title.head()

Unnamed: 0,Title,year
0,A Scanner Darkly - 2006,2006
1,Last Vegas (2013),2013
2,Baraka 1992,1992
3,Slumdog Millionaire (2008),2008
4,The Motorcycle Diaries [2004],2004


### Filling Missing Values

In [14]:
# Load the subscriber counts from 'subs.csv' into a DataFrame.
df_subs = pd.read_csv('subs.csv')

In [15]:
# Preview the first rows; the 'Disney+' column has missing values to fill.
df_subs.head()

Unnamed: 0,Date,Disney+
0,6/27/2020,57500000.0
1,7/4/2020,
2,7/11/2020,
3,7/18/2020,
4,7/25/2020,


In [16]:
# Parse the 'Date' column from text into pandas datetime values
df_subs['Date'] = df_subs['Date'].pipe(pd.to_datetime)

In [17]:
# Fill missing values by linear interpolation. limit_area='inside' restricts
# filling to NaNs that lie between two valid values, so nothing is
# extrapolated before the first or after the last known observation.
interpolation_options = {
    'method': 'linear',
    'limit_direction': 'both',
    'limit_area': 'inside',
}
df_subs = df_subs.interpolate(**interpolation_options)

In [19]:
# Preview the first rows to check the interpolated values at the start.
df_subs.head()

Unnamed: 0,Date,Disney+
0,2020-06-27,57500000.0
1,2020-07-04,58100000.0
2,2020-07-11,58700000.0
3,2020-07-18,59300000.0
4,2020-07-25,59900000.0


In [18]:
# Preview the last rows to check the interpolated values at the end.
df_subs.tail()

Unnamed: 0,Date,Disney+
22,2020-11-28,86800000.0
23,2020-12-05,88256250.0
24,2020-12-12,89712500.0
25,2020-12-19,91168750.0
26,2020-12-26,92625000.0
