In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('house.csv')

In [3]:
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
0,3,1.0,1180,5650,1955,98178,47.5112,-122.257,221900
1,3,2.25,2570,7242,1951,98125,47.721,-122.319,538000
2,2,1.0,770,10000,1933,98028,47.7379,-122.233,180000
3,4,3.0,1960,5000,1965,98136,47.5208,-122.393,604000
4,3,2.0,1680,8080,1987,98074,47.6168,-122.045,510000


### Basic Info

In [4]:
df.shape #rows, columns

(6007, 9)

In [5]:
df.columns #column headings

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'yr_built',
       'zipcode', 'lat', 'long', 'price'],
      dtype='object')

In [6]:
df.index #rows

RangeIndex(start=0, stop=6007, step=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6007 entries, 0 to 6006
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bedrooms     6007 non-null   int64  
 1   bathrooms    6007 non-null   float64
 2   sqft_living  6007 non-null   int64  
 3   sqft_lot     6007 non-null   int64  
 4   yr_built     6007 non-null   int64  
 5   zipcode      6007 non-null   int64  
 6   lat          6007 non-null   float64
 7   long         6007 non-null   float64
 8   price        6007 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 422.5 KB


In [8]:
df.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
count,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0
mean,3.376228,2.067088,2064.112202,15975.09,1966.936075,98077.920926,47.560244,-122.214033,537871.7
std,0.896744,0.763731,914.197376,43912.24,28.250012,53.955535,0.139884,0.140329,377463.7
min,1.0,0.5,380.0,572.0,1900.0,98001.0,47.1559,-122.514,80000.0
25%,3.0,1.5,1410.0,5412.5,1949.0,98033.0,47.4629,-122.327,319450.0
50%,3.0,2.0,1910.0,7912.0,1969.0,98065.0,47.5742,-122.229,450000.0
75%,4.0,2.5,2510.0,11227.5,1990.0,98118.0,47.67945,-122.1265,650000.0
max,9.0,6.75,10040.0,1651359.0,2015.0,98199.0,47.7776,-121.315,7060000.0


### Slice

In [9]:
df.iloc[:5,1:4] # row 0-4, col 1-3

Unnamed: 0,bathrooms,sqft_living,sqft_lot
0,1.0,1180,5650
1,2.25,2570,7242
2,1.0,770,10000
3,3.0,1960,5000
4,2.0,1680,8080


In [10]:
df.iloc[:5:2,1:4] # row 0-4, col 1-3, every second row, [start:end:stride]

Unnamed: 0,bathrooms,sqft_living,sqft_lot
0,1.0,1180,5650
2,1.0,770,10000
4,2.0,1680,8080


In [11]:
df.loc[:,'bathrooms':'zipcode'] # using col names, all rows with cols from bathrooms to zipcode

Unnamed: 0,bathrooms,sqft_living,sqft_lot,yr_built,zipcode
0,1.00,1180,5650,1955,98178
1,2.25,2570,7242,1951,98125
2,1.00,770,10000,1933,98028
3,3.00,1960,5000,1965,98136
4,2.00,1680,8080,1987,98074
...,...,...,...,...,...
6002,1.00,1450,9586,1950,98155
6003,2.25,2870,6280,1905,98112
6004,2.00,1870,13970,1969,98027
6005,3.50,2490,18042,2003,98070


In [12]:
beds = df.loc[:,'bedrooms']

#### Broadcast

In [13]:
beds.iloc[:10:3] = np.nan #assigning nan to every 3rd row between 0 and 9

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [14]:
beds.head(15)

0     NaN
1     3.0
2     2.0
3     NaN
4     3.0
5     4.0
6     NaN
7     3.0
8     3.0
9     NaN
10    3.0
11    2.0
12    3.0
13    3.0
14    5.0
Name: bedrooms, dtype: float64

In [15]:
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
0,,1.0,1180,5650,1955,98178,47.5112,-122.257,221900
1,3.0,2.25,2570,7242,1951,98125,47.721,-122.319,538000
2,2.0,1.0,770,10000,1933,98028,47.7379,-122.233,180000
3,,3.0,1960,5000,1965,98136,47.5208,-122.393,604000
4,3.0,2.0,1680,8080,1987,98074,47.6168,-122.045,510000


#### Building df from lists

In [16]:
bath = df.loc[:,'bathrooms']

In [17]:
columns = [beds, bath] # column data as list

In [18]:
labels = ['bedrooms', 'bathrooms'] # column headings

In [19]:
zipped = list(zip(labels,columns)) #zip together

In [20]:
zipped

[('bedrooms',
  0       NaN
  1       3.0
  2       2.0
  3       NaN
  4       3.0
         ... 
  6002    3.0
  6003    4.0
  6004    5.0
  6005    4.0
  6006    4.0
  Name: bedrooms, Length: 6007, dtype: float64),
 ('bathrooms',
  0       1.00
  1       2.25
  2       1.00
  3       3.00
  4       2.00
          ... 
  6002    1.00
  6003    2.25
  6004    2.00
  6005    3.50
  6006    2.50
  Name: bathrooms, Length: 6007, dtype: float64)]

In [21]:
bedbath = dict(zipped) #create a dictionary

In [22]:
bedbath

{'bedrooms': 0       NaN
 1       3.0
 2       2.0
 3       NaN
 4       3.0
        ... 
 6002    3.0
 6003    4.0
 6004    5.0
 6005    4.0
 6006    4.0
 Name: bedrooms, Length: 6007, dtype: float64,
 'bathrooms': 0       1.00
 1       2.25
 2       1.00
 3       3.00
 4       2.00
         ... 
 6002    1.00
 6003    2.25
 6004    2.00
 6005    3.50
 6006    2.50
 Name: bathrooms, Length: 6007, dtype: float64}

In [23]:
df_bedbath = pd.DataFrame(bedbath) # create a dataframe

In [24]:
df_bedbath

Unnamed: 0,bedrooms,bathrooms
0,,1.00
1,3.0,2.25
2,2.0,1.00
3,,3.00
4,3.0,2.00
...,...,...
6002,3.0,1.00
6003,4.0,2.25
6004,5.0,2.00
6005,4.0,3.50


In [25]:
df_bedbath['zeroes'] = 0 #creating new column, filling with zeroes

In [26]:
df_bedbath.head()

Unnamed: 0,bedrooms,bathrooms,zeroes
0,,1.0,0
1,3.0,2.25,0
2,2.0,1.0,0
3,,3.0,0
4,3.0,2.0,0


In [27]:
df_bedbath.drop('zeroes', axis=1, inplace = True) #axis = 1 for columns, can also use columns=, or index= (for rows)

In [28]:
df_bedbath.head()

Unnamed: 0,bedrooms,bathrooms
0,,1.0
1,3.0,2.25
2,2.0,1.0
3,,3.0
4,3.0,2.0


### String 

In [29]:
names = {
    'Name' : ['John Smith', 'Eric Garcia', 'Monique Johnson'],
    'Age' : [41, 27, 59]
}

In [30]:
df_names = pd.DataFrame(names)

In [31]:
df_names

Unnamed: 0,Name,Age
0,John Smith,41
1,Eric Garcia,27
2,Monique Johnson,59


In [32]:
df_names.Name.str.upper() #upper case

0         JOHN SMITH
1        ERIC GARCIA
2    MONIQUE JOHNSON
Name: Name, dtype: object

In [33]:
df_names.Name.str.split(' ') #split on space

0         [John, Smith]
1        [Eric, Garcia]
2    [Monique, Johnson]
Name: Name, dtype: object

In [34]:
df_names.Name.str.strip('on') #remove string characters (also lstrip, rstrip)

0       John Smith
1      Eric Garcia
2    Monique Johns
Name: Name, dtype: object

In [35]:
df_names.Name.str.contains('Eric') #True if string contains value

0    False
1     True
2    False
Name: Name, dtype: bool

### Sorting

In [36]:
df.sort_values('bedrooms', ascending =  False)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
4231,9.0,3.00,3680,4400,1908,98102,47.6374,-122.324,700000
4092,9.0,4.50,3830,6988,1938,98103,47.6927,-122.338,599999
4031,8.0,6.00,4340,9415,1967,98004,47.6316,-122.202,2150000
1658,8.0,5.00,2800,2580,1997,98122,47.6086,-122.303,490000
4063,8.0,3.00,2850,12714,1959,98055,47.4859,-122.205,373000
...,...,...,...,...,...,...,...,...,...
3220,1.0,0.75,520,12981,1920,98022,47.2082,-121.995,262000
0,,1.00,1180,5650,1955,98178,47.5112,-122.257,221900
3,,3.00,1960,5000,1965,98136,47.5208,-122.393,604000
6,,2.25,1715,6819,1995,98003,47.3097,-122.327,257500


In [52]:
df['bedrooms'].value_counts(sort=True)

3.0    2769
4.0    1913
2.0     732
5.0     429
6.0      91
1.0      56
7.0       8
8.0       3
9.0       2
Name: bedrooms, dtype: int64

In [38]:
subset = df[['zipcode', 'price']]

In [39]:
subset.head()

Unnamed: 0,zipcode,price
0,98178,221900
1,98125,538000
2,98028,180000
3,98136,604000
4,98074,510000


In [40]:
subset['price'] > 1000000 #True if greater than 1M

0       False
1       False
2       False
3       False
4       False
        ...  
6002    False
6003     True
6004    False
6005    False
6006    False
Name: price, Length: 6007, dtype: bool

In [41]:
subset[subset['price'] > 1000000] #apply T/F condition to df

Unnamed: 0,zipcode,price
5,98053,1230000
21,98040,2000000
49,98070,1350000
69,98004,1330000
70,98005,1040000
...,...,...
5952,98040,1560000
5955,98053,1110000
5961,98144,2730000
5983,98116,1050000


In [44]:
condition1 = subset['price'] > 1000000
condition2 = subset['zipcode'] == 98040

In [51]:
df[condition1 & condition2 & (df['yr_built'] > 1999)] # multi conditions. using from subset and df

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
1679,4.0,2.5,3040,7000,2001,98040,47.5934,-122.244,1230000
1728,5.0,4.25,4830,11466,2014,98040,47.5774,-122.222,1680000
2459,5.0,5.0,4930,14649,2000,98040,47.5829,-122.247,1700000
2788,5.0,3.75,3530,13260,2013,98040,47.5761,-122.205,1610000
3037,4.0,4.25,4070,13860,2004,98040,47.59,-122.229,2300000
3867,5.0,4.75,6240,47480,2003,98040,47.5317,-122.233,2950000
5824,4.0,3.25,4060,13000,2000,98040,47.581,-122.246,1800000


### Aggregates