In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset; 'house.csv' is resolved relative to the current working directory.
df = pd.read_csv('house.csv')

In [16]:
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
0,3,1.0,1180,5650,1955,98178,47.5112,-122.257,221900
1,3,2.25,2570,7242,1951,98125,47.721,-122.319,538000
2,2,1.0,770,10000,1933,98028,47.7379,-122.233,180000
3,4,3.0,1960,5000,1965,98136,47.5208,-122.393,604000
4,3,2.0,1680,8080,1987,98074,47.6168,-122.045,510000


### Basic Info

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6007 entries, 0 to 6006
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bedrooms     6007 non-null   int64  
 1   bathrooms    6007 non-null   float64
 2   sqft_living  6007 non-null   int64  
 3   sqft_lot     6007 non-null   int64  
 4   yr_built     6007 non-null   int64  
 5   zipcode      6007 non-null   int64  
 6   lat          6007 non-null   float64
 7   long         6007 non-null   float64
 8   price        6007 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 422.5 KB


In [4]:
df.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,zipcode,lat,long,price
count,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0,6007.0
mean,3.376228,2.067088,2064.112202,15975.09,1966.936075,98077.920926,47.560244,-122.214033,537871.7
std,0.896744,0.763731,914.197376,43912.24,28.250012,53.955535,0.139884,0.140329,377463.7
min,1.0,0.5,380.0,572.0,1900.0,98001.0,47.1559,-122.514,80000.0
25%,3.0,1.5,1410.0,5412.5,1949.0,98033.0,47.4629,-122.327,319450.0
50%,3.0,2.0,1910.0,7912.0,1969.0,98065.0,47.5742,-122.229,450000.0
75%,4.0,2.5,2510.0,11227.5,1990.0,98118.0,47.67945,-122.1265,650000.0
max,9.0,6.75,10040.0,1651359.0,2015.0,98199.0,47.7776,-121.315,7060000.0


### Slice

In [10]:
df.iloc[0:5, 1:4]  # positional slice: rows 0-4, columns 1-3 (the stop index is excluded)

Unnamed: 0,bathrooms,sqft_living,sqft_lot
0,1.0,1180,5650
1,2.25,2570,7242
2,1.0,770,10000
3,3.0,1960,5000
4,2.0,1680,8080


In [19]:
df.iloc[0:5:2, 1:4]  # [start:stop:step] -> rows 0, 2 and 4; columns 1-3

Unnamed: 0,bathrooms,sqft_living,sqft_lot
0,1.0,1180,5650
2,1.0,770,10000
4,2.0,1680,8080


In [23]:
df.loc[:,'bathrooms':'zipcode'] # label-based slice: all rows, columns bathrooms through zipcode (unlike iloc, the end label is included)

Unnamed: 0,bathrooms,sqft_living,sqft_lot,yr_built,zipcode
0,1.00,1180,5650,1955,98178
1,2.25,2570,7242,1951,98125
2,1.00,770,10000,1933,98028
3,3.00,1960,5000,1965,98136
4,2.00,1680,8080,1987,98074
...,...,...,...,...,...
6002,1.00,1450,9586,1950,98155
6003,2.25,2870,6280,1905,98112
6004,2.00,1870,13970,1969,98027
6005,3.50,2490,18042,2003,98070


In [30]:
# Work on an independent float64 copy of the column. The next cell writes NaN
# into it: NaN cannot be stored in the original int64 dtype (recent pandas
# deprecates the silent upcast), and whether a bare .loc selection shares memory
# with df depends on the pandas version, so the explicit copy keeps df untouched.
beds = df.loc[:,'bedrooms'].astype('float64')

#### Broadcast

In [31]:
beds.iloc[0:10:3] = np.nan  # one scalar broadcast to rows 0, 3, 6 and 9 (every 3rd row of the first 10)

In [33]:
beds.head(15)

0     NaN
1     3.0
2     2.0
3     NaN
4     3.0
5     4.0
6     NaN
7     3.0
8     3.0
9     NaN
10    3.0
11    2.0
12    3.0
13    3.0
14    5.0
Name: bedrooms, dtype: float64

#### Building df from lists

In [35]:
# Select the bathrooms column as a Series (all rows, single column label).
bath = df.loc[:,'bathrooms']

In [36]:
columns = [beds, bath] # column data: one Series per column, collected in a list

In [37]:
labels = ['bedrooms', 'bathrooms'] # column headings, listed in the same order as the Series in `columns`

In [38]:
zipped = list(zip(labels,columns)) # pair each heading with its column data: [(label, Series), ...]

In [43]:
zipped

[('bedrooms',
  0       NaN
  1       3.0
  2       2.0
  3       NaN
  4       3.0
         ... 
  6002    3.0
  6003    4.0
  6004    5.0
  6005    4.0
  6006    4.0
  Name: bedrooms, Length: 6007, dtype: float64),
 ('bathrooms',
  0       1.00
  1       2.25
  2       1.00
  3       3.00
  4       2.00
          ... 
  6002    1.00
  6003    2.25
  6004    2.00
  6005    3.50
  6006    2.50
  Name: bathrooms, Length: 6007, dtype: float64)]

In [39]:
bedbath = dict(zipped) # turn the (label, Series) pairs into a {label: Series} dictionary

In [40]:
bedbath

{'bedrooms': 0       NaN
 1       3.0
 2       2.0
 3       NaN
 4       3.0
        ... 
 6002    3.0
 6003    4.0
 6004    5.0
 6005    4.0
 6006    4.0
 Name: bedrooms, Length: 6007, dtype: float64,
 'bathrooms': 0       1.00
 1       2.25
 2       1.00
 3       3.00
 4       2.00
         ... 
 6002    1.00
 6003    2.25
 6004    2.00
 6005    3.50
 6006    2.50
 Name: bathrooms, Length: 6007, dtype: float64}

In [41]:
df_bedbath = pd.DataFrame(data=bedbath)  # each dict key becomes a column name, each Series its values

In [42]:
df_bedbath

Unnamed: 0,bedrooms,bathrooms
0,,1.00
1,3.0,2.25
2,2.0,1.00
3,,3.00
4,3.0,2.00
...,...,...
6002,3.0,1.00
6003,4.0,2.25
6004,5.0,2.00
6005,4.0,3.50


In [44]:
df_bedbath['zeroes'] = 0 # new column: the single scalar 0 is broadcast to every row

In [45]:
df_bedbath.head()

Unnamed: 0,bedrooms,bathrooms,zeroes
0,,1.0,0
1,3.0,2.25,0
2,2.0,1.0,0
3,,3.0,0
4,3.0,2.0,0
