# Learning Pandas 1 - Introduction
* From reading datasets to how to deal with missing values.

In [1]:
# Importing Pandas library:

import pandas as pd

In [2]:
# Reading a .csv database. We'll be using one year of house sales data from a US county.
# *sep* refers to separator, *header* to column headers position.
# If *header* is not defined, numbers will be assigned automatically.

file = 'kc_house_data.csv'
dataset = pd.read_csv(file, sep=',' ,header=0)

In [3]:
# Finding out the type of the dataset.
# In a *dataframe*, lines can have columns with different types of data, like strings, integers, etc.

type(dataset)

pandas.core.frame.DataFrame

In [4]:
# *head()* prints the first lines of the dataset, 5 by default.

dataset.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
# *tail()* does the same for the last lines, also 5 by default.

dataset.tail()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
21608,263000018,20140521T000000,360000.0,3.0,2.5,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4.0,2.5,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3.0,2.5,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287
21612,1523300157,20141015T000000,325000.0,2.0,0.75,1020,1076,2.0,0,0,...,7,1020,0,2008,0,98144,47.5941,-122.299,1020,1357


In [6]:
# *sample()* gives us a randomized sample from the dataset, just 1 by default.

dataset.sample()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
9132,1471700410,20150506T000000,310000.0,7.0,1.5,2660,15111,1.5,0,0,...,7,2660,0,1962,0,98059,47.4644,-122.066,1710,15429


In [7]:
# *count()* returns the number of lines below all columns:

dataset.count()

id               21613
date             21613
price            21613
bedrooms         21609
bathrooms        21613
sqft_living      21613
sqft_lot         21613
floors           21612
waterfront       21613
view             21613
condition        21613
grade            21613
sqft_above       21613
sqft_basement    21613
yr_built         21613
yr_renovated     21613
zipcode          21613
lat              21613
long             21613
sqft_living15    21613
sqft_lot15       21613
dtype: int64

In [8]:
# *Columns* attribute returns names of column headers:

dataset.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [9]:
# *describe()* prints statistical information about the database:

dataset.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21609.0,21613.0,21613.0,21613.0,21612.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.37091,2.114757,2079.899736,15106.97,1.494332,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930084,0.770163,918.440897,41420.51,0.539991,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [10]:
# *Shape* attribute returns number of dataset lines and columns in tuple format:

dataset.shape

(21613, 21)

In [11]:
# *info()* returns information about columns and memory use:

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21609 non-null  float64
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21612 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [12]:
# It's important to manage memory use with large files.
# One way to do so is by reading a limited portion of the dataset at a time.
# We can split the data into *chunks* for this purpose.
# *chunksize()* parameter defines number of lines per chunk.

chunk = pd.read_csv(file, chunksize=10000)

In [13]:
type(chunk)

pandas.io.parsers.TextFileReader

In [14]:
# Checking chunk size using length method:

for part in chunk:
    print (len(part))

10000
10000
1613


In [15]:
# Working with correct data types also helps with memory use.
# Let's work with a different database for this example, the Titanic passenger manifest.

manifest = pd.read_csv("titanic.csv")

In [16]:
manifest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
# Special attention to *object* types (strings in "vanilla" Python).
# Certain *objects* can be altered to *categories* (finite lists) for memory optimization.
# 64-bit integers can also be changed to 32-bit for better memory use.

manifest.Sex = manifest.Sex.astype('category')
manifest.Embarked = manifest.Embarked.astype('category')
manifest.Survived = manifest.Survived.astype('category')
manifest.Pclass = manifest.Pclass.astype('category')
manifest.PassengerId = manifest.PassengerId.astype('int32')
manifest.Parch = manifest.Parch.astype('int32')
manifest.SibSp = manifest.SibSp.astype('int32')

In [18]:
manifest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int32   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int32   
 7   Parch        891 non-null    int32   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    category
dtypes: category(4), float64(2), int32(3), object(3)
memory usage: 49.2+ KB


In [19]:
# Almost 50% less memory use! (83.7+ KB to 49.2+ KB)

In [20]:
# We can run all types of queries on data using logical operators, much like in SQL.
# Let's do some examples. Going back to the house sales data, how many unique values are there for bedrooms?

pd.value_counts(dataset['bedrooms'])

3.0     9822
4.0     6881
2.0     2759
5.0     1601
6.0      272
1.0      199
7.0       38
8.0       13
0.0       13
9.0        6
10.0       3
11.0       1
33.0       1
Name: bedrooms, dtype: int64

In [21]:
# *loc()* accesses a group of rows/columns using a parameter (label, list/array of labels or boolean array)
# Query for 3-bedroom houses:

dataset.loc[dataset['bedrooms'] == 3]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
6,1321400060,20140627T000000,257500.0,3.0,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
7,2008000270,20150115T000000,291850.0,3.0,1.50,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21603,7852140040,20140825T000000,507250.0,3.0,2.50,2270,5536,2.0,0,0,...,8,2270,0,2003,0,98065,47.5389,-121.881,2270,5731
21604,9834201367,20150126T000000,429000.0,3.0,2.00,1490,1126,3.0,0,0,...,8,1490,0,2014,0,98144,47.5699,-122.288,1400,1230
21607,2997800021,20150219T000000,475000.0,3.0,2.50,1310,1294,2.0,0,0,...,8,1180,130,2008,0,98116,47.5773,-122.409,1330,1265
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509


In [22]:
# We can use "&" operator to combine parameters.
# Query for 3-bedroom houses with 2 or more bathrooms:

dataset.loc[(dataset['bedrooms']==3) & (dataset['bathrooms'] >= 2)]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
6,1321400060,20140627T000000,257500.0,3.0,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
9,3793500160,20150312T000000,323000.0,3.0,2.50,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570
10,1736800520,20150403T000000,662500.0,3.0,2.50,3560,9796,1.0,0,0,...,8,1860,1700,1965,0,98007,47.6007,-122.145,2210,8925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21603,7852140040,20140825T000000,507250.0,3.0,2.50,2270,5536,2.0,0,0,...,8,2270,0,2003,0,98065,47.5389,-121.881,2270,5731
21604,9834201367,20150126T000000,429000.0,3.0,2.00,1490,1126,3.0,0,0,...,8,1490,0,2014,0,98144,47.5699,-122.288,1400,1230
21607,2997800021,20150219T000000,475000.0,3.0,2.50,1310,1294,2.0,0,0,...,8,1180,130,2008,0,98116,47.5773,-122.409,1330,1265
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509


In [23]:
# *sort_values()* orders dataset by given parameter.
# Order by ascending price:

dataset.sort_values(by='price', ascending=True)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
1149,3421079032,20150217T000000,75000.0,1.0,0.00,670,43377,1.0,0,0,...,3,670,0,1966,0,98022,47.2638,-121.906,1160,42882
15293,40000362,20140506T000000,78000.0,2.0,1.00,780,16344,1.0,0,0,...,5,780,0,1942,0,98168,47.4739,-122.280,1700,10387
465,8658300340,20140523T000000,80000.0,1.0,0.75,430,5050,1.0,0,0,...,4,430,0,1912,0,98014,47.6499,-121.909,1200,7500
16198,3028200080,20150324T000000,81000.0,2.0,1.00,730,9975,1.0,0,0,...,5,730,0,1943,0,98168,47.4808,-122.315,860,9000
8274,3883800011,20141105T000000,82000.0,3.0,1.00,860,10426,1.0,0,0,...,6,860,0,1954,0,98146,47.4987,-122.341,1140,11250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448,8907500070,20150413T000000,5350000.0,5.0,5.00,8000,23985,2.0,0,4,...,12,6720,1280,2009,0,98004,47.6232,-122.220,4600,21750
4411,2470100110,20140804T000000,5570000.0,5.0,5.75,9200,35069,2.0,0,0,...,13,6200,3000,2001,0,98039,47.6289,-122.233,3560,24345
9254,9208900037,20140919T000000,6885000.0,6.0,7.75,9890,31374,2.0,0,4,...,13,8860,1030,2001,0,98039,47.6305,-122.240,4540,42730
3914,9808700762,20140611T000000,7062500.0,5.0,4.50,10040,37325,2.0,1,2,...,11,7680,2360,1940,2001,98004,47.6500,-122.214,3930,25449


In [24]:
# We can alter the dataset itself as well.
# Let's add an example column converting lot size to square meters and visualize the first 5 rows:

dataset['sqmt_lot'] = (dataset['sqft_lot'] * 0.092903)
dataset['sqmt_lot'].head()

0    524.901950
1    672.803526
2    929.030000
3    464.515000
4    750.656240
Name: sqmt_lot, dtype: float64

In [25]:
# Creating a classification function:

def categorize(s):
    if s >= 1000:
        return 'Large'
    elif s >= 500:
        return 'Medium'
    elif s >= 250:
        return 'Small'

In [26]:
# Adding a column with the classification labels and visualizing the first 5 rows:

dataset['cat_size'] = dataset['sqmt_lot'].apply(categorize)
dataset['cat_size'].head()

0    Medium
1    Medium
2    Medium
3     Small
4    Medium
Name: cat_size, dtype: object

In [27]:
# Looking at the distribution of new values with *value_counts()*:

pd.value_counts(dataset['cat_size'])

Medium    10214
Large      5333
Small      4460
Name: cat_size, dtype: int64

In [28]:
# To exclude data, we use *drop()*.
# Parameter "axis=1" defines that we want to exclude a column and not a row (index labels).
# Parameter "inplace=True" makes the change in memory, returning a copy of the dataset (no alteration on the original).

dataset.drop(['cat_size'], axis=1, inplace=True)
dataset.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqmt_lot
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,524.90195
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,672.803526
2,5631500400,20150225T000000,180000.0,2.0,1.0,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,929.03
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,464.515
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,750.65624


In [29]:
# We can drop data according to logical conditions as well.
# For example, let's exclude items with no bedrooms.

dataset.drop(dataset[dataset.bedrooms==0].index, inplace=True)

In [30]:
# *iterrows()* is a generator function that iterates over dataset rows.

dataset.iterrows()

<generator object DataFrame.iterrows at 0x00000212F95EBAC8>

In [31]:
# Using *next()* we can manually iterate through each item:

next(dataset.iterrows())

(0,
 id                    7129300520
 date             20141013T000000
 price                     221900
 bedrooms                       3
 bathrooms                      1
 sqft_living                 1180
 sqft_lot                    5650
 floors                         1
 waterfront                     0
 view                           0
 condition                      3
 grade                          7
 sqft_above                  1180
 sqft_basement                  0
 yr_built                    1955
 yr_renovated                   0
 zipcode                    98178
 lat                      47.5112
 long                    -122.257
 sqft_living15               1340
 sqft_lot15                  5650
 sqmt_lot                 524.902
 Name: 0, dtype: object)

In [32]:
# Example for iterating through a set of rows:

for index, row in dataset.head(10).iterrows():
     print(index, row)

0 id                    7129300520
date             20141013T000000
price                     221900
bedrooms                       3
bathrooms                      1
sqft_living                 1180
sqft_lot                    5650
floors                         1
waterfront                     0
view                           0
condition                      3
grade                          7
sqft_above                  1180
sqft_basement                  0
yr_built                    1955
yr_renovated                   0
zipcode                    98178
lat                      47.5112
long                    -122.257
sqft_living15               1340
sqft_lot15                  5650
sqmt_lot                 524.902
Name: 0, dtype: object
1 id                    6414100192
date             20141209T000000
price                     538000
bedrooms                       3
bathrooms                   2.25
sqft_living                 2570
sqft_lot                    7242
floors          

In [33]:
# Showing only selected columns:

for index, row in dataset.head(10).iterrows():
     print(index, row['id'], row['yr_built'], row['sqmt_lot'])

0 7129300520 1955 524.9019499999999
1 6414100192 1951 672.803526
2 5631500400 1933 929.03
3 2487200875 1965 464.515
4 1954400510 1987 750.65624
5 7237550310 2001 9469.60279
6 1321400060 1995 633.505557
7 2008000270 1963 902.181033
8 2414600126 1960 693.98541
9 3793500160 2003 609.44368


In [34]:
# It's possible to use *iterrows()* to alter data as well. Let's simulate a 15% price discount.
# Reviewing the first 5 rows before change:

dataset.price.head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [35]:
# Note the "at()" method:

for index, row in dataset.iterrows():
    dataset.at[index , 'price'] = row['price'] * 0.85

In [36]:
# Result:

dataset.price.head()

0    188615.0
1    457300.0
2    153000.0
3    513400.0
4    433500.0
Name: price, dtype: float64

In [37]:
# Alternatively, we can *itertuples()* to return values as tuples instead of lists.
# This function is generally faster than *iterrows()*.

for row in dataset.head().itertuples():
    print(row)

Pandas(Index=0, id=7129300520, date='20141013T000000', price=188615.0, bedrooms=3.0, bathrooms=1.0, sqft_living=1180, sqft_lot=5650, floors=1.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=1180, sqft_basement=0, yr_built=1955, yr_renovated=0, zipcode=98178, lat=47.5112, long=-122.257, sqft_living15=1340, sqft_lot15=5650, sqmt_lot=524.9019499999999)
Pandas(Index=1, id=6414100192, date='20141209T000000', price=457300.0, bedrooms=3.0, bathrooms=2.25, sqft_living=2570, sqft_lot=7242, floors=2.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=2170, sqft_basement=400, yr_built=1951, yr_renovated=1991, zipcode=98125, lat=47.721000000000004, long=-122.319, sqft_living15=1690, sqft_lot15=7639, sqmt_lot=672.803526)
Pandas(Index=2, id=5631500400, date='20150225T000000', price=153000.0, bedrooms=2.0, bathrooms=1.0, sqft_living=770, sqft_lot=10000, floors=1.0, waterfront=0, view=0, condition=3, grade=6, sqft_above=770, sqft_basement=0, yr_built=1933, yr_renovated=0, zipcode=98028

In [38]:
# Printing rows by column name with *itertuples()*:

for row in dataset.head().itertuples():
    print(row.id, row.bedrooms, row.price)

7129300520 3.0 188615.0
6414100192 3.0 457300.0
5631500400 2.0 153000.0
2487200875 4.0 513400.0
1954400510 3.0 433500.0


In [39]:
# Missing values are a common problem. They can affect data analysis and machine learning algorithms.
# Luckily, there are many ways to deal with this. Let's start by identifying where they are:

dataset.isnull().sum()

id               0
date             0
price            0
bedrooms         4
bathrooms        0
sqft_living      0
sqft_lot         0
floors           1
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
sqmt_lot         0
dtype: int64

In [40]:
# *dropna()* works as a shortcut to remove rows where at least one value is missing:

dataset.dropna(inplace=True)

In [41]:
# We can also remove only those rows or columns where every value is missing:

dataset.dropna(how='all', inplace=True)

In [42]:
# Alternatively, we can fill up the missing values in various ways.
# Simply replacing the null values for "bedrooms" with "1":

dataset['bedrooms'].fillna(1, inplace=True)

In [43]:
# Replacing null "floors" values with an average:

dataset['floors'].fillna(dataset['floors'].mean(), inplace=True)