In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
df = pd.read_csv('../assets/input/train.csv')
weather = pd.read_csv('../assets/input/weather.csv')

## Climate Cleaning

In [3]:
# Calculate and fill in Tavg using half-up rounding to maintain parity with existing data.
weather['Tavg'] = np.ceil(((weather['Tmax'] + weather['Tmin']) / 2) - .4999).astype(int)

In [4]:
# If Tavg - Depart is the expected temperature for that day, that calculation can
# be used to fill in missing values for Station 2 using data from Station 1.

# In order to run the following list on the Depart series, it is necessary to
# homogenize the datatypes. By replacing the elements that keep it an object
# and casting it as a series of ints, we enable manipulation of the numerics.
weather['Depart'].replace('M', None, inplace=True)
weather['Depart'] = weather['Depart'].astype(int)

# Replace incorrect values for Depart for Station 2 with properly calculated
# values from Station 1 on the same days.
i = 1
while i < len(weather):
    normal = weather['Tavg'].iloc[i - 1] - weather['Depart'].iloc[i - 1]
    weather['Depart'].iloc[i] = weather['Tavg'].iloc[i] - normal
    i += 2
    
# Think of the RAM and dealloc!
del(normal, i)

In [20]:
# WetBulb doesn't have a clear way to calculate missing values, however the
# discrepencies between Station 1 and 2's measurements are within 2 degrees and
# often coincide directly. Since we are already resigned the fact that models
# are not perfect emulations of reality, I don't believe it to be unfair to
# suggest replacing the four missing values with the counterpart station's
# measurement.

# Just as with Depart, it is necessary to first homogenize the datatype of the
# series in order to manipulate it. First we need the index values of the cells
# in question.
ixlist = list(weather[weather['WetBulb'] == 'M'].index.values)

# Replace 'M's with None and typecast as int.
weather['WetBulb'].replace('M', None, inplace=True)
weather['WetBulb'] = weather['WetBulb'].astype(int)

# Using mod, it is easy to ensure the data is being copied from the correct day.
for i in ixlist:
    if i % 2 == 0:
        weather['WetBulb'].iloc[i] = weather['WetBulb'].iloc[i + 1]
    else:
        weather['WetBulb'].iloc[i] = weather['WetBulb'].iloc[i - 1]

# Liberate your memory!
del(ixlist)

In [50]:
# Heat and Cool are easily computable given that the base value is given at 65.
# First things first, replace and typecast:
weather['Heat'].replace('M', None, inplace=True)
weather['Cool'].replace('M', None, inplace=True)
weather['Heat'] = weather['Heat'].astype(int)
weather['Cool'] = weather['Cool'].astype(int)

# So it turns out that going through all the rows is more time consuming than
# anticipated. If you made the mistake of running this cell, go ahead and grab
# yourself a coffee.
for i in range(len(weather)):
    if weather['Tavg'].iloc[i] >= 65:
        weather['Heat'].iloc[i] = 0
        weather['Cool'].iloc[i] = weather['Tavg'].iloc[i] - 65
    else:
        weather['Cool'].iloc[i] = 0
        weather['Heat'].iloc[i] = 65 - weather['Tavg'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [60]:
# Sunrise and sunset are caluclated rather than observed and only missing for
# one station. Its just a simple transcription job. Replace and typecast:
weather['Sunrise'].replace('-', None, inplace = True)
weather['Sunset'].replace('-', None, inplace = True)
weather['Sunrise'] = weather['Sunrise'].astype(int)
weather['Sunset'] = weather['Sunset'].astype(int)

# Much like with Depart, we merely need to alter every other row
i = 1
while i < len(weather):
    weather['Sunrise'].iloc[i] = weather['Sunrise'].iloc[i - 1]
    weather['Sunset'].iloc[i] = weather['Sunset'].iloc[i - 1]
    i += 2

# Every little bit helps
del(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [66]:
# All depth and water values are 0. Since there is no overwriting at play here,
# there is no need to replace and typecast before reassigning.
weather['Depth'] = 0
weather['Water1'] = 0

In [None]:
# So far all the replacements made have been justifiable given the data. In the
# case of Snowfall, slight liberties will be taken. Namely, 'T', for 'trace'
# will be replaced with 0.05 which is half the minimum quantified value in
# order to preserve the observation. Also, M's will be replaced with 0.0
# except in cases where one station has a numeric measurement and the other
# has none at all.

In [72]:
weather[weather['SnowFall'] == 'T']

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed


In [7]:
ixlist = list(weather[weather['SnowFall'] == 'T'].index.values)

In [8]:
for i in ixlist:
    print(weather.iloc[i])

In [68]:
weather['SnowFall'].value_counts()

M      1472
0.0    1459
  T      12
0.1       1
Name: SnowFall, dtype: int64

In [None]:
weather[weather['Pr']]

In [74]:
weather['PrecipTotal'].value_counts()

0.00    1577
  T      318
0.01     127
0.02      63
0.03      46
0.04      36
0.05      32
0.12      28
0.08      28
0.06      27
0.07      23
0.09      21
0.16      21
0.14      20
0.11      20
0.17      17
0.28      15
0.19      14
0.13      14
0.18      14
0.15      13
0.20      13
0.25      11
0.26      11
0.23      11
0.24      10
0.10      10
0.29       9
0.43       9
0.31       9
        ... 
0.90       1
1.75       1
1.38       1
1.58       1
1.34       1
1.88       1
1.22       1
3.97       1
1.60       1
2.20       1
1.46       1
2.79       1
1.21       1
2.76       1
3.07       1
4.73       1
2.60       1
1.04       1
2.35       1
1.82       1
2.03       1
1.73       1
0.76       1
2.06       1
1.06       1
1.90       1
3.17       1
6.86       1
6.64       1
0.91       1
Name: PrecipTotal, Length: 168, dtype: int64

In [61]:
pd.set_option('display.max_columns', 22)
weather

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,448,1849,,0,M,0.0,0.00,29.10,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,15,51,57,0,3,448,1849,,M,M,M,0.00,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,447,1850,BR,0,M,0.0,0.00,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,-2,42,47,13,0,447,1850,BR HZ,M,M,M,0.00,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,446,1851,,0,M,0.0,0.00,29.39,30.12,11.7,7,11.9
5,2,2007-05-03,67,48,58,4,40,50,7,0,446,1851,HZ,M,M,M,0.00,29.46,30.12,12.9,6,13.2
6,1,2007-05-04,66,49,58,4,41,50,7,0,444,1852,RA,0,M,0.0,T,29.31,30.05,10.4,8,10.8
7,2,2007-05-04,78,51,65,11,42,50,0,0,444,1852,,M,M,M,0.00,29.36,30.04,10.1,7,10.4
8,1,2007-05-05,66,53,60,5,38,49,5,0,443,1853,,0,M,0.0,T,29.40,30.10,11.7,7,12.0
9,2,2007-05-05,66,54,60,5,39,50,5,0,443,1853,,M,M,M,T,29.46,30.09,11.2,7,11.5


In [67]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
Station        2944 non-null int64
Date           2944 non-null object
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null int64
Depart         2944 non-null int64
DewPoint       2944 non-null int64
WetBulb        2944 non-null int64
Heat           2944 non-null int64
Cool           2944 non-null int64
Sunrise        2944 non-null int64
Sunset         2944 non-null int64
CodeSum        2944 non-null object
Depth          2944 non-null int64
Water1         2944 non-null int64
SnowFall       2944 non-null object
PrecipTotal    2944 non-null object
StnPressure    2944 non-null object
SeaLevel       2944 non-null object
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null object
dtypes: float64(1), int64(14), object(7)
memory usage: 506.1+ KB


In [None]:
weather[weather['AvgSpeed'] == 'M']

In [None]:
weather['Tavg'] = int(weather['Tavg'])