In [1]:
import numpy as np
import pandas as pd
import requests
import bs4
import json
import datetime as dt
import time
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.stats import ttest_ind, ttest_rel
import cPickle as pickle
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [108]:
df = pd.read_csv('data/all_alpha_15.txt', sep='\t')

In [3]:
df.columns = ['model', 'displ', 'cyl', 'trans', 'drive', 'fuel', 'cert_region',
       'stnd', 'stnd_description', 'underhood_id', 'veh_class',
       'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg',
       'greenhouse_gas_score', 'smartway', 'comb_co2']

In [4]:
# Removing cars with 0 emissions
df = df[df['fuel'] != 'Electricity']
df = df[df['fuel'] != 'Gasoline/Electricity']
df = df[df['fuel'] != 'Hydrogen']

In [5]:
df.head()

Unnamed: 0,model,displ,cyl,trans,drive,fuel,cert_region,stnd,stnd_description,underhood_id,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway,comb_co2
0,ACURA ILX,2.0,4,SemiAuto-5,2WD,Gasoline,FA,B5,Federal Tier 2 Bin 5,FHNXV02.0JA3,small car,5,24,35,28,7,No,316
1,ACURA ILX,2.0,4,SemiAuto-5,2WD,Gasoline,CA,U2,California LEV-II ULEV,FHNXV02.0JA3,small car,6,24,35,28,7,Yes,316
2,ACURA ILX,2.4,4,Man-6,2WD,Gasoline,FA,B5,Federal Tier 2 Bin 5,FHNXV02.4KA3,small car,5,22,31,25,6,No,350
3,ACURA ILX,2.4,4,Man-6,2WD,Gasoline,CA,U2,California LEV-II ULEV,FHNXV02.4KA3,small car,6,22,31,25,6,No,350
4,ACURA MDX,3.5,6,SemiAuto-6,4WD,Gasoline,FA,B5,Federal Tier 2 Bin 5,FHNXV03.5XA4,small SUV,5,18,27,21,5,No,416


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2641 entries, 0 to 2698
Data columns (total 18 columns):
model                   2641 non-null object
displ                   2641 non-null float64
cyl                     2641 non-null float64
trans                   2641 non-null object
drive                   2641 non-null object
fuel                    2641 non-null object
cert_region             2641 non-null object
stnd                    2641 non-null object
stnd_description        2641 non-null object
underhood_id            2641 non-null object
veh_class               2641 non-null object
air_pollution_score     2641 non-null int64
city_mpg                2333 non-null object
hwy_mpg                 2333 non-null object
cmb_mpg                 2333 non-null object
greenhouse_gas_score    2333 non-null float64
smartway                2641 non-null object
comb_co2                2333 non-null object
dtypes: float64(3), int64(1), object(14)
memory usage: 392.0+ KB


In [7]:
df_eth_dirty = df[df['fuel'] == 'Ethanol/Gas']
df_gas_dirty = df[df['fuel'] == 'Ethanol/Gas']
df_cng_dirty = df[df['fuel'] == 'CNG/Gasoline']
df_gasoline_dirty = df[df['fuel'] == 'CNG/Gasoline']

In [8]:
df_eth_dirty.loc[:, 'fuel'] = 'Ethanol'
df_gas_dirty.loc[:, 'fuel'] = 'eGas'
df_cng_dirty.loc[:, 'fuel'] = 'CNG'
df_gasoline_dirty.loc[:, 'fuel'] = 'Gasoline'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
# Converting NaN's to 0
df_eth_dirty[df_eth_dirty.loc[:, 'city_mpg'].isnull()] = df_eth_dirty[df_eth_dirty.loc[:, 'city_mpg'].isnull()].fillna('0')
df_gas_dirty[df_gas_dirty.loc[:, 'city_mpg'].isnull()] = df_gas_dirty[df_gas_dirty.loc[:, 'city_mpg'].isnull()].fillna('0')
df_eth_dirty[df_eth_dirty.loc[:, 'hwy_mpg'].isnull()] = df_eth_dirty[df_eth_dirty.loc[:, 'hwy_mpg'].isnull()].fillna('0')
df_gas_dirty[df_gas_dirty.loc[:, 'hwy_mpg'].isnull()] = df_gas_dirty[df_gas_dirty.loc[:, 'hwy_mpg'].isnull()].fillna('0')
df_eth_dirty[df_eth_dirty.loc[:, 'cmb_mpg'].isnull()] = df_eth_dirty[df_eth_dirty.loc[:, 'cmb_mpg'].isnull()].fillna('0')
df_gas_dirty[df_gas_dirty.loc[:, 'cmb_mpg'].isnull()] = df_gas_dirty[df_gas_dirty.loc[:, 'cmb_mpg'].isnull()].fillna('0')
df_eth_dirty[df_eth_dirty.loc[:, 'comb_co2'].isnull()] = df_eth_dirty[df_eth_dirty.loc[:, 'comb_co2'].isnull()].fillna('0')
df_gas_dirty[df_gas_dirty.loc[:, 'comb_co2'].isnull()] = df_gas_dirty[df_gas_dirty.loc[:, 'comb_co2'].isnull()].fillna('0')

df_cng_dirty[df_cng_dirty.loc[:, 'city_mpg'].isnull()] = df_cng_dirty[df_cng_dirty.loc[:, 'city_mpg'].isnull()].fillna('0')
df_gasoline_dirty[df_gasoline_dirty.loc[:, 'city_mpg'].isnull()] = df_gasoline_dirty[df_gasoline_dirty.loc[:, 'city_mpg'].isnull()].fillna('0')
df_cng_dirty[df_cng_dirty.loc[:, 'hwy_mpg'].isnull()] = df_cng_dirty[df_cng_dirty.loc[:, 'hwy_mpg'].isnull()].fillna('0')
df_gasoline_dirty[df_gasoline_dirty.loc[:, 'hwy_mpg'].isnull()] = df_gasoline_dirty[df_gasoline_dirty.loc[:, 'hwy_mpg'].isnull()].fillna('0')
df_cng_dirty[df_cng_dirty.loc[:, 'cmb_mpg'].isnull()] = df_cng_dirty[df_cng_dirty.loc[:, 'cmb_mpg'].isnull()].fillna('0')
df_gasoline_dirty[df_gasoline_dirty.loc[:, 'cmb_mpg'].isnull()] = df_gasoline_dirty[df_gasoline_dirty.loc[:, 'cmb_mpg'].isnull()].fillna('0')
df_cng_dirty[df_cng_dirty.loc[:, 'comb_co2'].isnull()] = df_cng_dirty[df_cng_dirty.loc[:, 'comb_co2'].isnull()].fillna('0')
df_gasoline_dirty[df_gasoline_dirty.loc[:, 'comb_co2'].isnull()] = df_gasoline_dirty[df_gasoline_dirty.loc[:, 'comb_co2'].isnull()].fillna('0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http:/

In [10]:
# Seperate Ethanol and gas values
df_eth_dirty.loc[:, 'city_mpg'] = df_eth_dirty.loc[:, 'city_mpg'].str.split('/')
df_eth_dirty.loc[:, 'hwy_mpg'] = df_eth_dirty.loc[:, 'hwy_mpg'].str.split('/')
df_eth_dirty.loc[:, 'cmb_mpg'] = df_eth_dirty.loc[:, 'cmb_mpg'].str.split('/')
df_eth_dirty.loc[:, 'comb_co2'] = df_eth_dirty.loc[:, 'comb_co2'].str.split('/')
df_gas_dirty.loc[:, 'city_mpg'] = df_gas_dirty.loc[:, 'city_mpg'].str.split('/')
df_gas_dirty.loc[:, 'hwy_mpg'] = df_gas_dirty.loc[:, 'hwy_mpg'].str.split('/')
df_gas_dirty.loc[:, 'cmb_mpg'] = df_gas_dirty.loc[:, 'cmb_mpg'].str.split('/')
df_gas_dirty.loc[:, 'comb_co2'] = df_gas_dirty.loc[:, 'comb_co2'].str.split('/')

# Seperate CNG and Gasoline values
df_cng_dirty.loc[:, 'city_mpg'] = df_cng_dirty.loc[:, 'city_mpg'].str.split('/')
df_cng_dirty.loc[:, 'hwy_mpg'] = df_cng_dirty.loc[:, 'hwy_mpg'].str.split('/')
df_cng_dirty.loc[:, 'cmb_mpg'] = df_cng_dirty.loc[:, 'cmb_mpg'].str.split('/')
df_cng_dirty.loc[:, 'comb_co2'] = df_cng_dirty.loc[:, 'comb_co2'].str.split('/')
df_gasoline_dirty.loc[:, 'city_mpg'] = df_gasoline_dirty.loc[:, 'city_mpg'].str.split('/')
df_gasoline_dirty.loc[:, 'hwy_mpg'] = df_gasoline_dirty.loc[:, 'hwy_mpg'].str.split('/')
df_gasoline_dirty.loc[:, 'cmb_mpg'] = df_gasoline_dirty.loc[:, 'cmb_mpg'].str.split('/')
df_gasoline_dirty.loc[:, 'comb_co2'] = df_gasoline_dirty.loc[:, 'comb_co2'].str.split('/')

In [11]:
# Convert non-helpful values to 0
# Good --------- Changed gas[eth] to gas[gas]
df_eth_dirty.ix[df_eth_dirty['city_mpg'].map(lambda x: len(x) < 2), 'city_mpg'] = df_eth_dirty[df_eth_dirty['city_mpg'].map(lambda x: len(x) < 2)]['city_mpg'].apply(lambda x: [0])
df_eth_dirty.ix[df_eth_dirty['hwy_mpg'].map(lambda x: len(x) < 2), 'hwy_mpg'] = df_eth_dirty[df_eth_dirty['hwy_mpg'].map(lambda x: len(x) < 2)]['hwy_mpg'].apply(lambda x: [0])
df_eth_dirty.ix[df_eth_dirty['cmb_mpg'].map(lambda x: len(x) < 2), 'cmb_mpg'] = df_eth_dirty[df_eth_dirty['cmb_mpg'].map(lambda x: len(x) < 2)]['cmb_mpg'].apply(lambda x: [0])
df_eth_dirty.ix[df_eth_dirty['comb_co2'].map(lambda x: len(x) < 2), 'comb_co2'] = df_eth_dirty[df_eth_dirty['comb_co2'].map(lambda x: len(x) < 2)]['comb_co2'].apply(lambda x: [0])
df_gas_dirty.ix[df_gas_dirty['city_mpg'].map(lambda x: len(x) < 2), 'city_mpg'] = df_gas_dirty[df_gas_dirty['city_mpg'].map(lambda x: len(x) < 2)]['city_mpg'].apply(lambda x: [0])
df_gas_dirty.ix[df_gas_dirty['hwy_mpg'].map(lambda x: len(x) < 2), 'hwy_mpg'] = df_gas_dirty[df_gas_dirty['hwy_mpg'].map(lambda x: len(x) < 2)]['hwy_mpg'].apply(lambda x: [0])
df_gas_dirty.ix[df_gas_dirty['cmb_mpg'].map(lambda x: len(x) < 2), 'cmb_mpg'] = df_gas_dirty[df_gas_dirty['cmb_mpg'].map(lambda x: len(x) < 2)]['cmb_mpg'].apply(lambda x: [0])
df_gas_dirty.ix[df_gas_dirty['comb_co2'].map(lambda x: len(x) < 2), 'comb_co2'] = df_gas_dirty[df_gas_dirty['comb_co2'].map(lambda x: len(x) < 2)]['comb_co2'].apply(lambda x: [0])

df_cng_dirty.ix[df_cng_dirty['city_mpg'].map(lambda x: len(x) < 2), 'city_mpg'] = df_cng_dirty[df_cng_dirty['city_mpg'].map(lambda x: len(x) < 2)]['city_mpg'].apply(lambda x: [0])
df_cng_dirty.ix[df_cng_dirty['hwy_mpg'].map(lambda x: len(x) < 2), 'hwy_mpg'] = df_cng_dirty[df_cng_dirty['hwy_mpg'].map(lambda x: len(x) < 2)]['hwy_mpg'].apply(lambda x: [0])
df_cng_dirty.ix[df_cng_dirty['cmb_mpg'].map(lambda x: len(x) < 2), 'cmb_mpg'] = df_cng_dirty[df_cng_dirty['cmb_mpg'].map(lambda x: len(x) < 2)]['cmb_mpg'].apply(lambda x: [0])
df_cng_dirty.ix[df_cng_dirty['comb_co2'].map(lambda x: len(x) < 2), 'comb_co2'] = df_cng_dirty[df_cng_dirty['comb_co2'].map(lambda x: len(x) < 2)]['comb_co2'].apply(lambda x: [0])
df_gasoline_dirty.ix[df_gasoline_dirty['city_mpg'].map(lambda x: len(x) < 2), 'city_mpg'] = df_gasoline_dirty[df_gasoline_dirty['city_mpg'].map(lambda x: len(x) < 2)]['city_mpg'].apply(lambda x: [0])
df_gasoline_dirty.ix[df_gasoline_dirty['hwy_mpg'].map(lambda x: len(x) < 2), 'hwy_mpg'] = df_gasoline_dirty[df_gasoline_dirty['hwy_mpg'].map(lambda x: len(x) < 2)]['hwy_mpg'].apply(lambda x: [0])
df_gasoline_dirty.ix[df_gasoline_dirty['cmb_mpg'].map(lambda x: len(x) < 2), 'cmb_mpg'] = df_gasoline_dirty[df_gasoline_dirty['cmb_mpg'].map(lambda x: len(x) < 2)]['cmb_mpg'].apply(lambda x: [0])
df_gasoline_dirty.ix[df_gasoline_dirty['comb_co2'].map(lambda x: len(x) < 2), 'comb_co2'] = df_gasoline_dirty[df_gasoline_dirty['comb_co2'].map(lambda x: len(x) < 2)]['comb_co2'].apply(lambda x: [0])

In [12]:
# Grab correct value for ethanol or gas for respective tables
df_eth_dirty.loc[:, 'city_mpg'] = df_eth_dirty.loc[:, 'city_mpg'].apply(lambda x: [x[0]])
df_eth_dirty.loc[:, 'hwy_mpg'] = df_eth_dirty.loc[:, 'hwy_mpg'].apply(lambda x: [x[0]])
df_eth_dirty.loc[:, 'cmb_mpg'] = df_eth_dirty.loc[:, 'cmb_mpg'].apply(lambda x: [x[0]])
df_eth_dirty.loc[:, 'comb_co2'] = df_eth_dirty.loc[:, 'comb_co2'].apply(lambda x: [x[0]])
df_gas_dirty.ix[df_gas_dirty['city_mpg'].map(lambda x: len(x) > 1), 'city_mpg'] = df_gas_dirty.ix[df_gas_dirty['city_mpg'].map(lambda x: len(x) > 1), 'city_mpg'].apply(lambda x: [x[1]])
df_gas_dirty.ix[df_gas_dirty['hwy_mpg'].map(lambda x: len(x) > 1), 'hwy_mpg'] = df_gas_dirty.ix[df_gas_dirty['hwy_mpg'].map(lambda x: len(x) > 1), 'hwy_mpg'].apply(lambda x: [x[1]])
df_gas_dirty.ix[df_gas_dirty['cmb_mpg'].map(lambda x: len(x) > 1), 'cmb_mpg'] = df_gas_dirty.ix[df_gas_dirty['cmb_mpg'].map(lambda x: len(x) > 1), 'cmb_mpg'].apply(lambda x: [x[1]])
df_gas_dirty.ix[df_gas_dirty['comb_co2'].map(lambda x: len(x) > 1), 'comb_co2'] = df_gas_dirty.ix[df_gas_dirty['comb_co2'].map(lambda x: len(x) > 1), 'comb_co2'].apply(lambda x: [x[1]])

# Grab correct value for CNG and Gasoline for respective tables
df_cng_dirty.ix[df_cng_dirty['city_mpg'].map(lambda x: len(x) > 1), 'city_mpg'] = df_gas_dirty.ix[df_gas_dirty['city_mpg'].map(lambda x: len(x) > 1), 'city_mpg'].apply(lambda x: [x[0]])
df_cng_dirty.ix[df_cng_dirty['hwy_mpg'].map(lambda x: len(x) > 1), 'hwy_mpg'] = df_gas_dirty.ix[df_gas_dirty['hwy_mpg'].map(lambda x: len(x) > 1), 'hwy_mpg'].apply(lambda x: [x[0]])
df_cng_dirty.ix[df_cng_dirty['cmb_mpg'].map(lambda x: len(x) > 1), 'cmb_mpg'] = df_gas_dirty.ix[df_gas_dirty['cmb_mpg'].map(lambda x: len(x) > 1), 'cmb_mpg'].apply(lambda x: [x[0]])
df_cng_dirty.ix[df_cng_dirty['comb_co2'].map(lambda x: len(x) > 1), 'comb_co2'] = df_gas_dirty.ix[df_gas_dirty['comb_co2'].map(lambda x: len(x) > 1), 'comb_co2'].apply(lambda x: [x[0]])
df_gasoline_dirty.ix[df_gasoline_dirty['city_mpg'].map(lambda x: len(x) > 1), 'city_mpg'] = df_gasoline_dirty.ix[df_gasoline_dirty['city_mpg'].map(lambda x: len(x) > 1), 'city_mpg'].apply(lambda x: [x[1]])
df_gasoline_dirty.ix[df_gasoline_dirty['hwy_mpg'].map(lambda x: len(x) > 1), 'hwy_mpg'] = df_gasoline_dirty.ix[df_gasoline_dirty['hwy_mpg'].map(lambda x: len(x) > 1), 'hwy_mpg'].apply(lambda x: [x[1]])
df_gasoline_dirty.ix[df_gasoline_dirty['cmb_mpg'].map(lambda x: len(x) > 1), 'cmb_mpg'] = df_gasoline_dirty.ix[df_gasoline_dirty['cmb_mpg'].map(lambda x: len(x) > 1), 'cmb_mpg'].apply(lambda x: [x[1]])
df_gasoline_dirty.ix[df_gasoline_dirty['comb_co2'].map(lambda x: len(x) > 1), 'comb_co2'] = df_gasoline_dirty.ix[df_gasoline_dirty['comb_co2'].map(lambda x: len(x) > 1), 'comb_co2'].apply(lambda x: [x[1]])

In [13]:
# Making ints from lists
df_eth_dirty.loc[:, 'city_mpg'] = df_eth_dirty.loc[:, 'city_mpg'].apply(lambda x: x[0])
df_eth_dirty.loc[:, 'hwy_mpg'] = df_eth_dirty.loc[:, 'hwy_mpg'].apply(lambda x: x[0])
df_eth_dirty.loc[:, 'cmb_mpg'] = df_eth_dirty.loc[:, 'cmb_mpg'].apply(lambda x: x[0])
df_eth_dirty.loc[:, 'comb_co2'] = df_eth_dirty.loc[:, 'comb_co2'].apply(lambda x: x[0])
df_gas_dirty.loc[:, 'city_mpg'] = df_gas_dirty.loc[:, 'city_mpg'].apply(lambda x: x[0])
df_gas_dirty.loc[:, 'hwy_mpg'] = df_gas_dirty.loc[:, 'hwy_mpg'].apply(lambda x: x[0])
df_gas_dirty.loc[:, 'cmb_mpg'] = df_gas_dirty.loc[:, 'cmb_mpg'].apply(lambda x: x[0])
df_gas_dirty.loc[:, 'comb_co2'] = df_gas_dirty.loc[:, 'comb_co2'].apply(lambda x: x[0])

df_cng_dirty.loc[:, 'city_mpg'] = df_cng_dirty.loc[:, 'city_mpg'].apply(lambda x: x[0])
df_cng_dirty.loc[:, 'hwy_mpg'] = df_cng_dirty.loc[:, 'hwy_mpg'].apply(lambda x: x[0])
df_cng_dirty.loc[:, 'cmb_mpg'] = df_cng_dirty.loc[:, 'cmb_mpg'].apply(lambda x: x[0])
df_cng_dirty.loc[:, 'comb_co2'] = df_cng_dirty.loc[:, 'comb_co2'].apply(lambda x: x[0])
df_gasoline_dirty.loc[:, 'city_mpg'] = df_gasoline_dirty.loc[:, 'city_mpg'].apply(lambda x: x[0])
df_gasoline_dirty.loc[:, 'hwy_mpg'] = df_gasoline_dirty.loc[:, 'hwy_mpg'].apply(lambda x: x[0])
df_gasoline_dirty.loc[:, 'cmb_mpg'] = df_gasoline_dirty.loc[:, 'cmb_mpg'].apply(lambda x: x[0])
df_gasoline_dirty.loc[:, 'comb_co2'] = df_gasoline_dirty.loc[:, 'comb_co2'].apply(lambda x: x[0])

In [14]:
df_eth_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']] = df_eth_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']].astype(int64)
df_gas_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']] = df_gas_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']].astype(int64)
df_cng_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']] = df_cng_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']].astype(int64)
df_gasoline_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']] = df_gasoline_dirty[['city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']].astype(int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [15]:
# Combine all df, drop fuel type, and sort newly formed df
df_dirty = pd.concat([df, df_eth_dirty, df_gas_dirty, df_cng_dirty, df_gasoline_dirty])
df_clean_temp = df_dirty[df_dirty['fuel'] != 'Ethanol/Gas']
df_clean = df_clean_temp[df_clean_temp['fuel'] != 'CNG/Gasoline']
df_clean.sort(['model', 'displ', 'cyl', 'trans', 'drive', 'fuel'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  inplace=inplace, kind=kind, na_position=na_position)


In [16]:
df_clean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [17]:
# Setting new index
df_clean.index = range(2561)

# Converting to categorical variables

In [18]:
# Convert 'smartway' feature to int categorical
smartway_conv = {'No': 0, 'Yes': 1, 'Elite': 2}
df_clean.loc[:, 'smartway'] = df_clean.loc[:, 'smartway'].map(smartway_conv)

In [19]:
# Convert 'trans' feature to int categorical
# Automatic: 0, Manual: 1, CVT: 2
trans_conv = {'AMS-6': 0
              , 'AMS-7': 0
              , 'AMS-8': 0
              , 'Auto-4': 0
              , 'Auto-5': 0
              , 'Auto-6': 0
              , 'Auto-7': 0
              , 'Auto-8': 0
              , 'Auto-9': 0
              , 'AutoMan-6': 0
              , 'AutoMan-7': 0
              , 'AutoMan-8': 0
              , 'CVT': 2
              , 'Man-5': 1
              , 'Man-6': 1
              , 'Man-7': 1
              , 'SCV-6': 0
              , 'SCV-7': 0
              , 'SCV-8': 0
              , 'SemiAuto-5': 0
              , 'SemiAuto-6': 0
              , 'SemiAuto-7': 0
              , 'SemiAuto-8': 0
              , 'SemiAuto-9': 0}
trans_speed_conv = {'AMS-6': 6
                  , 'AMS-7': 7
                  , 'AMS-8': 8
                  , 'Auto-4': 4
                  , 'Auto-5': 5
                  , 'Auto-6': 6
                  , 'Auto-7': 7
                  , 'Auto-8': 8
                  , 'Auto-9': 9
                  , 'AutoMan-6': 6
                  , 'AutoMan-7': 7
                  , 'AutoMan-8': 8
                  , 'CVT': 0
                  , 'Man-5': 5
                  , 'Man-6': 6
                  , 'Man-7': 7
                  , 'SCV-6': 6
                  , 'SCV-7': 7
                  , 'SCV-8': 8
                  , 'SemiAuto-5': 5
                  , 'SemiAuto-6': 6
                  , 'SemiAuto-7': 7
                  , 'SemiAuto-8': 8
                  , 'SemiAuto-9': 9}
df_clean.loc[:, 'trans_speed'] = df_clean.loc[:, 'trans'].map(trans_speed_conv)
df_clean.loc[:, 'trans'] = df_clean.loc[:, 'trans'].map(trans_conv)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [20]:
# Convert 'drive' feature to int categorical
drive_conv = {'2WD': 0, '4WD': 1}
df_clean.loc[:, 'drive'] = df_clean.loc[:, 'drive'].map(drive_conv)

In [21]:
# Convert 'fuel' feature to int categorical
fuel_conv = {'Gasoline': 0, 'eGas': 0, 'Diesel': 1, 'Ethanol': 2, 'CNG': 3}
df_clean.loc[:, 'fuel'] = df_clean.loc[:, 'fuel'].map(fuel_conv)

In [22]:
# Convert 'cert_region' feature to int categorical
cert_region_conv = {'FA': 0, 'CA': 1}
df_clean.loc[:, 'cert_region'] = df_clean.loc[:, 'cert_region'].map(cert_region_conv)

In [23]:
# Convert 'stnd' feature to int categorical
stnd_conv = {'B2': 0
             , 'B3': 1
             , 'B4': 2
             , 'B5': 3
             , 'B6': 4
             , 'B8': 5
             , 'L2': 6
             , 'L2ULEV125': 7
             , 'L3LEV160': 8
             , 'L3SULEV30': 9
             , 'L3SULEV30/PZEV': 10
             , 'L3ULEV125': 11
             , 'L3ULEV70': 12
             , 'PZEV': 13
             , 'S2': 14
             , 'T3B110': 15
             , 'T3B125': 16
             , 'T3B30': 17
             , 'T3B70': 18
             , 'T3B85': 19
             , 'U2': 20}
df_clean.loc[:, 'stnd'] = df_clean.loc[:, 'stnd'].map(stnd_conv)

In [24]:
# Convert 'veh_class' feature to int categorical
veh_class_conv = {'small car': 0
           , 'small SUV': 1
           , 'midsize car': 2
           , 'large car': 3
           , 'standard SUV': 4
           , 'station wagon': 5
           , 'special purpose': 6
           , 'pickup': 7
           , 'van': 8
           , 'minivan': 9}
df_clean.loc[:, 'veh_class'] = df_clean.loc[:, 'veh_class'].map(veh_class_conv)

In [25]:
df_clean.dropna(inplace=True)
# Setting new index
df_clean.index = range(2124)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [26]:
# Converting to correct types
df_clean[['greenhouse_gas_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']] = df_clean[['greenhouse_gas_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg', 'comb_co2']].astype(float64)
df_clean[['trans', 'trans_speed', 'drive', 'fuel', 'cert_region', 'stnd', 'veh_class']] = df_clean[['trans', 'trans_speed', 'drive', 'fuel', 'cert_region', 'stnd', 'veh_class']].astype(int64)

### Merge in auto scraped MotorTrend features

In [27]:
# Reading in json to check NaN reasoning
with open('data/motortrend_specs_2015.json', 'r') as fp:
    s_temp = json.load(fp)

# Creating df of new features
user_ids = []
frames = []

for user_id, d in s_temp.iteritems():
    user_ids.append(user_id)
    frames.append(pd.DataFrame.from_dict(d, orient='index'))

s_temp = pd.concat(frames, keys=user_ids)
s_temp['model'] = zip(s_temp.index.get_level_values(0), s_temp.index.get_level_values(1))
s_temp['model'] = s_temp['model'].apply(lambda x: x[0] + ' ' + x[1])
s_temp = s_temp.reset_index(level=1, drop=True)

# Left_Outer join of df_clean and new features
df_combo = df_clean.merge(s_temp, how='left', left_on='model', right_on='model')

In [28]:
# Set correct values
df_combo['weight'] = df_combo[df_combo['weight'].notnull()]['weight'].apply(lambda x: x[:-5])
df_combo[df_combo['weight'] == ''] = np.nan
df_combo['msrp'] = df_combo[df_combo['msrp'].notnull()]['msrp'].apply(lambda x: int(x.replace(',', '')))

# Convert 'fuel_type' feature to int categorical
fuel_type_conv = {'Unleaded Regular': 0
           , 'Unleaded Midgrade': 1
           , 'Unleaded Premium': 2
           , 'Diesel': 3}
df_combo.loc[:, 'fuel_type'] = df_combo.loc[:, 'fuel_type'].map(fuel_type_conv)

# # Set NaNs to median/most common
# d_w = df_combo['weight'].median()
# d_t = df_combo['torque'].median()
# d_tr = df_combo['torque_rpm'].median()
# d_h = df_combo['horsepower'].median()
# d_m = df_combo['msrp'].median()
# d_f = df_combo['fuel_type'].value_counts().idxmax()

df_combo.ix[1090:1091] = \
   df_clean.ix[1090:1091]
    
# d = {'weight': d_w, 'torque': d_t, 'torque_rpm': d_tr, 'horsepower': d_h, 'msrp': d_m, 'fuel_type': d_f}
# df_combo.loc[:, ['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp', 'fuel_type']] = \
#     df_combo.loc[:, ['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp', 'fuel_type']].fillna(d)

# # Converting to correct types
# df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp', 'fuel_type']] = \
#     df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp', 'fuel_type']].astype(int64)



# Optional minimal 'unique' list

In [29]:
df_min = df_combo.drop_duplicates(subset=['model', 'displ', 'cyl', 'trans', 'drive', 'fuel'],keep='first')

TypeError: drop_duplicates() got an unexpected keyword argument 'keep'

# Fixes issues with missing values that should be filled

In [None]:
# Open link dict locally
with open('data/motortrend_links.json', 'r') as fp3:
    linked_dict = json.load(fp3)

# Open link json locally
with open('data/motortrend_specs_2015_leftovers_v2.json', 'r') as fp4:
    whole_fill = json.load(fp4)
    
    

for key2 in linked_dict.keys():
    new_link2 = linked_dict[key2]

    if new_link2[-1] == '/':
        link_complete2 = new_link2 + '2015/specifications/'
    else:
        link_complete2 = new_link2
    

    if whole_fill[key2][link_complete2] == 'Error':
            df_combo.loc[df_combo['model'] == key2, 'msrp'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'fuel_type'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'weight'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'torque'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'torque_rpm'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'horsepower'] = np.nan
    else:
        soup2 = bs4.BeautifulSoup(whole_fill[key2][link_complete2], 'html.parser')

        lines_price = soup2.find_all('span')
        for line in lines_price:
            if line.get('itemprop') != None:
                if line.get('itemprop') == 'price':
                    df_combo.loc[df_combo['model'] == key2, 'msrp'] = str(line.string)
                if line.get('itemprop') == 'fuelType':
                    df_combo.loc[df_combo['model'] == key2, 'fuel_type'] = str(line.string)
        
        lines_weight = soup2.find_all('div', attrs={'class': 'key'})
        for line in lines_weight:
            if line.string == 'Curb Weight':
                df_combo.loc[df_combo['model'] == key2, 'weight'] = str(line.next.next.string)
            if line.string == 'Torque':
                df_combo.loc[df_combo['model'] == key2, 'torque'] =  str(line.next.next.string)
            if line.string == 'Torque (rpm)':
                df_combo.loc[df_combo['model'] == key2, 'torque_rpm'] =  str(line.next.next.string)
            if line.string == 'Horsepower':
                if '@' not in line.next.next.string:
                    df_combo.loc[df_combo['model'] == key2, 'horsepower'] = str(line.next.next.string)

In [None]:
# Setting correct Lamborghini Huracan weight
df_combo.loc[1090:1091, 'weight'] = '3135'

# Correcting cleaning weight's strings
df_combo.loc[:, 'weight'] = df_combo.loc[:, 'weight'].apply(lambda x: str(x)[:4])

# Dropping fuel_type
df_combo.drop('fuel_type', axis=1, inplace=True)

# Dropping NaNs
df_combo.dropna(inplace=True)

# Cleaning artifacts from msrp's price
df_combo['msrp'] = df_combo[df_combo['msrp'].notnull()]['msrp'].apply(lambda x: str(x).replace(',', ''))
df_combo['msrp'] = df_combo[df_combo['msrp'].notnull()]['msrp'].apply(lambda x: int(str(x).replace('.0', '')))

# Convert new values to floats
df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp']] = \
    df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp']].astype(float64)

In [None]:
# Save filled df to local pickle
with open('data/df_combo_manfill_final.pkl', 'w') as fp5:
    pickle.dump(df_combo, fp5)

# Fill in auto scrape NaNs with manual scrape df

In [None]:
# #Uncomment if saving new dataframe

# # Open link dict locally
# with open('data/motortrend_links.json', 'r') as fp3:
#     linked_dict = json.load(fp3)

# # Open link json locally
# with open('data/motortrend_specs_2015_leftovers_v2.json', 'r') as fp4:
#     whole_fill = json.load(fp4)
    
    

# for key2 in linked_dict.keys():
#     new_link2 = linked_dict[key2]

#     if new_link2[-1] == '/':
#         link_complete2 = new_link2 + '2015/specifications/'
#     else:
#         link_complete2 = new_link2
    

#     if whole_fill[key2][link_complete2] == 'Error':
#             df_combo.loc[df_combo['model'] == key2, 'msrp'] = np.nan
#             df_combo.loc[df_combo['model'] == key2, 'fuel_type'] = np.nan
#             df_combo.loc[df_combo['model'] == key2, 'weight'] = np.nan
#             df_combo.loc[df_combo['model'] == key2, 'torque'] = np.nan
#             df_combo.loc[df_combo['model'] == key2, 'torque_rpm'] = np.nan
#             df_combo.loc[df_combo['model'] == key2, 'horsepower'] = np.nan
#     else:
#         soup2 = bs4.BeautifulSoup(whole_fill[key2][link_complete2], 'html.parser')
#         time.sleep(2)

#         lines_price = soup2.find_all('span')
#         for line in lines_price:
#             if line.get('itemprop') != None:
#                 if line.get('itemprop') == 'price':
#                     df_combo.loc[df_combo['model'] == key2, 'msrp'] = line.string
#                 if line.get('itemprop') == 'fuelType':
#                     df_combo.loc[df_combo['model'] == key2, 'fuel_type'] = line.string
        
#         lines_weight = soup2.find_all('div', attrs={'class': 'key'})
#         for line in lines_weight:
#             if line.string == 'Curb Weight':
#                 df_combo.loc[df_combo['model'] == key2, 'weight'] = line.next.next.string
#             if line.string == 'Torque':
#                 df_combo.loc[df_combo['model'] == key2, 'torque'] =  line.next.next.string
#             if line.string == 'Torque (rpm)':
#                 df_combo.loc[df_combo['model'] == key2, 'torque_rpm'] =  line.next.next.string
#             if line.string == 'Horsepower':
#                 if '@' not in line.next.next.string:
#                     df_combo.loc[df_combo['model'] == key2, 'horsepower'] = line.next.next.string

In [None]:
# # Uncomment if saving new dataframe

# # Set correct values
# df_combo['weight'] = df_combo[df_combo['weight'].notnull()]['weight'].apply(lambda x: x[:-5])
# df_combo[df_combo['weight'] == ''] = np.nan
# df_combo['msrp'] = df_combo[df_combo['msrp'].notnull()]['msrp'].apply(lambda x: int(x.replace(',', '')))

# # Convert 'fuel_type' feature to int categorical
# fuel_type_conv = {'Unleaded Regular': 0
#            , 'Unleaded Midgrade': 1
#            , 'Unleaded Premium': 2
#            , 'Diesel': 3}
# df_combo.loc[:, 'fuel_type'] = df_combo.loc[:, 'fuel_type'].map(fuel_type_conv)

# # Converting to correct types
# df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp', 'fuel_type']] = \
#     df_combo[['weight'

In [None]:
# # Uncomment if saving new dataframe

# # Save filled df to local pickle
# with open('data/df_combo_manfill_manfill2.pkl', 'w') as fp5:
#     pickle.dump(df_combo, fp5)

## Optional Volkswagen removal

In [None]:
# Open combo manfill file
with open('data/df_combo_manfill_manfill2.pkl', 'r') as fp6:
    df_combo_loaded = pickle.load(fp6)

# Drop NaNs
df_combo_slim = df_combo_loaded.dropna()

# Select only affected cars
df_volk = df_combo_slim[(df_combo_slim['model'] == 'VOLKSWAGEN Jetta') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Beetle') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Beetle Convertible') | \
    (df_combo_slim['model'] == 'AUDI A3') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Golf') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Golf SportWagen') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Passat') | \
    (df_combo_slim['model'] == 'PORSCHE Cayenne')]

# Select only diesel
df_volk = df_volk[df_volk['fuel'] == 1]
df_volk_train = df_combo_slim.drop(df_volk.index)

# # Copy for random test
# df_combo_slim_copy = df_combo_slim.copy()

# Creating y variables and dropping them from test set
y_volk_air = df_volk['air_pollution_score']
y_volk_greenhouse = df_volk['greenhouse_gas_score']
df_volk.drop(df_volk[['air_pollution_score', 'greenhouse_gas_score']], axis=1, inplace=True)

# Creating y variables and dropping them from training set
y_airpollution = df_volk_train['air_pollution_score']
y_greenhouse = df_volk_train['greenhouse_gas_score']
df_volk_train.drop(df_volk_train[['air_pollution_score', 'greenhouse_gas_score']], axis=1, inplace=True)

# Selecting columns for test set
df_volk_select = df_volk[['displ'
                      , 'cyl'
                      , 'trans'
                      , 'drive'
                      , 'fuel'
                      , 'veh_class'
                      , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

# Selecting columns for training set
df_select = df_volk_train[['displ'
                      , 'cyl'
                      , 'trans'
                      , 'drive'
                      , 'fuel'
                      , 'veh_class'
                      , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

In [None]:
# Random Forest Classifier
rfc_volk = RandomForestClassifier()
rfc_volk2 = RandomForestClassifier()
rfc_volk_air = rfc_volk.fit(df_select, y_airpollution)
rfc_volk_green = rfc_volk2.fit(df_select, y_greenhouse)

air_pred = rfc_volk_air.predict(df_volk_select)
print 'Volkswagon air prediction:', air_pred
print 'Volkswagon air actual:', y_volk_air.values

green_pred = rfc_volk_green.predict(df_volk_select)
print 'Volkswagon green prediction:', green_pred
print 'Volkswagon green actual:', y_volk_greenhouse.values

print 'Difference in air score:',sum(air_pred-y_volk_air.values)
print 'Difference in green score:',sum(green_pred-y_volk_greenhouse.values)

# Save model in pickle file
with open('data/model_volk_air15.pkl', 'w') as f:
        pickle.dump(rfc_volk, f)
with open('data/model_volk_green15.pkl', 'w') as f:
        pickle.dump(rfc_volk, f)


# print 'Air Pollution Score:', rfc_volk_air.score(X_airpollution_test, y_airpollution_test)
# print 'Air importances:', rfc_air.feature_importances_
# air_pred = rfc_air.predict(X_airpollution_test)


# print 'Air Pollution Precision:', precision_score(y_airpollution_test, air_pred)
# print 'Air Pollution Recall:', recall_score(y_airpollution_test, air_pred)
# print 'Air Pollution f1(micro):', f1_score(y_airpollution_test, air_pred, average='micro')
# print 'Air Pollution f1(macro):', f1_score(y_airpollution_test, air_pred, average='macro')

# print 'Greenhouse Score:', rfc_green.score(X_greenhouse_test, y_greenhouse_test)
# #print 'Green importances:', rfc_green.feature_importances_
# green_pred = rfc_green.predict(X_greenhouse_test)
# print 'Greenhouse Precision:', precision_score(y_greenhouse_test, green_pred)
# print 'Greenhouse Recall:', recall_score(y_greenhouse_test, green_pred)
# print 'Greenhouse f1(micro):', f1_score(y_greenhouse_test, green_pred, average='micro')
# print 'Greenhouse f1(macro):', f1_score(y_greenhouse_test, green_pred, average='macro')

In [None]:
# GradientBoostingClassifier
gradc_volk = GradientBoostingClassifier(min_samples_leaf= 3, n_estimators= 1300,\
                                   min_samples_split= 1, random_state= 1,\
                                   max_features= 'sqrt', max_depth= 3)
gradc2_volk = GradientBoostingClassifier()
gradc_volk_air = gradc_volk.fit(df_select, y_airpollution)
gradc_volk_green = gradc2_volk.fit(df_select, y_greenhouse)

air_pred = gradc_volk_air.predict(df_volk_select)
print 'Volkswagon air prediction:', air_pred
print 'Volkswagon air actual:', y_volk_air.values

green_pred = gradc_volk_green.predict(df_volk_select)
print 'Volkswagon green prediction:', green_pred
print 'Volkswagon green actual:', y_volk_greenhouse.values

print 'Difference in air score:',sum(air_pred-y_volk_air.values)
print 'Difference in green score:',sum(green_pred-y_volk_greenhouse.values)

In [None]:
print 'Air t-stat, p-value:', ttest_rel(air_pred, y_volk_air.values)
print 'Green t-stat, p-value:', ttest_rel(green_pred, y_volk_greenhouse.values)

In [None]:
# Random selection comparison
# Creating y variables and dropping them from test set
y_rand_air = df_combo_slim['air_pollution_score']
y_rand_green = df_combo_slim['greenhouse_gas_score']
df_rand = df_combo_slim.drop(df_combo_slim[['air_pollution_score', 'greenhouse_gas_score']], axis=1)

# Selecting columns for model
df_rand_select = df_rand[['displ'
                              , 'cyl'
                              , 'trans'
                              , 'drive'
                              , 'fuel'
                              , 'veh_class'
                              , 'cert_region'
                              , 'trans_speed'
                              , 'weight'
                              , 'torque'
                              , 'torque_rpm'
                              , 'horsepower'
                              , 'msrp'
                              , 'fuel_type']]

# Test/Train split
X_randair_train, X_randair_test, y_randair_train, y_randair_test = train_test_split(df_rand_select, y_rand_air, random_state=42)
X_randgreen_train, X_randgreen_test, y_randgreen_train, y_randgreen_test = train_test_split(df_rand_select, y_rand_green, random_state=42)

# Random Forest Classifier
rfc_rand = RandomForestClassifier()
rfc_rand2 = RandomForestClassifier()
rfc_rand_air = rfc_rand.fit(X_randair_train, y_randair_train)
rfc_rand_green = rfc_rand2.fit(X_randgreen_train, y_randgreen_train)

air_pred = rfc_rand_air.predict(X_randair_test)
green_pred = rfc_rand_green.predict(X_randgreen_test)

print 'Difference in air score:',sum(air_pred-y_randair_test)
print 'Air t-stat, p-value:', ttest_ind(air_pred, y_randair_test, equal_var = False)
print 'Difference in green score:',sum(green_pred-y_randgreen_test)
print 'Green t-stat, p-value:', ttest_ind(air_pred, y_randair_test, equal_var = False)

## Testing if gas can predict diesel

In [None]:
# Open combo manfill file
with open('data/df_combo_manfill_manfill2.pkl', 'r') as fp6:
    df_combo_loaded = pickle.load(fp6)

# Drop NaNs
df_combo_slim = df_combo_loaded.dropna()

# Select only affected cars
# BMW Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'BMW 328d') | \
#     (df_combo_slim['model'] == 'BMW 535d') | \
#     (df_combo_slim['model'] == 'BMW 740Ld') | \
#     (df_combo_slim['model'] == 'BMW X3 xDrive28d') | \
#     (df_combo_slim['model'] == 'BMW 328d Sports Wagon')]

# VOLK Selection
df_volk = df_combo_slim[(df_combo_slim['model'] == 'VOLKSWAGEN Jetta') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Beetle') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Beetle Convertible') | \
    (df_combo_slim['model'] == 'AUDI A3') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Golf') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Golf SportWagen') | \
    (df_combo_slim['model'] == 'VOLKSWAGEN Passat') | \
    (df_combo_slim['model'] == 'PORSCHE Cayenne')]

# Select only diesel
df_diesel = df_volk[df_volk['fuel'] == 1]
df_diesel_train = df_combo_slim.drop(df_diesel.index)
df_diesel_train = df_diesel_train[df_diesel_train['fuel']==1]

# # Copy for random test
# df_combo_slim_copy = df_combo_slim.copy()

# Creating y variables and dropping them from test set
y_diesel_air = df_diesel['air_pollution_score']
y_diesel_greenhouse = df_diesel['greenhouse_gas_score']
df_diesel.drop(df_diesel[['air_pollution_score', 'greenhouse_gas_score']], axis=1, inplace=True)

# Creating y variables and dropping them from training set
y_airpollution = df_diesel_train['air_pollution_score']
y_greenhouse = df_diesel_train['greenhouse_gas_score']
df_diesel_train.drop(df_diesel_train[['air_pollution_score', 'greenhouse_gas_score']], axis=1, inplace=True)

# Selecting columns for test set
df_diesel_select = df_diesel[['displ'
                      , 'cyl'
                      , 'trans'
                      , 'drive'
                      , 'veh_class'
                      , 'fuel'
                      , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

# Selecting columns for training set
df_select = df_diesel_train[['displ'
                      , 'cyl'
                      , 'trans'
                      , 'drive'
                      , 'veh_class'
                      , 'fuel'
                      , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

In [None]:
# GradientBoostingClassifier
# Tuning parameters for full model
# min_samples_leaf= 3, n_estimators= 1300,\
#                                    min_samples_split= 1, random_state= 1,\
#                                    max_features= 'sqrt', max_depth= 3

gradc_diesel = GradientBoostingClassifier()
gradc2_diesel = GradientBoostingClassifier()
gradc_diesel_air = gradc_diesel.fit(df_select, y_airpollution)
gradc_diesel_green = gradc2_diesel.fit(df_select, y_greenhouse)

air_pred = gradc_diesel_air.predict(df_diesel_select)
print 'Volkswagon air prediction:', air_pred
print 'Volkswagon air actual:', y_diesel_air.values

green_pred = gradc_diesel_green.predict(df_diesel_select)
print 'Volkswagon green prediction:', green_pred
print 'Volkswagon green actual:', y_diesel_greenhouse.values

print 'Difference in air score:',sum(air_pred-y_diesel_air.values)
print 'Difference in green score:',sum(green_pred-y_diesel_greenhouse.values)

In [None]:
print 'Air t-stat, p-value:', ttest_ind(air_pred, y_diesel_air.values, equal_var = False)
print 'Green t-stat, p-value:', ttest_ind(green_pred, y_diesel_greenhouse.values, equal_var = False)

Findings:
Gasoline cars cannot predict diesel cars.
Diesel in can predict BMW diesels well.
Diesel only model can accurately predict Volks cars.
Based on the histograms below, I cannot accurately predict diesel cars. The data does not have enough bins/diesel power to do so.

### Looking at variance of diesel scores and gasoline scores

In [None]:
# Open combo manfill file
with open('data/df_combo_manfill_manfill2.pkl', 'r') as fp6:
    df_combo_loaded = pickle.load(fp6)

# Drop NaNs
df_compare = df_combo_loaded.dropna()

df_compare_g = df_compare[df_compare['fuel']==0]
df_compare_d = df_compare[df_compare['fuel']==1]
df_compare_g.index = range(1054)
df_compare_d.index = range(40)


f, ax = plt.subplots(figsize=(10,5))
plt.subplot(1, 2, 1)
sns.distplot(df_compare_g['air_pollution_score'])
plt.subplot(1, 2, 2)
sns.distplot(df_compare_d['air_pollution_score'])
pass

# Model for presentation on Volk gasoline cars

In [3]:
# Open combo manfill file
with open('data/df_combo_manfill_final.pkl', 'r') as fp3:
    df_combo_loaded3 = pickle.load(fp3)

In [15]:
df_combo_loaded3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2023 entries, 0 to 2123
Data columns (total 24 columns):
model                   2023 non-null object
displ                   2023 non-null float64
cyl                     2023 non-null float64
trans                   2023 non-null float64
drive                   2023 non-null float64
fuel                    2023 non-null float64
cert_region             2023 non-null float64
stnd                    2023 non-null float64
stnd_description        2023 non-null object
underhood_id            2023 non-null object
veh_class               2023 non-null float64
air_pollution_score     2023 non-null float64
city_mpg                2023 non-null float64
hwy_mpg                 2023 non-null float64
cmb_mpg                 2023 non-null float64
greenhouse_gas_score    2023 non-null float64
smartway                2023 non-null float64
comb_co2                2023 non-null float64
trans_speed             2023 non-null float64
weight                

In [None]:
len(df_combo[df_combo['fuel']==1]['model'].unique())

In [8]:
df_combo_slim[df_combo_slim['fuel']==1]['model'].unique()

array(['AUDI A3', 'AUDI A6', 'AUDI A7', 'AUDI A8 L', 'AUDI Q5', 'AUDI Q7',
       'BMW 328d', 'BMW 328d Sports Wagon', 'BMW 535d', 'BMW 740Ld',
       'BMW X3 xDrive28d', 'BMW X5', 'CHEVROLET Cruze',
       'JEEP Grand Cherokee', 'MERCEDES-BENZ E250 Bluetec',
       'MERCEDES-BENZ E250 Bluetec 4Matic',
       'MERCEDES-BENZ GL350 Bluetec 4Matic',
       'MERCEDES-BENZ GLK250 Bluetec 4Matic',
       'MERCEDES-BENZ ML250 Bluetec 4Matic', 'PORSCHE Cayenne', 'RAM 1500',
       'RAM 1500 HFE', 'VOLKSWAGEN Beetle',
       'VOLKSWAGEN Beetle Convertible', 'VOLKSWAGEN Golf',
       'VOLKSWAGEN Golf SportWagen', 'VOLKSWAGEN Jetta',
       'VOLKSWAGEN Passat', 'VOLKSWAGEN Touareg'], dtype=object)

In [76]:
df_combo_slim[df_combo_slim['model'].isin(['VOLKSWAGEN Beetle'])]

Unnamed: 0,model,displ,cyl,trans,drive,fuel,cert_region,stnd,stnd_description,underhood_id,...,cmb_mpg,greenhouse_gas_score,smartway,comb_co2,trans_speed,weight,torque,torque_rpm,horsepower,msrp
1971,VOLKSWAGEN Beetle,1.8,4,1,0,0,0,3,Federal Tier 2 Bin 5,FVGAV02.0VBD,...,27,7,0,326,5,2987,184,1500,170,20195
1972,VOLKSWAGEN Beetle,1.8,4,1,0,0,0,1,Federal Tier 2 Bin 3,FVGAV02.0VPD,...,27,7,1,326,5,2987,184,1500,170,20195
1973,VOLKSWAGEN Beetle,1.8,4,1,0,0,1,13,California PZEV,FVGAV02.0VPD,...,27,7,1,326,5,2987,184,1500,170,20195
1974,VOLKSWAGEN Beetle,1.8,4,0,0,0,0,3,Federal Tier 2 Bin 5,FVGAV02.0VBD,...,28,7,0,318,6,2987,184,1500,170,20195
1975,VOLKSWAGEN Beetle,1.8,4,0,0,0,0,1,Federal Tier 2 Bin 3,FVGAV02.0VPD,...,28,7,1,318,6,2987,184,1500,170,20195
1976,VOLKSWAGEN Beetle,1.8,4,0,0,0,1,13,California PZEV,FVGAV02.0VPD,...,28,7,1,318,6,2987,184,1500,170,20195
1977,VOLKSWAGEN Beetle,2.0,4,0,0,1,0,3,Federal Tier 2 Bin 5,FVGAV02.0VAL,...,34,7,0,295,6,2987,184,1500,170,20195
1978,VOLKSWAGEN Beetle,2.0,4,0,0,1,1,11,California LEV-III ULEV125,FVGAV02.0VAL,...,34,7,1,295,6,2987,184,1500,170,20195
1979,VOLKSWAGEN Beetle,2.0,4,0,0,0,0,3,Federal Tier 2 Bin 5,FVGAV02.0VBD,...,26,6,0,343,6,2987,184,1500,170,20195
1980,VOLKSWAGEN Beetle,2.0,4,0,0,0,0,1,Federal Tier 2 Bin 3,FVGAV02.0VPD,...,26,6,0,343,6,2987,184,1500,170,20195


In [80]:
df_combo_slim.ix[1971:2077]

Unnamed: 0,model,displ,cyl,trans,drive,fuel,cert_region,stnd,stnd_description,underhood_id,...,cmb_mpg,greenhouse_gas_score,smartway,comb_co2,trans_speed,weight,torque,torque_rpm,horsepower,msrp
1971,VOLKSWAGEN Beetle,1.8,4,1,0,0,0,3,Federal Tier 2 Bin 5,FVGAV02.0VBD,...,27,7,0,326,5,2987,184,1500,170,20195
1972,VOLKSWAGEN Beetle,1.8,4,1,0,0,0,1,Federal Tier 2 Bin 3,FVGAV02.0VPD,...,27,7,1,326,5,2987,184,1500,170,20195
1973,VOLKSWAGEN Beetle,1.8,4,1,0,0,1,13,California PZEV,FVGAV02.0VPD,...,27,7,1,326,5,2987,184,1500,170,20195
1974,VOLKSWAGEN Beetle,1.8,4,0,0,0,0,3,Federal Tier 2 Bin 5,FVGAV02.0VBD,...,28,7,0,318,6,2987,184,1500,170,20195
1975,VOLKSWAGEN Beetle,1.8,4,0,0,0,0,1,Federal Tier 2 Bin 3,FVGAV02.0VPD,...,28,7,1,318,6,2987,184,1500,170,20195
1976,VOLKSWAGEN Beetle,1.8,4,0,0,0,1,13,California PZEV,FVGAV02.0VPD,...,28,7,1,318,6,2987,184,1500,170,20195
1977,VOLKSWAGEN Beetle,2.0,4,0,0,1,0,3,Federal Tier 2 Bin 5,FVGAV02.0VAL,...,34,7,0,295,6,2987,184,1500,170,20195
1978,VOLKSWAGEN Beetle,2.0,4,0,0,1,1,11,California LEV-III ULEV125,FVGAV02.0VAL,...,34,7,1,295,6,2987,184,1500,170,20195
1979,VOLKSWAGEN Beetle,2.0,4,0,0,0,0,3,Federal Tier 2 Bin 5,FVGAV02.0VBD,...,26,6,0,343,6,2987,184,1500,170,20195
1980,VOLKSWAGEN Beetle,2.0,4,0,0,0,0,1,Federal Tier 2 Bin 3,FVGAV02.0VPD,...,26,6,0,343,6,2987,184,1500,170,20195


In [81]:
df_combo_slim.ix[1971:2077][df_combo_slim['fuel']==0]['model'].unique()

array(['VOLKSWAGEN Beetle', 'VOLKSWAGEN Beetle Convertible',
       'VOLKSWAGEN CC', 'VOLKSWAGEN CC 4Motion', 'VOLKSWAGEN Eos',
       'VOLKSWAGEN GTI', 'VOLKSWAGEN Golf', 'VOLKSWAGEN Golf R',
       'VOLKSWAGEN Golf SportWagen', 'VOLKSWAGEN Jetta',
       'VOLKSWAGEN Jetta Hybrid', 'VOLKSWAGEN Passat', 'VOLKSWAGEN Tiguan',
       'VOLKSWAGEN Tiguan 4Motion', 'VOLKSWAGEN Touareg',
       'VOLKSWAGEN Touareg Hybrid'], dtype=object)

In [7]:
# Open combo manfill file
with open('data/df_combo_manfill_final.pkl', 'r') as fp6:
    df_combo_loaded = pickle.load(fp6)

# Drop NaNs
df_combo_slim = df_combo_loaded.dropna()


# # Dodge/Jeep Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'DODGE Challenger SRT8') | \
#     (df_combo_slim['model'] == 'DODGE Charger SRT8') | \
#     (df_combo_slim['model'] == 'DODGE Dart Aero') | \
#     (df_combo_slim['model'] == 'DODGE Viper SRT') | \
#     (df_combo_slim['model'] == 'JEEP Cherokee 4x4 Active Drive II') | \
#     (df_combo_slim['model'] == 'JEEP Cherokee Trailhawk') | \
#     (df_combo_slim['model'] == 'JEEP Grand Cherokee SRT8')]


# # Mercedes-Benz diesel Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'MERCEDES-BENZ E250 Bluetec') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E250 Bluetec 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GL350 Bluetec 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLK250 Bluetec 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ ML250 Bluetec 4Matic')]


# # Mercedes-Benz gas Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'MERCEDES-BENZ E350 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E350 4Matic Wagon') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E400 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E400 4Matic Wagon') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E63 AMG 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GL450 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GL550 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLA250 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLA45 AMG 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLK350 4Matic')]


# # BMW Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'BMW 328i') | \
#     (df_combo_slim['model'] == 'BMW 535i') | \
#     (df_combo_slim['model'] == 'BMW 740Li') | \
#     (df_combo_slim['model'] == 'BMW X3 xDrive28i') | \
#     (df_combo_slim['model'] == 'BMW 328i Sports Wagon')]


# # VOLK Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'VOLKSWAGEN Jetta') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Beetle') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Beetle Convertible') | \
#     (df_combo_slim['model'] == 'AUDI A3') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf SportWagen') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Passat') | \
#     (df_combo_slim['model'] == 'PORSCHE Cayenne')]


# # # VOLKgas Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'VOLKSWAGEN Beetle Convertible') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf R') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf SportWagen') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Jetta Hybrid') | \
#     (df_combo_slim['model'] == 'AUDI A3 Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE Cayenne S')]


# # VOLKtot Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'VOLKSWAGEN Beetle Convertible') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN CC 4Motion') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf R') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf SportWagen') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Jetta Hybrid') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Tiguan 4Motion') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Touareg Hybrid') | \
#     (df_combo_slim['model'] == 'AUDI A3 Cabriolet') | \
#     (df_combo_slim['model'] == 'AUDI A5 Cabriolet') | \
#     (df_combo_slim['model'] == 'AUDI A8 L') | \
#     (df_combo_slim['model'] == 'AUDI R8 Spyder') | \
#     (df_combo_slim['model'] == 'AUDI RS5') | \
#     (df_combo_slim['model'] == 'AUDI RS5 Cabriolet') | \
#     (df_combo_slim['model'] == 'AUDI RS7') | \
#     (df_combo_slim['model'] == 'AUDI S5 Cabriolet') | \
#     (df_combo_slim['model'] == 'AUDI TT Coupe') | \
#     (df_combo_slim['model'] == 'AUDI TT Roadster') | \
#     (df_combo_slim['model'] == 'AUDI allroad quattro') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera 4') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera 4 Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera 4 GTS') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera 4 GTS Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera 4S') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera 4S Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera GTS') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera GTS Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera S') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Carrera S Cabriolet') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 GT3') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Targa 4') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Targa 4S') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Turbo') | \
#     (df_combo_slim['model'] == 'PORSCHE 911 Turbo S') | \
#     (df_combo_slim['model'] == 'PORSCHE Boxster GTS') | \
#     (df_combo_slim['model'] == 'PORSCHE Boxster S') | \
#     (df_combo_slim['model'] == 'PORSCHE Cayenne S') | \
#     (df_combo_slim['model'] == 'PORSCHE Cayenne Turbo') | \
#     (df_combo_slim['model'] == 'PORSCHE Cayman GTS') | \
#     (df_combo_slim['model'] == 'PORSCHE Cayman S') | \
#     (df_combo_slim['model'] == 'PORSCHE Macan S') | \
#     (df_combo_slim['model'] == 'PORSCHE Macan Turbo') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera 4') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera 4S') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera 4S Executive') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera GTS') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera S') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera Turbo') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera Turbo Executive') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera Turbo S') | \
#     (df_combo_slim['model'] == 'PORSCHE Panamera Turbo S Executive')]

# Full Gas counterpart list
df_volk = df_combo_slim[df_combo_slim['model'].isin(\
        ['AUDI A3', 'AUDI A3 Cabriolet', \
       'AUDI A6', 'AUDI A7', 'AUDI A8', 'AUDI A8 L', \
       'AUDI Q5', 'AUDI Q5 Hybrid', 'AUDI Q7', 'BMW 328i', \
       'BMW 328i Gran Turismo', 'BMW 328i Sports Wagon', 'BMW 535i', \
       'BMW 535i Gran Turismo', 'BMW 740Li', 'BMW 740i', 'BMW X3 sDrive28i', \
       'BMW X3 xDrive28i', 'BMW X5', 'BMW X5 M', 'CHEVROLET Cruze', \
       'JEEP Grand Cherokee SRT8', 'PORSCHE Cayenne S', 'PORSCHE Cayenne Turbo', \
       'VOLKSWAGEN Beetle', 'VOLKSWAGEN Beetle Convertible', 'VOLKSWAGEN Golf', 'VOLKSWAGEN Golf R', \
       'VOLKSWAGEN Golf SportWagen', 'VOLKSWAGEN Jetta', 'VOLKSWAGEN Jetta Hybrid', \
       'VOLKSWAGEN Passat', 'VOLKSWAGEN Touareg', 'VOLKSWAGEN Touareg Hybrid'])]



# # Full diesel List
# df_volk = df_combo_slim[df_combo_slim['model'].isin(['AUDI A3', 'AUDI A6', 'AUDI A7', \
#         'AUDI A8 L', 'AUDI Q5', 'AUDI Q7', 'BMW 328d', 'BMW 328d Sports Wagon', 'BMW 535d', \
#         'BMW 740Ld', 'BMW X3 xDrive28d', 'BMW X5', 'CHEVROLET Cruze', 'JEEP Grand Cherokee', \
#         'MERCEDES-BENZ E250 Bluetec', 'MERCEDES-BENZ E250 Bluetec 4Matic', 'MERCEDES-BENZ GL350 Bluetec 4Matic', \
#         'MERCEDES-BENZ GLK250 Bluetec 4Matic', 'MERCEDES-BENZ ML250 Bluetec 4Matic', 'PORSCHE Cayenne', \
#         'RAM 1500', 'RAM 1500 HFE', 'VOLKSWAGEN Beetle', 'VOLKSWAGEN Beetle Convertible', 'VOLKSWAGEN Golf', \
#         'VOLKSWAGEN Golf SportWagen', 'VOLKSWAGEN Jetta', 'VOLKSWAGEN Passat', 'VOLKSWAGEN Touareg'])]


# # All diesel cars gas version Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'AUDI A8 L') | \
#     (df_combo_slim['model'] == 'BMW 328i') | \
#     (df_combo_slim['model'] == 'BMW 328i Sports Wagon') | \
#     (df_combo_slim['model'] == 'BMW 535i') | \
#     (df_combo_slim['model'] == 'BMW 740Li') | \
#     (df_combo_slim['model'] == 'BMW X3 xDrive28i') | \
#     (df_combo_slim['model'] == 'JEEP Grand Cherokee SRT8') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Beetle Convertible') | \
#     (df_combo_slim['model'] == 'VOLKSWAGEN Golf SportWagen') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLK350 4Matic')]

# # Mercedes-Benz gas Selection
# df_volk = df_combo_slim[(df_combo_slim['model'] == 'MERCEDES-BENZ E350 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E350 4Matic Wagon') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E400 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E400 4Matic Wagon') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ E63 AMG 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GL450 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GL550 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLA250 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLA45 AMG 4Matic') | \
#     (df_combo_slim['model'] == 'MERCEDES-BENZ GLK350 4Matic')]



# Select only gasoline
df_vgas = df_volk[df_volk['fuel'] == 0]
df_vgas = df_vgas[df_vgas['cert_region'] == 0]
# df_vgas = df_vgas[df_vgas['stnd'] != 1]
# df_vgas = df_vgas[(df_vgas['displ'] == 2.0) | (df_vgas['displ'] == 3.0)]
df_vgas_train = df_combo_slim.drop(df_vgas.index)

# # Copy for random test
# df_combo_slim_copy = df_combo_slim.copy()

# Creating y variables and dropping them from test set
y_vgas_air = df_vgas['air_pollution_score']
y_vgas_greenhouse = df_vgas['greenhouse_gas_score']
df_vgas.drop(df_vgas[['air_pollution_score', 'greenhouse_gas_score']], axis=1, inplace=True)

# Creating y variables and dropping them from training set
y_airpollution = df_vgas_train['air_pollution_score']
y_greenhouse = df_vgas_train['greenhouse_gas_score']
df_vgas_train.drop(df_vgas_train[['air_pollution_score', 'greenhouse_gas_score']], axis=1, inplace=True)

# Selecting columns for test set
df_vgas_select = df_vgas[['displ'
#                       , 'cyl'
#                       , 'trans'
#                       , 'drive'
#                       , 'veh_class'
#                       , 'fuel'
                      , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

# Selecting columns for training set
df_select = df_vgas_train[['displ'
#                       , 'cyl'
#                       , 'trans'
#                       , 'drive'
#                       , 'veh_class'
#                       , 'fuel'
                      , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

In [8]:
# GradientBoostingClassifier
# Tuning parameters for full model
gradc_vgas = GradientBoostingClassifier(min_samples_leaf= 3, n_estimators= 1300,\
                                   min_samples_split= 1, random_state= 1,\
                                   max_features= 'sqrt', max_depth= 3)
gradc2_vgas = GradientBoostingClassifier()
gradc_vgas_air = gradc_vgas.fit(df_select, y_airpollution)
gradc_vgas_green = gradc2_vgas.fit(df_select, y_greenhouse)

air_pred = gradc_vgas_air.predict(df_vgas_select)
air_actual = y_vgas_air.values
print 'Volkswagon air prediction:', air_pred
print 'Volkswagon air actual:', y_vgas_air.values

# green_pred = gradc_vgas_green.predict(df_vgas_select)
# print 'Volkswagon green prediction:', green_pred
# print 'Volkswagon green actual:', y_vgas_greenhouse.values

# print 'Difference in air score:',sum(air_pred-y_vgas_air.values)
# print 'Difference in green score:',sum(green_pred-y_vgas_greenhouse.values)

Volkswagon air prediction: [ 5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  9.  5.  5.  5.  5.  5.  5.  5.]
Volkswagon air actual: [ 9.  9.  9.  9.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  7.  7.  5.  5.  5.  5.  7.  5.  7.  5.  7.  5.  7.  5.  7.  5.
  7.  5.  7.  9.  9.  5.  9.  9.  5.  7.  5.  7.  5.  7.  5.  5.  7.  5.
  7.  5.  7.  5.  7.  5.  5.  5.]


In [9]:
print 'Air t-stat, p-value:', ttest_rel(air_pred, y_vgas_air.values)
# print 'Green t-stat, p-value:', ttest_rel(green_pred, y_vgas_greenhouse.values)

Air t-stat, p-value: (-4.9176898508738587, 4.6706226949063839e-06)


In [111]:
df_combo_slim[df_combo_slim['stnd'] == 1]['stnd']

array([ 1.])

In [None]:
# Second attempt Saved variables from FULL list
audi - (-2.2188007849009166, 0.041310345282488115)
bmw - (nan, nan)
volk - (1.3891177924157585, 0.17333055097468034)

In [None]:
# First Attempt Saved variables
dodge = dodge_air_pred - dodge_air_actual
# (nan, nan) - Same

merceb = merceb_air_pred - merceb_air_actual
# (-0.99999999999999645, 0.33556127786542422)

bmw = bmw_air_pred - bmw_air_actual
# (nan, nan) - Same

volk = volk_air_pred - volk_air_actual
# (-2.7050089040022969, 0.013447420158571064)

volktot = volktot_air_pred - volktot_air_actual
# (-2.1096325392232309, 0.037068101363996284)

In [None]:
plt.figure()
data = [dodge, merceb, bmw, volk, volktot]
bp = plt.boxplot(data)
plt.axhline(y=0, alpha=0.2)
for i in range(len(data)):
    y = data[i]
    x = np.random.normal(1+i, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.2, ms=20)

plt.savefig('boxplot_with_dots.png')
plt.show()

## Optional Volkswagen removal above ^

In [46]:
# Open combo manfill file
with open('data/df_combo_manfill_final.pkl', 'r') as fp6:
    df_combo_loaded = pickle.load(fp6)

# Drop NaNs
df_combo_slim = df_combo_loaded.dropna()


# # Select only Federal/California applications
# df_combo_slim = df_combo_slim[df_combo_slim['cert_region']==1]
# df_combo_slim.drop('cert_region', axis=1, inplace=True)


# Creating y variables and dropping them from feature set
y_airpollution = df_combo_slim['air_pollution_score']
y_greenhouse = df_combo_slim['greenhouse_gas_score']
df_combo_norm = df_combo_slim.drop(df_combo_slim[['air_pollution_score', 'greenhouse_gas_score']], axis=1)

In [47]:
# Selecting columns for model
# df_select = df_clean[['displ'
#                       , 'cyl'
#                       , 'trans'
#                       , 'drive'
#                       , 'fuel'
#                       , 'veh_class'
#                       , 'cert_region'
#                       , 'trans_speed']]

# # With greenhouse mpg factors
# df_select = df_clean[['displ'
#                       , 'cyl'
#                       , 'trans'
#                       , 'drive'
#                       , 'fuel'
#                       , 'veh_class'
#                       , 'cert_region'
#                       , 'city_mpg'
#                       , 'hwy_mpg'
#                       , 'cmb_mpg'
#                       , 'trans_speed']]

# # With new features included
# Selecting columns for model
df_select = df_combo_norm[['displ'
#                       , 'cyl'
#                       , 'trans'
#                       , 'drive'
#                       , 'fuel'
#                       , 'veh_class'
#                       , 'cert_region'
                      , 'trans_speed'
                      , 'weight'
                      , 'torque'
                      , 'torque_rpm'
                      , 'horsepower'
                      , 'msrp'
                      , 'city_mpg'
                      , 'hwy_mpg'
                      , 'cmb_mpg']]

# Modeling

In [None]:
# Base Model variables

# df_select = df_clean[['displ'
#                       , 'cyl'
#                       , 'trans'
#                       , 'drive'
#                       , 'fuel'
#                       , 'veh_class'
#                       , 'cert_region'
#                       , 'trans_speed']]
# split random_state=42
# Linear Regression
# - Air Pollution Score: 0.296211733074
# - Greenhouse Score: 0.732689269867
# Random Forest Regressor
# - Air Pollution Score: 0.311601562107
# - Greenhouse Score: 0.839053975657
# Logistic Regression
# - Air Pollution Score: 0.774011299435
# - Air Pollution Precision: 0.895357249551
# - Air Pollution Recall: 0.774011299435
# - Air Pollution f1(micro): 0.774011299435
# - Air Pollution f1(macro): 0.329043302465
# - Greenhouse Score: 0.482109227872
# - Greenhouse Precision: 0.690964046903
# - Greenhouse Recall: 0.482109227872
# - Greenhouse f1(micro): 0.482109227872
# - Greenhouse f1(macro): 0.238370227947
# Random Forest Classifier
# - Air Pollution Score: 0.811676082863
# - Air Pollution Precision: 0.852870296345
# - Air Pollution Recall: 0.811676082863
# - Air Pollution f1(micro): 0.811676082863
# - Air Pollution f1(macro): 0.448907222463
# - Greenhouse Score: 0.661016949153
# - Greenhouse Precision: 0.667978706076
# - Greenhouse Recall: 0.661016949153
# - Greenhouse f1(micro): 0.661016949153
# - Greenhouse f1(macro): 0.605011436189

# Better - Random Forest Classifier
# Air Pollution Score: 0.881720430108
# Air Pollution Precision: 0.874682470268
# Air Pollution Recall: 0.881720430108
# Air Pollution f1(micro): 0.881720430108
# Air Pollution f1(macro): 0.643830266216
# Greenhouse Score: 0.831541218638
# Greenhouse Precision: 0.832775731624
# Greenhouse Recall: 0.831541218638
# Greenhouse f1(micro): 0.831541218638
# Greenhouse f1(macro): 0.744561511434

In [48]:
# Test/Train split
X_airpollution_train, X_airpollution_test, y_airpollution_train, y_airpollution_test = train_test_split(df_select, y_airpollution, random_state=42)
X_greenhouse_train, X_greenhouse_test, y_greenhouse_train, y_greenhouse_test = train_test_split(df_select, y_greenhouse, random_state=42)

In [None]:
# # Linear Regression
# lr = LinearRegression()
# lr2 = LinearRegression()
# lr_air = lr.fit(X_airpollution_train, y_airpollution_train)
# lr_green = lr2.fit(X_greenhouse_train, y_greenhouse_train)

# print 'Air Pollution Score:', lr_air.score(X_airpollution_test, y_airpollution_test)
# #print 'predicts', lr_air.predict(X_airpollution_test)
# #print 'Air Parameters:', lr_air.coef_
# print 'Greenhouse Score:', lr_green.score(X_greenhouse_test, y_greenhouse_test)
# #print 'Green Parameters:', lr_green.coef_

In [None]:
# # Random Forest Regressor
# rf = RandomForestRegressor()
# rf2 = RandomForestRegressor()
# rf_air = rf.fit(X_airpollution_train, y_airpollution_train)
# rf_green = rf2.fit(X_greenhouse_train, y_greenhouse_train)

# print 'Air Pollution Score:', rf_air.score(X_airpollution_test, y_airpollution_test)
# #print 'Air importances:', rf_air.feature_importances_
# print 'Greenhouse Score:', rf_green.score(X_greenhouse_test, y_greenhouse_test)
# #print 'Green importances:', rf_green.feature_importances_

In [88]:
# Logistic Regression
logr = LogisticRegression()
logr2 = LogisticRegression()
logr_air = logr.fit(X_airpollution_train, y_airpollution_train)
logr_green = logr2.fit(X_greenhouse_train, y_greenhouse_train)

print 'Air Pollution Score:', logr_air.score(X_airpollution_test, y_airpollution_test)
# print 'Air Parameters:', logr_air.coef_
air_pred = logr_air.predict(X_airpollution_test)
print 'Air Pollution Precision:', precision_score(y_airpollution_test, air_pred)
print 'Air Pollution Recall:', recall_score(y_airpollution_test, air_pred)
print 'Air Pollution f1(micro):', f1_score(y_airpollution_test, air_pred, average='micro')
print 'Air Pollution f1(macro):', f1_score(y_airpollution_test, air_pred, average='macro')

print 'Greenhouse Score:', logr_green.score(X_greenhouse_test, y_greenhouse_test)
# print 'Green Parameters:', logr_green.coef_
green_pred = logr_green.predict(X_greenhouse_test)
print 'Greenhouse Precision:', precision_score(y_greenhouse_test, green_pred)
print 'Greenhouse Recall:', recall_score(y_greenhouse_test, green_pred)
print 'Greenhouse f1(micro):', f1_score(y_greenhouse_test, green_pred, average='micro')
print 'Greenhouse f1(macro):', f1_score(y_greenhouse_test, green_pred, average='macro')

Air Pollution Score: 0.642292490119
Air Pollution Precision: 0.594108834227
Air Pollution Recall: 0.642292490119
Air Pollution f1(micro): 0.642292490119
Air Pollution f1(macro): 0.281078796463
Greenhouse Score: 0.51581027668
Greenhouse Precision: 0.478963549819
Greenhouse Recall: 0.51581027668
Greenhouse f1(micro): 0.51581027668
Greenhouse f1(macro): 0.393334689701


  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [89]:
# Random Forest Classifier
rfc = RandomForestClassifier()
rfc2 = RandomForestClassifier()
rfc_air = rfc.fit(X_airpollution_train, y_airpollution_train)
rfc_green = rfc2.fit(X_greenhouse_train, y_greenhouse_train)

print 'Air Pollution Score:', rfc_air.score(X_airpollution_test, y_airpollution_test)
# print 'Air importances:', rfc_air.feature_importances_
air_pred = rfc_air.predict(X_airpollution_test)
print 'Air Pollution Precision:', precision_score(y_airpollution_test, air_pred)
print 'Air Pollution Recall:', recall_score(y_airpollution_test, air_pred)
print 'Air Pollution f1(micro):', f1_score(y_airpollution_test, air_pred, average='micro')
print 'Air Pollution f1(macro):', f1_score(y_airpollution_test, air_pred, average='macro')

print 'Greenhouse Score:', rfc_green.score(X_greenhouse_test, y_greenhouse_test)
#print 'Green importances:', rfc_green.feature_importances_
green_pred = rfc_green.predict(X_greenhouse_test)
print 'Greenhouse Precision:', precision_score(y_greenhouse_test, green_pred)
print 'Greenhouse Recall:', recall_score(y_greenhouse_test, green_pred)
print 'Greenhouse f1(micro):', f1_score(y_greenhouse_test, green_pred, average='micro')
print 'Greenhouse f1(macro):', f1_score(y_greenhouse_test, green_pred, average='macro')

Air Pollution Score: 0.863636363636
Air Pollution Precision: 0.849692582923
Air Pollution Recall: 0.863636363636
Air Pollution f1(micro): 0.863636363636
Air Pollution f1(macro): 0.602174291481
Greenhouse Score: 0.98418972332
Greenhouse Precision: 0.980697686526
Greenhouse Recall: 0.98418972332
Greenhouse f1(micro): 0.98418972332
Greenhouse f1(macro): 0.865774324679


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


## Boosted models

In [90]:
# AdaBoostClassifier
adac = AdaBoostClassifier(AdaBoostClassifier(
                            DecisionTreeClassifier(min_samples_leaf= 3,\
                                   min_samples_split= 1, random_state= 1,\
                                   max_features= 'sqrt', max_depth= 3),
                            algorithm='SAMME',
                            n_estimators=100,
                            learning_rate=1))
adac2 = AdaBoostClassifier()
adac_air = adac.fit(X_airpollution_train, y_airpollution_train)
adac_green = adac2.fit(X_greenhouse_train, y_greenhouse_train)

print 'Air Pollution Score:', adac_air.score(X_airpollution_test, y_airpollution_test)
# print 'Air importances:', adac_air.feature_importances_
air_pred = adac_air.predict(X_airpollution_test)
print 'Air Pollution Precision:', precision_score(y_airpollution_test, air_pred)
print 'Air Pollution Recall:', recall_score(y_airpollution_test, air_pred)
print 'Air Pollution f1(micro):', f1_score(y_airpollution_test, air_pred, average='micro')
print 'Air Pollution f1(macro):', f1_score(y_airpollution_test, air_pred, average='macro')

print 'Greenhouse Score:', adac_green.score(X_greenhouse_test, y_greenhouse_test)
#print 'Green importances:', adac_green.feature_importances_
green_pred = adac_green.predict(X_greenhouse_test)
print 'Greenhouse Precision:', precision_score(y_greenhouse_test, green_pred)
print 'Greenhouse Recall:', recall_score(y_greenhouse_test, green_pred)
print 'Greenhouse f1(micro):', f1_score(y_greenhouse_test, green_pred, average='micro')
print 'Greenhouse f1(macro):', f1_score(y_greenhouse_test, green_pred, average='macro')

Air Pollution Score: 0.849802371542
Air Pollution Precision: 0.838124768118
Air Pollution Recall: 0.849802371542
Air Pollution f1(micro): 0.849802371542
Air Pollution f1(macro): 0.58332673888
Greenhouse Score: 0.53557312253
Greenhouse Precision: 0.311544463772
Greenhouse Recall: 0.53557312253
Greenhouse f1(micro): 0.53557312253
Greenhouse f1(macro): 0.315048583327


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [None]:
ada_boost_grid = {'algorithm': ['SAMME', 'SAMME.R'],
                      'n_estimators': [5, 6, 10, 50, 100, 200, 500, 1000],
                      'learning_rate': [.7, 1, 1.5, 2]}

adab_gridsearch = GridSearchCV(AdaBoostClassifier(
                             DecisionTreeClassifier(min_samples_leaf= 4,\
                                   min_samples_split= 1, random_state= 1,\
                                   max_features= 'sqrt', max_depth= 3)),
                             ada_boost_grid,
                             n_jobs=-1,
                             verbose=True)
adab_gridsearch.fit(X_airpollution_train, y_airpollution_train)

print "best parameters:", adab_gridsearch.best_params_

best_adab_model = adab_gridsearch.best_estimator_
'''
Ended up with:
best parameters: {'min_samples_leaf': 1,
 'n_estimators': 350, 'min_samples_split': 1, 'random_state': 1,
  'max_features': None, 'max_depth': 3}
'''
'''
n = 100, lr = .7, alg='SAMME', max_depth=2:
Air Pollution Score: 0.863799283154
Air Pollution Precision: 0.868737734962
Air Pollution Recall: 0.863799283154
Air Pollution f1(micro): 0.863799283154
Air Pollution f1(macro): 0.481595977301
Greenhouse Score: 0.505376344086
Greenhouse Precision: 0.334376191233
Greenhouse Recall: 0.505376344086
Greenhouse f1(micro): 0.505376344086
Greenhouse f1(macro): 0.182026246719

bdt_real = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1)
'''

In [50]:
# GradientBoostingClassifier
gradc = GradientBoostingClassifier(min_samples_leaf= 3, n_estimators= 5800,\
                                   min_samples_split= 1, random_state= 1,\
                                   max_features= 'sqrt', max_depth= 3)
gradc2 = GradientBoostingClassifier()
gradc_air = gradc.fit(X_airpollution_train, y_airpollution_train)
gradc_green = gradc2.fit(X_greenhouse_train, y_greenhouse_train)

print 'Air Pollution Score:', gradc_air.score(X_airpollution_test, y_airpollution_test)
print 'Air importances:', gradc_air.feature_importances_
# air_pred = gradc_air.predict(X_airpollution_test)
# print 'Air Pollution Precision:', precision_score(y_airpollution_test, air_pred)
# print 'Air Pollution Recall:', recall_score(y_airpollution_test, air_pred)
# print 'Air Pollution f1(micro):', f1_score(y_airpollution_test, air_pred, average='micro')
# print 'Air Pollution f1(macro):', f1_score(y_airpollution_test, air_pred, average='macro')

# print 'Greenhouse Score:', gradc_green.score(X_greenhouse_test, y_greenhouse_test)
# # print 'Green importances:', gradc_green.feature_importances_
# green_pred = gradc_green.predict(X_greenhouse_test)
# print 'Greenhouse Precision:', precision_score(y_greenhouse_test, green_pred)
# print 'Greenhouse Recall:', recall_score(y_greenhouse_test, green_pred)
# print 'Greenhouse f1(micro):', f1_score(y_greenhouse_test, green_pred, average='micro')
# print 'Greenhouse f1(macro):', f1_score(y_greenhouse_test, green_pred, average='macro')

Air Pollution Score: 0.821428571429
Air importances: [ 0.10739476  0.03626315  0.11681595  0.09300866  0.08014326  0.10997614
  0.15723734  0.08601109  0.11738569  0.09576397]


In [None]:
a = [ 0.05379716,  0.00754642,  0.006416,    0.02145259,  0.01265013,  0.04673592,
  0.0963817,   0.05228231,  0.09528155,  0.06848256,  0.08583496,  0.07671296,
  0.1088317,   0.09209973,  0.08299305,  0.08152053,  0.01098074]
# ['displ'0               10
# , 'cyl'1                X
# , 'trans'2              X
# , 'drive'3              X
# , 'fuel'4               X
# , 'veh_class'5          X
# , 'cert_region'6        2
# , 'trans_speed'7        11
# , 'weight'8             3
# , 'torque'9             9
# , 'torque_rpm'10        5
# , 'horsepower'11         8
# , 'msrp'12               1
# , 'city_mpg'13           4
# , 'hwy_mpg'14            6
# , 'cmb_mpg'15            7
# , 'fuel_type'16]         X
argsort(a)

In [None]:
gradient_boost_grid = {'max_depth': [1, 3, 5],
                      'max_features': ['sqrt', 'log2', None],
                      'min_samples_split': [1, 3, 5],
                      'min_samples_leaf': [1, 3, 5],
                      'n_estimators': [50, 500, 1500],
                      'random_state': [1]}

gdbr_gridsearch = GridSearchCV(GradientBoostingClassifier(),
                             gradient_boost_grid,
                             n_jobs=-1,
                             verbose=True)
gdbr_gridsearch.fit(X_airpollution_train, y_airpollution_train)

print "best parameters:", gdbr_gridsearch.best_params_

best_gdbr_model = gdbr_gridsearch.best_estimator_
'''
Ended up with:
best parameters: {'min_samples_leaf': 1,
 'n_estimators': 350, 'min_samples_split': 1, 'random_state': 1,
  'max_features': None, 'max_depth': 3}


(min_samples_leaf= 3, n_estimators= 1300,\
min_samples_split= 1, random_state= 1,\
max_features= 'sqrt', max_depth= 3)

Air Pollution Score: 0.899641577061
Air Pollution Precision: 0.89528874115
Air Pollution Recall: 0.899641577061
Air Pollution f1(micro): 0.899641577061
Air Pollution f1(macro): 0.624588137009
Greenhouse Score: 0.838709677419
Greenhouse Precision: 0.839032984873
Greenhouse Recall: 0.838709677419
Greenhouse f1(micro): 0.838709677419
Greenhouse f1(macro): 0.743233216198
'''

## K-fold validation

In [32]:
y_airpollution.index = range(2023)
y_greenhouse.index = range(2023)
df_select.index = range(2023)
y_airpollution_k = y_airpollution.reshape(2023,1)
y_greenhouse_k = y_greenhouse.reshape(2023,1)

In [33]:
kf = KFold(1114, n_folds=5, shuffle=True)
air_acc_lst = []
air_prec_lst = []
air_rec_lst = []
green_acc_lst = []
green_prec_lst = []
green_rec_lst = []
for train_index, test_index in kf:
    X_airpollution_train, X_airpollution_test = df_select.loc[train_index], df_select.loc[test_index]
    y_airpollution_train, y_airpollution_test = y_airpollution_k[train_index], y_airpollution_k[test_index]
    
    X_greenhouse_train, X_greenhouse_test = df_select.loc[train_index], df_select.loc[test_index]
    y_greenhouse_train, y_greenhouse_test = y_greenhouse_k[train_index], y_greenhouse_k[test_index]
    
    # GradientBoostingClassifier
    gradc = GradientBoostingClassifier(min_samples_leaf= 3, n_estimators= 1300,\
                                       min_samples_split= 1, random_state= 1,\
                                       max_features= 'sqrt', max_depth= 3)
    gradc2 = GradientBoostingClassifier()
    gradc_air = gradc.fit(X_airpollution_train, y_airpollution_train)
    gradc_green = gradc2.fit(X_greenhouse_train, y_greenhouse_train)

    air_acc_lst.append(gradc_air.score(X_airpollution_test, y_airpollution_test))
    air_pred = gradc_air.predict(X_airpollution_test)
    air_prec_lst.append(precision_score(y_airpollution_test, air_pred))
    air_rec_lst.append(recall_score(y_airpollution_test, air_pred))

    green_acc_lst.append(gradc_green.score(X_greenhouse_test, y_greenhouse_test))
    green_pred = gradc_green.predict(X_greenhouse_test)
    green_prec_lst.append(precision_score(y_greenhouse_test, green_pred))
    green_rec_lst.append(recall_score(y_greenhouse_test, green_pred))

print 'air acc:', np.mean(air_acc_lst)
print 'air prec:', np.mean(air_prec_lst)
print 'air rec:', np.mean(air_rec_lst)
print 'green acc:', np.mean(green_acc_lst)
print 'green prec:', np.mean(green_prec_lst)
print 'green rec:', np.mean(green_rec_lst)

IndexError: index 1016 is out of bounds for axis 0 with size 1016

In [None]:
# K-fold validation scores:
# air acc: 0.903914677009
# air prec: 0.903600255365
# air rec: 0.903914677009
# green acc: 0.893156385085
# green prec: 0.904487319304
# green rec: 0.893156385085

## Run full model and save to pickle

In [51]:
# GradientBoostingClassifier
gradbc = GradientBoostingClassifier(min_samples_leaf= 3, n_estimators= 1300,\
                                   min_samples_split= 1, random_state= 1,\
                                   max_features= 'sqrt', max_depth= 3)
# gradbc2 = GradientBoostingClassifier()
gradbc_air = gradbc.fit(df_select, y_airpollution)
# gradbc_green = gradbc2.fit(df_select, y_greenhouse)

# Save model in pickle file
with open('data/model_gradboost_air15_california.pkl', 'w') as f:
        pickle.dump(gradbc_air, f)
# with open('data/model_gradboost_green15.pkl', 'w') as f:
#         pickle.dump(gradbc_green, f)

# Good up until here

# Web Scraping

## MotorTrend below

In [None]:
# Creating make/model df
carz = df_clean[df_clean['cert_region'] == 0]['model'].unique()
carz = list(carz)

make_temp = []
model_temp = []

for car in carz:
    car_temp = car.split()
    make_temp.append(car_temp[0])
    model_temp.append(car_temp[1:][0])

df_carz = pd.DataFrame(make_temp, columns = ['make'])
df_carz['model'] = model_temp
df_carz.drop_duplicates(keep='first', inplace=True)
df_carz.index = range(288)

# Loop and make links
whole = {}

specs = {}
for m in df_carz['make'].unique():
    specs[m] = {}

for i in xrange(df_carz.shape[0]):
    make = df_carz['make'][i]
    model = df_carz['model'][i]
    specs[make][model] = {}
    link = 'http://www.motortrend.com/cars/'
    link_complete = link + make + '/' + model + '/2015/specifications/'

    # Go to the link and get the html as a string
    html = requests.get(link_complete)
    if html.status_code != 200:
        specs[make][model]['msrp'] = np.nan
        specs[make][model]['fuel_type'] = np.nan
        specs[make][model]['weight'] = np.nan
        specs[make][model]['torque'] = np.nan
        specs[make][model]['torque_rpm'] = np.nan
        specs[make][model]['horsepower'] = np.nan
        whole[link_complete] = 'Error'
    else:
        soup2 = bs4.BeautifulSoup(html.content, 'html.parser')
        whole[link_complete] = html.content
        time.sleep(2)

        lines_price = soup2.find_all('span')
        for line in lines_price:
            if line.get('itemprop') != None:
                if line.get('itemprop') == 'price':
                    specs[make][model]['msrp'] = line.string
                if line.get('itemprop') == 'fuelType':
                    specs[make][model]['fuel_type'] = line.string
        
        lines_weight = soup2.find_all('div', attrs={'class': 'key'})
        for line in lines_weight:
            if line.string == 'Curb Weight':
                specs[make][model]['weight'] = line.next.next.string
            if line.string == 'Torque':
                specs[make][model]['torque'] =  line.next.next.string
            if line.string == 'Torque (rpm)':
                specs[make][model]['torque_rpm'] =  line.next.next.string
            if line.string == 'Horsepower':
                if '@' not in line.next.next.string:
                    specs[make][model]['horsepower'] = line.next.next.string

In [None]:
# Save scrape locally
with open('data/motortrend_scrape_2015.json', 'w') as fp1:
    json.dump(whole, fp1)

# Save scrape specifics locally
with open('data/motortrend_specs_2015.json', 'w') as fp2:
    json.dump(specs, fp2)

## Testing below

In [None]:
# Reading in json to check NaN reasoning
with open('data/motortrend_specs_2015.json', 'r') as fp:
    s_temp2 = json.load(fp)

# Creating df of new features
user_ids2 = []
frames2 = []

for user_id2, d2 in s_temp2.iteritems():
    user_ids2.append(user_id2)
    frames2.append(pd.DataFrame.from_dict(d2, orient='index'))

s_temp2 = pd.concat(frames2, keys=user_ids2)
s_temp2['model'] = zip(s_temp2.index.get_level_values(0), s_temp2.index.get_level_values(1))
s_temp2['model'] = s_temp2['model'].apply(lambda x: x[0] + ' ' + x[1])
s_temp2 = s_temp2.reset_index(level=1, drop=True)

In [None]:
link = 'http://www.motortrend.com/cars/'
    link_complete = link + make + '/' + model + '/2015/specifications/'

In [None]:
# Left_Outer join of df_clean and new features
df_combo4 = df_clean.merge(s_temp2, how='left', left_on='model', right_on='model')
# Set correct values
df_combo4['weight'] = df_combo4[df_combo4['weight'].notnull()]['weight'].apply(lambda x: x[:-5])
df_combo4[df_combo4['weight'] == ''] = np.nan
df_combo4['msrp'] = df_combo4[df_combo4['msrp'].notnull()]['msrp'].apply(lambda x: int(x.replace(',', '')))

# Convert 'fuel_type' feature to int categorical
fuel_type_conv = {'Unleaded Regular': 0
           , 'Unleaded Midgrade': 1
           , 'Unleaded Premium': 2
           , 'Diesel': 3}
df_combo4.loc[:, 'fuel_type'] = df_combo4.loc[:, 'fuel_type'].map(fuel_type_conv)
df_combo4.ix[1090:1091] = df_clean.ix[1090:1091]

In [None]:
# Creating make/model df
carz = df_combo4[(df_combo4['cert_region'] == 0) & (df_combo4['weight'].isnull())]['model'].unique()
carz = list(carz)

make_temp = []
model_temp = []

for car in carz:
    car_temp = car.split()
    make_temp.append(car_temp[0])
    model_temp.append(''.join(car_temp[1:]))
    
df_carz = pd.DataFrame(make_temp, columns = ['make'])
df_carz['model'] = model_temp
df_carz.drop_duplicates(keep='first', inplace=True)
df_carz.index = range(356)

# Loop and make links
whole = {}

specs = {}
for m in df_carz['make'].unique():
    specs[m] = {}

for i in xrange(df_carz.shape[0]):
    make = df_carz['make'][i]
    model = df_carz['model'][i]
    specs[make][model] = {}
    link = 'http://www.motortrend.com/cars/'
    link_complete = link + make + '/' + model + '/2015/specifications/'

    # Go to the link and get the html as a string
    html = requests.get(link_complete)
    if html.status_code != 200:
        specs[make][model]['msrp'] = np.nan
        specs[make][model]['fuel_type'] = np.nan
        specs[make][model]['weight'] = np.nan
        specs[make][model]['torque'] = np.nan
        specs[make][model]['torque_rpm'] = np.nan
        specs[make][model]['horsepower'] = np.nan
        whole[link_complete] = 'Error'
    else:
        soup2 = bs4.BeautifulSoup(html.content, 'html.parser')
        whole[link_complete] = html.content
        time.sleep(2)

        lines_price = soup2.find_all('span')
        for line in lines_price:
            if line.get('itemprop') != None:
                if line.get('itemprop') == 'price':
                    specs[make][model]['msrp'] = line.string
                if line.get('itemprop') == 'fuelType':
                    specs[make][model]['fuel_type'] = line.string
        
        lines_weight = soup2.find_all('div', attrs={'class': 'key'})
        for line in lines_weight:
            if line.string == 'Curb Weight':
                specs[make][model]['weight'] = line.next.next.string
            if line.string == 'Torque':
                specs[make][model]['torque'] =  line.next.next.string
            if line.string == 'Torque (rpm)':
                specs[make][model]['torque_rpm'] =  line.next.next.string
            if line.string == 'Horsepower':
                if '@' not in line.next.next.string:
                    specs[make][model]['horsepower'] = line.next.next.string

In [None]:
# Save scrape locally
with open('data/motortrend_scrape_2015_v2.json', 'w') as fp1:
    json.dump(whole, fp1)

# Save scrape specifics locally
with open('data/motortrend_specs_2015_v2.json', 'w') as fp2:
    json.dump(specs, fp2)

# Filling NaNs manually

In [None]:
# No sites for:
'BMW Alpina B7 LWB'
'BMW Alpina B7 SWB'
'BMW M235i'
'BMW M235i Convertible'
'BUGATTI Veyron'
'CHEVROLET Impala Dual Fuel'
'CHEVROLET Sonic 5'
'FORD Explorer FFV'
'FORD Focus FFV'
'FORD Taurus FFV'
'JAGUAR XF FFV'
'JAGUAR XJ FFV'
'JAGUAR XJL FFV'
'LAMBORGHINI Veneno Roadster'
'LAND ROVER Range Rover FFV'
'LAND ROVER Range Rover L FFV'
'LAND ROVER Range Rover Sport FFV'
'LINCOLN MKT Livery'
'MAZDA MX-5'
'NISSAN Pathfinder Hybrid'
'TOYOTA Sequoia FFV'
'TOYOTA Tundra FFV'

In [None]:
df_combo_fill = df_combo

In [None]:
zip(range(356), carz)

In [None]:
dict_links = {}

In [None]:
part_link = 'http://www.motortrend.com/cars/volvo/xc70/'
dict_links[carz[355]] = part_link

In [None]:
dict_links

In [None]:
# Save link dict locally
with open('data/motortrend_links.json', 'w') as fp2:
    json.dump(dict_links, fp2)

In [None]:
# Open link dict locally
with open('data/motortrend_links.json', 'r') as fp3:
    linked_dict = json.load(fp3)

In [None]:
len(linked_dict)

In [None]:
# # Combine auto scrape with manual scrape
# # Open link dict locally
# with open('data/motortrend_links.json', 'r') as fp3:
#     linked_dict = json.load(fp3)

# whole_nan = {}
# for key in linked_dict.keys():
#     new_link = linked_dict[key]
#     whole_nan[key] = {}
    
#     if new_link[-1] == '/':
#         link_complete = new_link + '2015/specifications/'
#     else:
#         link_complete = new_link

#     # Go to the link and get the html as a string
#     html = requests.get(link_complete)
#     if html.status_code != 200:
#         df_combo.loc[df_combo['model'] == key, 'msrp'] = np.nan
#         df_combo.loc[df_combo['model'] == key, 'fuel_type'] = np.nan
#         df_combo.loc[df_combo['model'] == key, 'weight'] = np.nan
#         df_combo.loc[df_combo['model'] == key, 'torque'] = np.nan
#         df_combo.loc[df_combo['model'] == key, 'torque_rpm'] = np.nan
#         df_combo.loc[df_combo['model'] == key, 'horsepower'] = np.nan
#         whole_nan[key][link_complete] = 'Error'
#     else:
#         soup2 = bs4.BeautifulSoup(html.content, 'html.parser')
#         whole_nan[key][link_complete] = html.content
#         time.sleep(2)

#         lines_price = soup2.find_all('span')
#         for line in lines_price:
#             if line.get('itemprop') != None:
#                 if line.get('itemprop') == 'price':
#                     df_combo.loc[df_combo['model'] == key, 'msrp'] = line.string
#                 if line.get('itemprop') == 'fuelType':
#                     df_combo.loc[df_combo['model'] == key, 'fuel_type'] = line.string
        
#         lines_weight = soup2.find_all('div', attrs={'class': 'key'})
#         for line in lines_weight:
#             if line.string == 'Curb Weight':
#                 df_combo.loc[df_combo['model'] == key, 'weight'] = line.next.next.string
#             if line.string == 'Torque':
#                 df_combo.loc[df_combo['model'] == key, 'torque'] =  line.next.next.string
#             if line.string == 'Torque (rpm)':
#                 df_combo.loc[df_combo['model'] == key, 'torque_rpm'] =  line.next.next.string
#             if line.string == 'Horsepower':
#                 if '@' not in line.next.next.string:
#                     df_combo.loc[df_combo['model'] == key, 'horsepower'] = line.next.next.string

In [None]:
# Save link dict locally
with open('data/motortrend_specs_2015_leftovers_v2.json', 'w') as fp4:
    json.dump(whole_nan, fp4)

In [30]:
# Open link dict locally
with open('data/motortrend_links.json', 'r') as fp3:
    linked_dict = json.load(fp3)

# Open link json locally
with open('data/motortrend_specs_2015_leftovers_v2.json', 'r') as fp4:
    whole_fill = json.load(fp4)
    
    

for key2 in linked_dict.keys():
    new_link2 = linked_dict[key2]

    if new_link2[-1] == '/':
        link_complete2 = new_link2 + '2015/specifications/'
    else:
        link_complete2 = new_link2
    

    if whole_fill[key2][link_complete2] == 'Error':
            df_combo.loc[df_combo['model'] == key2, 'msrp'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'fuel_type'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'weight'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'torque'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'torque_rpm'] = np.nan
            df_combo.loc[df_combo['model'] == key2, 'horsepower'] = np.nan
    else:
        soup2 = bs4.BeautifulSoup(whole_fill[key2][link_complete2], 'html.parser')

        lines_price = soup2.find_all('span')
        for line in lines_price:
            if line.get('itemprop') != None:
                if line.get('itemprop') == 'price':
                    df_combo.loc[df_combo['model'] == key2, 'msrp'] = str(line.string)
                if line.get('itemprop') == 'fuelType':
                    df_combo.loc[df_combo['model'] == key2, 'fuel_type'] = str(line.string)
        
        lines_weight = soup2.find_all('div', attrs={'class': 'key'})
        for line in lines_weight:
            if line.string == 'Curb Weight':
                df_combo.loc[df_combo['model'] == key2, 'weight'] = str(line.next.next.string)
            if line.string == 'Torque':
                df_combo.loc[df_combo['model'] == key2, 'torque'] =  str(line.next.next.string)
            if line.string == 'Torque (rpm)':
                df_combo.loc[df_combo['model'] == key2, 'torque_rpm'] =  str(line.next.next.string)
            if line.string == 'Horsepower':
                if '@' not in line.next.next.string:
                    df_combo.loc[df_combo['model'] == key2, 'horsepower'] = str(line.next.next.string)


In [45]:
str(df_combo['weight'][0])

'2955'

In [46]:
df_combo2 = df_combo

In [68]:
df_combo2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2123
Data columns (total 24 columns):
model                   2124 non-null object
displ                   2124 non-null float64
cyl                     2124 non-null float64
trans                   2124 non-null float64
drive                   2124 non-null float64
fuel                    2124 non-null float64
cert_region             2124 non-null float64
stnd                    2124 non-null float64
stnd_description        2124 non-null object
underhood_id            2124 non-null object
veh_class               2124 non-null float64
air_pollution_score     2124 non-null float64
city_mpg                2124 non-null float64
hwy_mpg                 2124 non-null float64
cmb_mpg                 2124 non-null float64
greenhouse_gas_score    2124 non-null float64
smartway                2124 non-null float64
comb_co2                2124 non-null float64
trans_speed             2124 non-null float64
weight                

In [75]:
df_combo2[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp']] = \
    df_combo2[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp']].astype(float64)

In [76]:
df_combo2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2023 entries, 0 to 2123
Data columns (total 24 columns):
model                   2023 non-null object
displ                   2023 non-null float64
cyl                     2023 non-null float64
trans                   2023 non-null float64
drive                   2023 non-null float64
fuel                    2023 non-null float64
cert_region             2023 non-null float64
stnd                    2023 non-null float64
stnd_description        2023 non-null object
underhood_id            2023 non-null object
veh_class               2023 non-null float64
air_pollution_score     2023 non-null float64
city_mpg                2023 non-null float64
hwy_mpg                 2023 non-null float64
cmb_mpg                 2023 non-null float64
greenhouse_gas_score    2023 non-null float64
smartway                2023 non-null float64
comb_co2                2023 non-null float64
trans_speed             2023 non-null float64
weight                

In [31]:
# Setting correct Lamborghini Huracan weight
df_combo.loc[1090:1091, 'weight'] = '3135'

# Correcting cleaning weight's strings
df_combo.loc[:, 'weight'] = df_combo.loc[:, 'weight'].apply(lambda x: str(x)[:4])

# Dropping fuel_type
df_combo.drop('fuel_type', axis=1, inplace=True)

# Dropping NaNs
df_combo.dropna(inplace=True)

# Cleaning artifacts from msrp's price
df_combo['msrp'] = df_combo[df_combo['msrp'].notnull()]['msrp'].apply(lambda x: str(x).replace(',', ''))
df_combo['msrp'] = df_combo[df_combo['msrp'].notnull()]['msrp'].apply(lambda x: int(str(x).replace('.0', '')))
    
df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp']] = \
    df_combo[['weight', 'torque', 'torque_rpm', 'horsepower', 'msrp']].astype(float64)


In [40]:
df_combo[df_combo['weight'] == '']['weight']

0        
1        
2        
3        
4        
5        
6        
7        
8        
9        
10       
11       
12       
13       
14       
15       
16       
17       
18       
19       
42       
43       
44       
45       
46       
47       
52       
53       
54       
55       
       ..
2066     
2067     
2068     
2069     
2072     
2073     
2074     
2075     
2078     
2079     
2080     
2081     
2082     
2083     
2084     
2085     
2086     
2087     
2090     
2091     
2092     
2093     
2094     
2095     
2096     
2097     
2098     
2099     
2100     
2101     
Name: weight, dtype: object

In [33]:
len(df_combo[df_combo['fuel']==1]['model'].unique())

29

In [34]:
# Save filled df to local pickle
with open('data/df_combo_manfill_final.pkl', 'w') as fp5:
    pickle.dump(df_combo, fp5)