In [0]:
%%capture
!pip install plotly

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px

pd.set_option("display.max_columns", 100)

  import pandas.util.testing as tm


# Street Smarts

## Data

#### Orignal data source found [here](https://www.fueleconomy.gov/feg/download.shtml) (fueleconomy.gov)

#### Data Dicitonary found [here](https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle)

#### Fuel Economy guide for 2020 vehicles [here](https://www.fueleconomy.gov/feg/pdfs/guides/FEG2020.pdf)

#### Find the actual values used that create the EPA estimates for fuel mileage [here](https://www.fueleconomy.gov/feg/Find.do?action=bt1). Read the fin print on each label to find out how its calculated

## Evaluation Data (potential)

Even though a dedicated Co2 column doesnt exist until 2013, we can still have access to over [3 million](https://www.cars.com/for-sale/searchresults.action/?page=1&perPage=20&rd=99999&searchSource=GN_REFINEMENT&sort=relevance&yrId=47272%2C51683%2C56007%2C58487%2C30031936%2C35797618%2C36362520%2C36620293&zc=98118) listings using cars.com alone, creating a substantial evaluation set. Cars.com has an API found here https://developer.cars.com/apis

In [0]:
# Data from EPA includes all vehicles from 
# path = "../data/vehicles.csv"
path = "https://raw.githubusercontent.com/mpHarm88/streetsmart/master/data/vehicles.csv"
df = pd.read_csv(path)
print(f"The shape of the data: {df.shape}")
print(df.columns)

The shape of the data: (42230, 83)
Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',
       'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDsc


Columns (70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.



In [0]:
# Years included and their counts
df["year"].value_counts()

1984    1964
1985    1701
2018    1344
2019    1335
2017    1293
2015    1283
2016    1262
1987    1247
2014    1225
1986    1210
2020    1204
2008    1187
2013    1184
2009    1184
2005    1166
1989    1153
2012    1152
1991    1132
2011    1130
1988    1130
2007    1126
2004    1122
1992    1121
2010    1109
2006    1104
1993    1093
1990    1078
2003    1044
1994     982
2002     975
1995     967
2001     911
1999     852
2000     840
1998     812
1996     773
1997     762
2021      73
Name: year, dtype: int64

In [0]:
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),-1,1300,0,Regular,Regular Gasoline,-1,-1,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),-1,2450,0,Regular,Regular Gasoline,-1,-1,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-7250,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),-1,1000,0,Regular,Regular Gasoline,-1,-1,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0,0.0,47.0,0.0,Subcompact Cars,1985,0,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,-1,2450,0,Regular,Regular Gasoline,-1,-1,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-7250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2000,0,Premium,Premium Gasoline,-1,-1,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0,0.0,32.0,0.0,Compact Cars,1993,-5000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [0]:
df.tail()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
42225,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,403.954545,22,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1250,0,Regular,Regular Gasoline,-1,-1,26,0.0,0,0.0,0.0,0.0,0.0,0,0,9995,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,24.0,0.0,37.0,0.0,Compact Cars,1993,-1250,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
42226,14.33087,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,386.391304,23,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,66030,(FFS),-1,1200,0,Regular,Regular Gasoline,-1,-1,28,0.0,0,0.0,0.0,0.0,0.0,0,0,9996,0,14,Subaru,Legacy,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,25.0,0.0,39.0,0.0,Compact Cars,1993,-1000,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
42227,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1300,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9997,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,23.0,0.0,34.0,0.0,Compact Cars,1993,-1500,,CLKUP,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
42228,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66030,(FFS),-1,1300,0,Regular,Regular Gasoline,-1,-1,24,0.0,0,0.0,0.0,0.0,0.0,0,0,9998,0,14,Subaru,Legacy AWD,Y,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.0,0.0,34.0,0.0,Compact Cars,1993,-1500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
42229,18.311667,0.0,0.0,0.0,16,0.0,0,0.0,0.0,0.0,0.0,-1,-1,0.0,493.722222,18,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",-1,2100,0,Premium,Premium Gasoline,-1,-1,21,0.0,0,0.0,0.0,0.0,0.0,0,0,9999,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Automatic 4-spd,20.0,0.0,29.0,0.0,Compact Cars,1993,-5500,,CLKUP,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [0]:
# Find NaN
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42230 entries, 0 to 42229
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        42230 non-null  float64
 1   barrelsA08       42230 non-null  float64
 2   charge120        42230 non-null  float64
 3   charge240        42230 non-null  float64
 4   city08           42230 non-null  int64  
 5   city08U          42230 non-null  float64
 6   cityA08          42230 non-null  int64  
 7   cityA08U         42230 non-null  float64
 8   cityCD           42230 non-null  float64
 9   cityE            42230 non-null  float64
 10  cityUF           42230 non-null  float64
 11  co2              42230 non-null  int64  
 12  co2A             42230 non-null  int64  
 13  co2TailpipeAGpm  42230 non-null  float64
 14  co2TailpipeGpm   42230 non-null  float64
 15  comb08           42230 non-null  int64  
 16  comb08U          42230 non-null  float64
 17  combA08     

### Numerics

"co2" column looks to be filled with -1 values which im assuming was a placeholder for NaN. I'm assuming there were some years where CO2 was not recorded due to global warming not being a huge concern during those times (pre 2000)

In [0]:
df.describe()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,engId,feScore,fuelCost08,fuelCostA08,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,UCity,UCityA,UHighway,UHighwayA,year,youSaveSpend,charge240b,phevCity,phevHwy,phevComb
count,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,41990.0,41992.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0,42230.0
mean,17.208441,0.22161,0.0,0.054709,18.512929,6.441259,0.72645,0.584693,0.00045,0.397819,0.002062,95.620791,5.961804,17.551395,464.279643,20.757755,7.17349,0.784608,0.6208,0.404657,0.000347,0.002042,5.713503,3.289591,7963.338054,0.474473,1601.63391,96.807956,0.473692,-0.920554,24.641748,8.441684,0.886834,0.690087,0.000232,0.413245,0.002015,1.992375,10.183614,21238.705115,1.79479,6.092351,13.461591,33.772839,0.978025,0.957081,0.113151,0.909845,0.105257,23.410311,0.948664,34.555428,1.228918,2002.011414,-3005.209567,0.012583,0.191309,0.192896,0.191191
std,4.649507,1.142513,0.0,0.665389,8.355167,12.181091,5.657377,5.52782,0.038272,4.268703,0.033192,180.085172,57.549604,93.436211,124.384072,8.062297,12.826143,5.668628,5.492966,4.29884,0.033535,0.032815,1.758066,1.356875,17082.701784,2.776665,482.583847,509.667828,2.777604,0.658698,8.026049,14.294617,5.851538,5.606418,0.028039,4.354442,0.032346,5.921842,27.940778,12319.604224,4.348795,9.605151,31.016657,46.048681,14.874041,15.00613,2.294075,14.549553,2.049242,11.881505,7.954267,11.664992,8.332576,11.382099,2414.683502,0.278699,3.235159,3.127795,3.166686
min,0.06,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,-1.0,450.0,0.0,-1.0,-1.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1984.0,-22000.0,0.0,0.0,0.0,0.0
25%,14.33087,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,386.391304,17.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.2,0.0,-1.0,1250.0,0.0,-1.0,-1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10558.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.3,0.0,28.0,0.0,1991.0,-4500.0,0.0,0.0,0.0,0.0
50%,16.4805,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,444.35,20.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,172.0,-1.0,1500.0,0.0,-1.0,-1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21117.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.5,0.0,33.3333,0.0,2003.0,-2500.0,0.0,0.0,0.0,0.0
75%,19.388824,0.0,0.0,0.0,21.0,15.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,522.764706,23.0,17.394975,0.0,0.0,0.0,0.0,0.0,6.0,4.3,4134.0,-1.0,1900.0,0.0,-1.0,-1.0,28.0,21.4451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31949.75,0.0,13.0,0.0,91.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,39.1,0.0,2012.0,-1250.0,0.0,0.0,0.0,0.0
max,47.087143,18.311667,0.0,13.0,150.0,150.0,145.0,145.0835,5.35,122.0,0.927,847.0,713.0,713.0,1269.571429,141.0,140.5603,133.0,133.2662,121.0,4.8,0.92,16.0,8.4,69102.0,10.0,5400.0,3800.0,10.0,8.0,132.0,131.9991,121.0,121.2005,4.06,120.0,0.91,49.0,195.0,42590.0,41.0,55.0,194.0,192.0,373.0,384.3318,135.28,358.5551,114.76,224.8,207.2622,187.1,173.1436,2021.0,2750.0,8.5,97.0,81.0,88.0


### Categoricals

In [0]:
df.describe(exclude="number")

Unnamed: 0,drive,eng_dscr,fuelType,fuelType1,make,model,mpgData,phevBlended,trany,VClass,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,c240bDscr,createdOn,modifiedOn,startStop
count,41041,25898,42230,42230,42230,42230,42230,42230,42219,42230,2483,15047,7442,878,3762,1673,1668,1009,11412,101,95,42230,42230,10526
unique,7,560,14,6,137,4217,2,2,38,34,3,52,1,1,8,4,231,170,49,5,7,312,176,2
top,Front-Wheel Drive,(FFS),Regular,Regular Gasoline,Chevrolet,F150 Pickup 2WD,N,False,Automatic 4-spd,Compact Cars,G,CLKUP,T,S,FFV,E85,290,288V Ni-MH,GMX,standard charger,80 amp dual charger,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,N
freq,14515,8827,26913,28335,4086,224,28934,42104,11048,5988,1504,7809,7442,878,1466,1466,74,131,1639,90,55,34217,29343,6675


### Find where co2 feature equals -1, and how many

Seeing that there are ~31,000 columns where co2 equals one, it becomes a problem if we are able to find out how to calculate co2 for these older vehicles

In [0]:
# Rows where co2 equals -1
df[df["co2"] == -1].shape

(31954, 83)

In [0]:
# Rows where data doesnt equal -1
df[df["co2"] != -1].shape

(10276, 83)

In [0]:
# Are these Electric Vehicles?
df[df["co2"] == 0].shape

(237, 83)

In [0]:
# wrangle

def cleaner(df):
    df = df.copy()
    
    
    
    return df

In [0]:
df.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',
       'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn', 'modifiedOn

## Subsetting electric vehicles only and their variants

In [0]:
#Subset rows where evmotor is not False
df_ev = df[~df["evMotor"].isnull()]

# Pick out desirable columns

ls = ['make', 'model', 'year', 'charge240', 'city08','city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2','co2A', 
      'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U','combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF',
      'feScore', 'fuelCost08', 'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA','highway08', 'highway08U', 
      'highwayA08', 'highwayA08U', 'highwayCD','highwayE', 'highwayUF', 'mpgData', 'phevBlended', 'rangeCityA', 'rangeHwy', 'rangeHwyA', 
      'trany', 'UCity', 'UCityA', 'UHighway', 'UHighwayA', 'VClass', 'youSaveSpend', 'guzzler', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'c240Dscr', 'charge240b', 'c240bDscr', 'startStop', 'phevCity', 'phevHwy',
       'phevComb']

df_ev[ls].head()

Unnamed: 0,make,model,year,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,mpgData,phevBlended,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,youSaveSpend,guzzler,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,c240Dscr,charge240b,c240bDscr,startStop,phevCity,phevHwy,phevComb
7138,Nissan,Altra EV,2000,0.0,81,0.0,0,0.0,0.0,41.0,0.0,0,-1,0.0,0.0,85,0.0,0,0.0,40.0,0.0,0.0,-1,800,0,Electricity,Electricity,-1,-1,91,0.0,0,0.0,0.0,37.0,0.0,N,False,0.0,0.0,0.0,,116.2069,0.0,129.6154,0.0,Midsize Station Wagons,1000,,,,EV,,,62 KW AC Induction,,0.0,,N,0,0,0
7139,Toyota,RAV4 EV,2000,0.0,81,0.0,0,0.0,0.0,41.0,0.0,0,-1,0.0,0.0,72,0.0,0,0.0,47.0,0.0,0.0,-1,900,0,Electricity,Electricity,-1,-1,64,0.0,0,0.0,0.0,53.0,0.0,N,False,0.0,0.0,0.0,,116.2069,0.0,91.0811,0.0,Sport Utility Vehicle - 2WD,500,,,,EV,,,50 KW DC,,0.0,,N,0,0,0
8143,Toyota,RAV4 EV,2001,0.0,81,0.0,0,0.0,0.0,41.0,0.0,0,-1,0.0,0.0,72,0.0,0,0.0,47.0,0.0,0.0,-1,900,0,Electricity,Electricity,-1,-1,64,0.0,0,0.0,0.0,53.0,0.0,N,False,0.0,0.0,0.0,,116.2069,0.0,91.0811,0.0,Sport Utility Vehicle - 2WD,500,,,,EV,,,50 KW DC,,0.0,,N,0,0,0
8144,Ford,Th!nk,2001,0.0,74,0.0,0,0.0,0.0,46.0,0.0,0,-1,0.0,0.0,65,0.0,0,0.0,52.0,0.0,0.0,-1,1000,0,Electricity,Electricity,-1,-1,58,0.0,0,0.0,0.0,59.0,0.0,N,False,0.0,0.0,0.0,,105.3125,0.0,82.1951,0.0,Two Seaters,0,,,,EV,,,27 KW AC Induction,,0.0,,N,0,0,0
8146,Ford,Explorer USPS Electric,2001,0.0,45,0.0,0,0.0,0.0,75.0,0.0,0,-1,0.0,0.0,39,0.0,0,0.0,87.0,0.0,0.0,-1,1700,0,Electricity,Electricity,-1,-1,33,0.0,0,0.0,0.0,102.0,0.0,N,False,0.0,0.0,0.0,,62.4074,0.0,46.8056,0.0,Sport Utility Vehicle - 2WD,-3500,,,,EV,,,67 KW AC Induction,,0.0,,N,0,0,0


### Plotly

Docs: [here](https://plotly.com/python-api-reference/plotly.express.html)

In [0]:
#Subset data to only major manufacturers
choice = ["Ford", "Nissan", "Toyota", "Chevrolet"]
df_sub = df[df["make"].isin(choice)]

df_sub.shape

(11155, 83)

In [0]:
px.scatter(df_sub, 
           x="co2",
           y="comb08",
           color="fuelType1",
           hover_name="model", 
           hover_data=["evMotor", "guzzler", "combE"], 
           title="Does higher co2 level ever indicate better MPG?"
           )

In [0]:
px.violin(df_sub,
          x="make", 
          y="comb08", 
          hover_data=["evMotor", "guzzler", "combE", "year"],
          hover_name="model",
          title="Visualizing the distribution of manufactuer models and their combined MPG")

NameError: ignored

#Data Cleaning

In [0]:

# Fill all -1 values with NaN
path = "https://raw.githubusercontent.com/mpHarm88/streetsmart/master/data/vehicles.csv"
df = pd.read_csv(path,
                 na_values="-1")
print(f"The shape of the data: {df.shape}")
print(df.columns)

The shape of the data: (42230, 83)
Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',
       'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDsc


Columns (70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.



In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42230 entries, 0 to 42229
Data columns (total 83 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        42230 non-null  float64
 1   barrelsA08       42230 non-null  float64
 2   charge120        42230 non-null  float64
 3   charge240        42230 non-null  float64
 4   city08           42230 non-null  int64  
 5   city08U          42230 non-null  float64
 6   cityA08          42230 non-null  int64  
 7   cityA08U         42230 non-null  float64
 8   cityCD           42230 non-null  float64
 9   cityE            42230 non-null  float64
 10  cityUF           42230 non-null  float64
 11  co2              10276 non-null  float64
 12  co2A             633 non-null    float64
 13  co2TailpipeAGpm  42230 non-null  float64
 14  co2TailpipeGpm   42230 non-null  float64
 15  comb08           42230 non-null  int64  
 16  comb08U          42230 non-null  float64
 17  combA08     

In [0]:
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,cityUF,co2,co2A,co2TailpipeAGpm,co2TailpipeGpm,comb08,comb08U,combA08,combA08U,combE,combinedCD,combinedUF,cylinders,displ,drive,engId,eng_dscr,feScore,fuelCost08,fuelCostA08,fuelType,fuelType1,ghgScore,ghgScoreA,highway08,highway08U,highwayA08,highwayA08U,highwayCD,highwayE,highwayUF,hlv,hpv,id,lv2,lv4,make,model,mpgData,phevBlended,pv2,pv4,range,rangeCity,rangeCityA,rangeHwy,rangeHwyA,trany,UCity,UCityA,UHighway,UHighwayA,VClass,year,youSaveSpend,guzzler,trans_dscr,tCharger,sCharger,atvType,fuelType2,rangeA,evMotor,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,0.0,,,0.0,423.190476,21,0.0,0,0.0,0.0,0.0,0.0,4.0,2.0,Rear-Wheel Drive,9011,(FFS),,1300,0,Regular,Regular Gasoline,,,25,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0,0,Alfa Romeo,Spider Veloce 2000,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,23.3333,0.0,35.0,0.0,Two Seaters,1985,-1500,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,0.0,,,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,12.0,4.9,Rear-Wheel Drive,22020,(GUZZLER),,2450,0,Regular,Regular Gasoline,,,14,0.0,0,0.0,0.0,0.0,0.0,0,0,10,0,0,Ferrari,Testarossa,N,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,11.0,0.0,19.0,0.0,Two Seaters,1985,-7250,T,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,0.0,,,0.0,329.148148,27,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,Front-Wheel Drive,2100,(FFS),,1000,0,Regular,Regular Gasoline,,,33,0.0,0,0.0,0.0,0.0,0.0,19,77,100,0,0,Dodge,Charger,Y,False,0,0,0,0.0,0.0,0.0,0.0,Manual 5-spd,29.0,0.0,47.0,0.0,Subcompact Cars,1985,0,,SIL,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,0.0,,,0.0,807.909091,11,0.0,0,0.0,0.0,0.0,0.0,8.0,5.2,Rear-Wheel Drive,2850,,,2450,0,Regular,Regular Gasoline,,,12,0.0,0,0.0,0.0,0.0,0.0,0,0,1000,0,0,Dodge,B150/B250 Wagon 2WD,N,False,0,0,0,0.0,0.0,0.0,0.0,Automatic 3-spd,12.2222,0.0,16.6667,0.0,Vans,1985,-7250,,,,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,0.0,,,0.0,467.736842,19,0.0,0,0.0,0.0,0.0,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,66031,"(FFS,TRBO)",,2000,0,Premium,Premium Gasoline,,,23,0.0,0,0.0,0.0,0.0,0.0,0,0,10000,0,14,Subaru,Legacy AWD Turbo,N,False,0,90,0,0.0,0.0,0.0,0.0,Manual 5-spd,21.0,0.0,32.0,0.0,Compact Cars,1993,-5000,,,T,,,,,,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


## Remove Diesel and Natural Gas from the df

Focus on gas, hybrid, electric

Subset rows to not include Diesel or Natural Gas


In [0]:
import numpy as np

# Subset data for rows that don't include diesel fuel types
df_sub = df[~(df["fuelType1"].isin(["Diesel", "Natural Gas"]))]
df_sub.shape

(40968, 83)

In [0]:
df_sub["fuelType"].value_counts()

Regular                        26913
Premium                        12027
Gasoline or E85                 1339
Electricity                      237
Premium or E85                   127
Midgrade                         118
Premium and Electricity           75
Regular Gas and Electricity       51
Premium Gas or Electricity        49
Gasoline or natural gas           20
Gasoline or propane                8
Regular Gas or Electricity         4
Name: fuelType, dtype: int64

In [0]:
df_sub["fuelType1"].value_counts()

Regular Gasoline     28335
Premium Gasoline     12278
Electricity            237
Midgrade Gasoline      118
Name: fuelType1, dtype: int64

In [0]:
df_sub["fuelType2"].value_counts()

E85            1466
Electricity     179
Natural Gas      20
Propane           8
Name: fuelType2, dtype: int64

In [0]:
dict_map = {
    "Regular Gasoline": "Gas",
    "Premium Gasoline": "Gas",
    "Midgrade Gasoline": "Gas"
}

In [0]:
def gasoline(x):
    """
    Turn all values into either Gasoline, Hybrid, or Electricity. Otherwise return np.nan
    """

    for i in x:
        if x == "Regular":
            return "Gasoline"
        elif x == "Premium":
            return "Gasoline"
        elif x == "Gasoline or E85":
            return "Hybrid"
        elif x == "Electricity":
            return "Electricity"
        elif x == "Premium or E85":
            return "Hybrid"
        elif x == "Midgrade":
            return "Gasoline"
        elif x == "Premium and Electricity":
            return "Hybrid"
        elif x == "Regular Gas and Electricity":
            return "Hybrid"
        elif x == "Premium Gas or Electricity":
            return "Hybrid"
        elif x == "Regular Gas or Electricity":
            return "Hybrid"
        else:
            return np.nan

    return x

In [0]:
# Apply functions

df_sub["car_fuel_type"] = df_sub["fuelType"].apply(gasoline)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
df_sub["car_fuel_type"].value_counts()

Gasoline       39058
Hybrid          1645
Electricity      237
Name: car_fuel_type, dtype: int64