In [1]:
#Our basic imports 
import pandas as pd
import numpy as np 
import matplotlib as plt 
import seaborn as sns

%matplotlib inline

#For preprocessing our data
from sklearn.preprocessing import LabelEncoder

#Needed models
from sklearn.linear_model import LinearRegression

# Examine the Data

In [2]:
df = pd.read_csv('../data/auto_data.csv')

In [3]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
#Let's check for null values
print(df.isnull().sum().sum())
print(df.isnull().values.any())
#Awesome no imputation necessary

0
False


In [10]:
#Let's see our MPG columned binned
df['mpg'].value_counts(bins=7)
#14 to 20 roughly

(14.371, 19.743]    97
(19.743, 25.114]    92
(25.114, 30.486]    72
(30.486, 35.857]    53
(8.961, 14.371]     52
(35.857, 41.229]    25
(41.229, 46.6]       7
Name: mpg, dtype: int64

In [11]:
df['weight'].value_counts(bins=7)

(2116.857, 2620.714]    104
(2620.714, 3124.571]     74
(1609.472, 2116.857]     64
(3124.571, 3628.429]     59
(3628.429, 4132.286]     44
(4132.286, 4636.143]     39
(4636.143, 5140.0]       14
Name: weight, dtype: int64

In [9]:
df['horsepower']

150    22
90     20
88     19
110    18
100    17
       ..
138     1
148     1
158     1
167     1
193     1
Name: horsepower, Length: 94, dtype: int64

In [5]:
#Now let's ook at our data types 
df.dtypes
#Hmm horsepower is an object?
#Car name makes sense 

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [6]:
#Let's deal with car_name before we switch over our horsepower column to a numeric type
#Since we can not use categorical data in any linear regression model let's 
#Label encode our car_name

le = LabelEncoder()

df['car_name_label'] = le.fit_transform(df['car_name'])
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,car_name_label
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,49
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,36
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,231
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,14
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,161


In [8]:
#Now let's us a for loop to switch over the horsepower column
for column in df.columns:
    if(column !='car_name' and column !='model_year' and column!='origin'):
        df[column]=pd.to_numeric(df[column])

ValueError: Unable to parse string "?" at position 32

In [None]:
df.shape

In [None]:
#Let's check correlations between our dependant variable (mpg) and some independant variables
df.plot(x='displacement', y='mpg', kind='scatter', color='DarkGreen', title='Correlation Between Displacement and MPG')
df.plot(x='horsepower',  y='mpg', kind='scatter', color='teal', title='Correlation Between Horsepower and MPG')
df.plot(x='weight', y='mpg',  kind='scatter', color='indigo', title='Correlation Between Weight and MPG')
plt.show()

In [None]:
sns.pairplot(df)

In [None]:
#Now let's ook at our data types 
df.dtypes
#Hmm horsepower is an object?
#Car name makes sense 

In [None]:
#Let's take a look at value counts for our cylinders 
df['cylinders'].value_counts()

In [None]:
df['horsepower'].value_counts()

In [None]:
df['car_name'].value_counts()

In [None]:
#Since we can not use categorical data in any linear regression model let's 
#Label encode our car_name

le = LabelEncoder()

df['car_name_label'] = le.fit_transform(df['car_name'])
df.head()

In [None]:
df['car_name_label'].value_counts()