In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
import itertools

%matplotlib inline

# How Much is Your Car Worth?

Data about the retail price of 2005 General Motors cars can be found in `car_data.csv`.

The columns are:

1. Price: suggested retail price of the used 2005 GM car in excellent condition.
2. Mileage: number of miles the car has been driven
3. Make: manufacturer of the car such as Saturn, Pontiac, and Chevrolet
4. Model: specific models for each car manufacturer such as Ion, Vibe, Cavalier
5. Trim (of car): specific type of car model such as SE Sedan 4D, Quad Coupe 2D          
6. Type: body type such as sedan, coupe, etc.      
7. Cylinder: number of cylinders in the engine        
8. Liter: a more specific measure of engine size     
9. Doors: number of doors           
10. Cruise: indicator variable representing whether the car has cruise control (1 = cruise)
11. Sound: indicator variable representing whether the car has upgraded speakers (1 = upgraded)
12. Leather: indicator variable representing whether the car has leather seats (1 = leather)

## Tasks, Part 1

1. Find the linear regression equation for mileage vs price.
2. Chart the original data and the equation on the chart.
3. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)

## Tasks, Part 2

1. Use mileage, cylinders, liters, doors, cruise, sound, and leather to find the linear regression equation.
2. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)
3. Find the combination of the factors that is the best predictor for price.

## Tasks, Part 3

1. Research dummy variables in scikit-learn to see how to use the make, model, and body type.
2. Find the best combination of factors to predict price.

In [None]:
# Load the car listings from car_data.csv and preview the first three rows.
df = pd.read_csv("car_data.csv")
df.head(3)

### Tasks completed for Part 1:

In [None]:
# Part 1: fit Price as a linear function of Mileage alone.
# Both frames are kept two-dimensional (double brackets), as in the original fit.
X_mileage = df[['Mileage']]
y_price = df[['Price']]

regr1 = linear_model.LinearRegression()
regr1.fit(X_mileage, y_price)

# Report the slope, the intercept, and the R^2 of the fit on the same data.
print(
    "The x coefficient = {}".format(regr1.coef_),
    "  The y intercept = {}".format(regr1.intercept_),
    "\n          The Rsq = {}".format(regr1.score(X_mileage, y_price)),
    sep="\n",
)

In [None]:
# Scatter the raw data and overlay the fitted Price ~ Mileage line from regr1.
fig, ax = plt.subplots()
ax.scatter(df.Mileage, df.Price)
ax.set_ylabel("Price")
ax.set_xlabel("Mileage")
ax.set_title("Price x Mileage for Used Cars")

mileage_2d = df[['Mileage']]
ax.plot(mileage_2d, regr1.predict(mileage_2d), color='red', linewidth=2)
plt.show()

# Written conclusions for Part 1, printed below the chart.
for conclusion in (
    "Mileage alone is not a good predictor of price.",
    "The Rsq at .02 is very low.  Only 2% of Price variance is captured in Mileage.",
    "The current model is insufficient to predict price.",
):
    print(conclusion)

### Tasks completed for Part 2:

In [None]:
# Part 2: regress Price on all seven numeric / indicator columns.
fulldf = df[['Price', 'Mileage', 'Cylinder', 'Liter', 'Doors','Cruise','Sound','Leather']]

# Named `input_data` rather than `input`: assigning to `input` would shadow
# Python's built-in input() for every later cell in the kernel session.
input_data = fulldf[['Mileage', 'Cylinder', 'Liter', 'Doors','Cruise','Sound','Leather']]
output = fulldf['Price']

regrm = linear_model.LinearRegression()
regrm.fit(input_data, output)
print("Coefficients for Mileage, Cylinder, Liter, Doors, Cruise, Sound, leather:")
print(regrm.coef_)
print("Y-intercept = {}".format(regrm.intercept_))
# R^2 is computed on the same rows the model was fitted on.
print("    The Rsq = {}".format(regrm.score(input_data, output)))

In [None]:
# Pairwise correlation matrix of the columns in fulldf (Price and its candidate predictors).
fulldf.corr()

In [None]:
# Reduced model: Price against the three predictors picked from the correlation table.
subdf = df[['Price','Mileage','Cylinder','Cruise']]   #Based on the highest correlation with Price.

# Read from subdf (it was previously built but never used), so this cell no
# longer depends on whatever `fulldf` currently holds. The variable is named
# `input_data` rather than `input` to avoid shadowing the built-in input().
input_data = subdf[['Mileage','Cylinder','Cruise']]
output = subdf['Price']

regrm = linear_model.LinearRegression()
regrm.fit(input_data, output)
print("Coefficients for Mileage, Cylinder + Cruise:")
print(regrm.coef_)
print("Y-intercept = {}".format(regrm.intercept_))
# R^2 is computed on the same rows the model was fitted on.
print("    The Rsq = {}".format(regrm.score(input_data, output)))

#### The correlation coefficient identified Cylinder number and Cruise control as moderately associated with price.  
#### Mileage was added even though the correlation coefficient is low. 
#### This will be revisited with a comprehensive iterative loop, but the current model meets the MVP.
#### Here is an attempt at an iterative loop through column combinations [but its results do not align with the full-column model].

In [None]:
# Re-read the complete CSV (all columns) as the input for the combination search.
fulldf = pd.read_csv("car_data.csv")
# Candidate predictor columns whose combinations are passed to high_value_regression.
possible_columns = ['Mileage','Cylinder','Liter','Doors','Cruise','Sound','Leather']

def high_value_regression(fulldf, possible_columns, top_n=5):
    """Rank every combination of predictor columns by its R^2 against 'Price'.

    For each non-empty subset of ``possible_columns`` (from single columns up
    to and including the full set) a LinearRegression is fitted with
    ``fulldf['Price']`` as the target, and the R^2 on that same data is
    recorded.

    Parameters
    ----------
    fulldf : pandas.DataFrame
        Data containing a 'Price' column and every name in ``possible_columns``.
    possible_columns : list of str
        Candidate predictor column names.
    top_n : int, default 5
        Number of best-scoring combinations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Grouping' (the column names joined with ', ') and 'Score'
        (the R^2), sorted by 'Score' from high to low and limited to ``top_n`` rows.
    """
    output = fulldf['Price']
    results = []

    # The upper bound is len(possible_columns) + 1 so that the combination
    # containing *all* columns is evaluated as well.
    for size in range(1, len(possible_columns) + 1):
        # itertools.combinations yields tuples; pandas needs a list to select columns.
        for combo in itertools.combinations(possible_columns, size):
            columns = list(combo)
            input_data = fulldf[columns]

            regrm = linear_model.LinearRegression()
            regrm.fit(input_data, output)

            results.append([', '.join(columns), regrm.score(input_data, output)])

    results = pd.DataFrame(results, columns=['Grouping', 'Score'])

    # sort_values replaces the removed DataFrame.sort_index(by=...) form.
    return results.sort_values(by='Score', ascending=False).head(top_n)

# NOTE: the search previously stopped at len(possible_columns) - 1 columns, so the
# all-column model was never scored. That is why the initial 7-column Rsq
# (0.44626, which includes 'Liter') was higher than the maximum in the table.

In [None]:
# Display the best-scoring column combinations for the numeric / indicator predictors.
high_value_regression(fulldf, possible_columns)

### Tasks completed for Part 3:

In [None]:
# Part 3: re-read the data; the dummy columns are added to this dataframe.
dumbdf = pd.read_csv("car_data.csv")

# Experimental subset of categorical columns (not referenced by the loop below).
dumb_list = ['Make', 'Model', 'Type']

# One new boolean column is created per unique value of 'Make', named after
# that value, so the number of new columns equals the number of unique makes.
# The next linear regression uses these dummy columns in place of the
# original string column.
make_values = dumbdf['Make']
for make in make_values.unique():
    dumbdf[str(make)] = make_values == make

# Dummy columns for 'Model' and 'Type' are intentionally not generated here.
      

In [None]:
# Preview the first three rows of dumbdf to check the added dummy columns.
dumbdf.head(3)

In [None]:
# List the column names of dumbdf; the dummy column names are needed for the next cell.
dumbdf.columns

In [None]:
# Predictor set for the Part 3 search:
#   - 'Make', 'Model', and 'Type' themselves are removed to avoid oversampling.
#   - The kernel would not run with the additional Model data, so all dummy
#     values associated with 'Model' are removed.
#   - All dummy values associated with 'Type' are removed.
#   - 'Liter', 'Cruise', and 'Sound' are removed.
new_columns = [
    'Mileage', 'Cylinder', 'Doors', 'Leather',                       # numeric / indicator columns
    'Buick', 'Cadillac', 'Chevrolet', 'Pontiac', 'SAAB', 'Saturn',   # 'Make' dummy columns
]

high_value_regression(dumbdf, new_columns)   # return results