In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [2]:
def processData(inX, hasGT=False, columns=None, default_year=2015):
    """Turn a raw vehicle-listing frame into a one-hot encoded feature matrix.

    Parameters
    ----------
    inX : pandas.DataFrame
        Raw listings. Must contain 'fuel', 'condition', 'year',
        'manufacturer', 'drive' and 'transmission', plus 'price' when
        ``hasGT`` is true or 'Id' when it is false. The frame is not modified.
    hasGT : bool, default False
        True when ``inX`` carries the ground-truth 'price' column.
    columns : sequence of str, optional
        Reference feature layout (e.g. the columns returned for the training
        set). When given, the result is reindexed to exactly these columns:
        dummy columns absent from ``inX`` are added filled with 0, and columns
        not in the reference are dropped. ``None`` keeps whatever columns the
        data in ``inX`` produces.
    default_year : scalar, default 2015
        Value used to fill missing 'year' entries.

    Returns
    -------
    X : pandas.DataFrame
        'year' followed by the dummy columns of the categorical features.
    y : pandas.Series
        ``price`` when ``hasGT`` is true, otherwise the 'Id' column of ``inX``.

    Raises
    ------
    KeyError
        If a required column is missing from ``inX``.
    """
    # Order matters: pd.get_dummies emits the dummy groups in this order.
    categorical = ['fuel', 'condition', 'drive', 'manufacturer', 'transmission']
    features = ['fuel', 'condition', 'year', 'manufacturer', 'drive', 'transmission']
    if hasGT:
        features.append('price')

    # Copy *after* selecting the columns so the fills below operate on an
    # independent frame (no SettingWithCopyWarning, and inX stays untouched).
    X = inX[features].copy()

    # Missing categories become their own explicit 'unknown' level.
    for col in categorical:
        X[col] = X[col].fillna('unknown')
    X['year'] = X['year'].fillna(default_year)

    X = pd.get_dummies(X, columns=categorical)

    if hasGT:
        y = X['price']
        X = X.drop(columns=['price'])
    else:
        y = inX['Id']

    # Legacy guard kept for backward compatibility: this dummy column is always
    # removed when present. NOTE(review): presumably it exists in only one of
    # the two data files - confirm; `columns=` is the general way to align.
    if 'manufacturer_mercedesbenz' in X.columns:
        X = X.drop(columns=['manufacturer_mercedesbenz'])

    if columns is not None:
        X = X.reindex(columns=columns, fill_value=0)
    return X, y

In [3]:
# Encode the training set first: its columns define the feature space that the
# PCA and the models are fitted on.
X, y = processData(pd.read_csv('datamad0819-vehicles/cars_train.csv'), True)
X_submission, submission_id = processData(pd.read_csv('datamad0819-vehicles/cars_test.csv'))

# The two files are one-hot encoded independently, so a category that occurs in
# only one of them would give the frames different columns. Force the
# submission features onto the training layout (missing dummies become 0,
# unseen ones are dropped) so later transforms see matching columns.
X_submission = X_submission.reindex(columns=X.columns, fill_value=0)

In [4]:


# Fit a 15-component PCA on the training features, then project the submission
# features with the same fitted components. Both projections are wrapped back
# into DataFrames (integer column labels 0..14).
pca = PCA(n_components=15)
X_train = pd.DataFrame(pca.fit_transform(X))
X_train_submission = pd.DataFrame(pca.transform(X_submission))

In [5]:
# Preview the first rows of the projected training and submission features.
display(X_train.head(), X_train_submission.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-1.091937,0.09079,-0.32296,0.831695,0.077457,-0.688283,-0.170313,-0.421593,0.154908,-0.275919,0.067604,-0.731313,0.331139,0.046045,0.025239
1,-13.092163,1.008584,-0.11115,0.001858,-0.0951,-0.180484,-0.109884,0.097441,-0.089223,-0.177351,-0.094368,0.065361,-0.219941,-0.20044,0.31074
2,7.910618,0.309256,-0.005693,0.128414,-0.192782,-0.453949,0.557051,-0.386116,-0.535707,0.246607,0.576931,0.063922,-0.008089,0.008852,0.898043
3,1.905043,0.974851,0.00851,0.094701,-0.22598,-0.042879,-0.101616,-0.016582,0.000954,-0.174129,-0.077267,0.09897,-0.164073,-0.038419,-0.00773
4,4.905345,0.968186,-0.066287,0.106624,-0.279883,-0.037339,-0.117816,-0.001942,-0.063309,-0.193101,-0.083077,0.113524,-0.243158,-0.216776,0.246621


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-1.093837,-0.711133,0.775635,-0.69468,-0.21518,-0.269109,0.319098,-0.344702,-0.601206,0.244786,-0.103286,-0.132742,0.080873,0.023403,-0.018968
1,17.905022,0.936036,0.01522,0.201449,-0.51395,0.043754,-0.355714,0.241018,0.247481,0.671653,0.051876,-0.009237,0.048637,-0.012716,-0.061193
2,-5.089889,0.250975,0.051589,-0.092061,-0.077864,0.119405,0.215673,1.063,-0.043208,-0.478543,-0.261806,0.023392,-0.191711,-0.044146,-0.065357
3,1.914902,-0.39602,-0.111142,-0.007676,-0.229848,-0.288241,0.022869,1.14915,0.359639,0.376486,0.446255,0.041926,0.003288,0.081948,0.882145
4,-1.093824,0.980742,-0.064238,0.085278,-0.216867,-0.080873,-0.109422,0.020886,-0.095366,-0.119093,-0.067223,0.030953,-0.076746,-0.031761,-0.00564


In [6]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Estimators to train, keyed by the label used for their predictions.
# Alternatives tried earlier and currently disabled:
#    "modelSVR": SVR(kernel="linear")
#    "decisionTree": DecisionTreeRegressor(random_state=0)
#    'randomForest': RandomForestRegressor(max_depth=3, random_state=0,n_estimators=50)
models = {"modelLinearRG": LinearRegression()}

# label -> predictions for the submission set
preds = {}

for label, model in models.items():
    print("Training", label)
    # fit() returns the fitted estimator itself, so predict() can be chained.
    preds[label] = model.fit(X_train, y).predict(X_train_submission)

Training modelLinearRG


In [10]:
# Assemble one submission table (Id, predicted price) per trained model and
# preview it.
for label, pred in preds.items():
    submission = pd.DataFrame(dict(Id=submission_id, price=pred))
    # Disabled post-processing / export steps:
    #    submission = submission.abs()
    #    submission.to_csv('submission_{}.csv'.format(label), index=False)
    display(submission.head())

Unnamed: 0,Id,price
0,974298.0,36490.136378
1,1051884.0,362796.256052
2,684464.0,57653.712738
3,1255387.0,49863.621877
4,1195520.0,188365.144035
