In [3]:
import sys
import numpy as np
from sklearn import tree

# Column indices of the non-numeric fields in each data row; the rows are
# indexed with these constants and the values are later replaced by floats.
MAKER = 0  # car maker
MODEL = 1  # car model
TRANS = 6  # transmission
FUEL  = 9  # fuel type

# function: getData
def getData(fp):
    """Read a comma-separated text file into a list of rows.

    Args:
        fp: readable text file object. It is read to the end but is not
            closed here; closing it is left to the caller.

    Returns:
        A list with one entry per non-empty line, each entry being the list
        of comma-separated string fields of that line.
    """
    rows = []
    for line in fp.read().split('\n'):
        # Drop a stray carriage return left over from '\r\n' line endings.
        line = line.rstrip('\r')
        # Skip empty lines instead of blindly discarding the last element:
        # this keeps the final record when the file lacks a trailing newline
        # and avoids producing bogus [''] rows for blank lines.
        if line:
            rows.append(line.split(','))
    return rows

# function: getMaker
def getMaker(val, maker, maker_val):
    """Encode a maker name as its position in the maker list.

    val is looked up in maker and its index is returned as a float. A name
    that is not in the list gets the midpoint len(maker) / 2 instead.
    maker_val is accepted but not used.
    """
    try:
        position = maker.index(val)
    except ValueError:
        # Unknown maker: fall back to the middle of the index range.
        return len(maker) / 2.0
    return float(position)

# function: getModel
def getModel(val, model, model_val):
    """Encode a model name as its position in the model list.

    val is looked up in model and its index is returned as a float. A name
    that is not in the list gets the midpoint len(model) / 2 instead.
    model_val is accepted but not used.
    """
    try:
        position = model.index(val)
    except ValueError:
        # Unknown model: fall back to the middle of the index range.
        return len(model) / 2.0
    return float(position)
    
# function: getTransmission
def getTransmission(val, trans, trans_val):
    """Encode a transmission value as its position in the transmission list.

    val is looked up in trans and its index is returned as a float. A value
    that is not in the list gets the midpoint len(trans) / 2 instead.
    trans_val is accepted but not used.
    """
    try:
        position = trans.index(val)
    except ValueError:
        # Unknown transmission: fall back to the middle of the index range.
        return len(trans) / 2.0
    return float(position)
    
# function: getFuelType
def getFuelType(val, fuel, fuel_val):
    """Encode a fuel type as its position in the fuel list.

    val is looked up in fuel and its index is returned as a float. A value
    that is not in the list gets the midpoint len(fuel) / 2 instead.
    fuel_val is accepted but not used.
    """
    try:
        position = fuel.index(val)
    except ValueError:
        # Unknown fuel type: fall back to the middle of the index range.
        return len(fuel) / 2.0
    return float(position)
    
# function: makerTransFuelVal
def makerTransFuelVal(train_data):
    """Rank the values of the four non-numeric columns by mean price.

    For the maker, model, transmission and fuel columns, every distinct
    value found in train_data is listed together with the mean of the price
    (the last field of each row) over the rows carrying that value. Each
    list is ordered by ascending mean price.

    Args:
        train_data: list of rows; the price must be something float() accepts.

    Returns:
        The tuple (list_maker, maker_val, list_model, model_val, list_trans,
        trans_val, list_fuel, fuel_val). Each list_* holds the distinct
        values and the matching *_val holds their mean prices, index for
        index. All lists are empty when train_data is empty.
    """
    list_maker, maker_val = _rankByAvgPrice(train_data, MAKER)
    list_model, model_val = _rankByAvgPrice(train_data, MODEL)
    list_trans, trans_val = _rankByAvgPrice(train_data, TRANS)
    list_fuel, fuel_val = _rankByAvgPrice(train_data, FUEL)

    return list_maker, maker_val, list_model, model_val, list_trans, trans_val, list_fuel, fuel_val


def _rankByAvgPrice(train_data, column):
    """Return one column's distinct values and mean prices, cheapest first."""
    totals = {}
    counts = {}
    # A single pass with dict lookups; no list.index() scan per row.
    for row in train_data:
        key = row[column]
        totals[key] = totals.get(key, 0.0) + float(row[-1])
        counts[key] = counts.get(key, 0.0) + 1.0

    # sorted() is stable, so values with equal averages keep the order in
    # which they first appear in the data (dicts preserve insertion order on
    # Python 3.7+) rather than an arbitrary set iteration order.
    ranked = sorted(((key, totals[key] / counts[key]) for key in totals),
                    key=lambda pair: pair[1])
    return [key for key, _ in ranked], [avg for _, avg in ranked]

# function: preprocessing
def preprocessing(origin_data, list_maker, maker_val, list_model, model_val, list_trans, trans_val, list_fuel, fuel_val):
    """Convert raw string rows to numeric rows and split off the price.

    The rows of origin_data are modified in place: the maker, model,
    transmission and fuel columns are replaced by the float returned by the
    matching get* function, column 10 is read as the price and the row's
    last element is removed, and every other column is converted with
    float().

    Returns:
        (origin_data, target): the same list of (now numeric) rows and the
        list of prices collected from column 10.
    """
    # Column index -> (encoder, ordered value list, value list passed along).
    encoders = {
        MAKER: (getMaker, list_maker, maker_val),
        MODEL: (getModel, list_model, model_val),
        TRANS: (getTransmission, list_trans, trans_val),
        FUEL: (getFuelType, list_fuel, fuel_val),
    }
    target = []
    for row in origin_data:
        for col in range(len(row)):
            if col in encoders:
                encode, names, values = encoders[col]
                row[col] = encode(row[col], names, values)
            elif col == 10:
                # Price column: record it, then drop the row's last element.
                target.append(float(row[col]))
                row.pop()
            else:
                row[col] = float(row[col])
    return origin_data, target

# function: cov_price
def cov_price(data, price):
    """Print the correlation coefficient between each feature and the price.

    Args:
        data: rows of numeric feature values; the first ten columns are
            used, in the order given by label below.
        price: the price of each row, in the same order as data.

    One line per feature is printed; nothing is returned.
    """
    label = ['maker', 'model', 'mileage', 'manufacture_year', 'engine_displacement', 'engine_power',
             'transmission', 'door_count', 'seat_count', 'fuel_type']
    for i, name in enumerate(label):
        # Read the column from the data argument. Reading the module-level
        # train_data here would ignore the argument and fail with NameError
        # when no such global exists.
        col_i = [row[i] for row in data]
        print("Correlation coefficient", name, "price:", np.corrcoef(col_i, price)[0][1])

# function: accuracy
def accuracy(test, predict, error_range):
    """Print the fraction of predictions within error_range of the true value.

    Args:
        test: actual values.
        predict: predicted values, indexable in step with test.
        error_range: largest absolute difference that still counts as a hit.

    The fraction is printed, not returned. An empty test is reported as 0.0
    rather than raising ZeroDivisionError.
    """
    total = len(test)
    hits = sum(1 for i in range(total) if abs(test[i] - predict[i]) <= error_range)
    # Guard the division so an empty test set does not crash the report.
    accu = float(hits) / float(total) if total else 0.0
    print('There is', accu, 'that the predict price is within', error_range, 'eur of actual price\n')
        
# function: decisionTree
def decisionTree(train_data, train_target, test_data, test_target):
    """Fit a decision tree regressor and report its accuracy on the test set.

    A DecisionTreeRegressor with default settings is fitted on the training
    rows and targets; its predictions for test_data are then passed to
    accuracy() together with test_target and an error range of 1000.
    """
    regressor = tree.DecisionTreeRegressor()
    fitted = regressor.fit(train_data, train_target)
    accuracy(test_target, fitted.predict(test_data), 1000)
    
if __name__ == '__main__':
    # open both input files up front so a missing file is reported before
    # any work is done; exit with a non-zero status on failure
    try:
        fp_train = open("train.csv", "r")
    except IOError:
        print('Error: "train.csv" not found!!\n')
        sys.exit(1)
    try:
        fp_test = open("test.csv", "r")
    except IOError:
        # do not leave the already opened training file dangling
        fp_train.close()
        print('Error: "test.csv", not found!!\n')
        sys.exit(1)

    # read the raw rows; the with-block closes both files once they are loaded
    with fp_train, fp_test:
        train_data = getData(fp_train)
        test_data = getData(fp_test)

    # build the value rankings from the training rows only, then encode the
    # training and the testing rows with them
    list_maker, maker_val, list_model, model_val, list_trans, trans_val, list_fuel, fuel_val = makerTransFuelVal(train_data)
    train_data, train_target = preprocessing(train_data, list_maker, maker_val, list_model, model_val, list_trans, trans_val, list_fuel, fuel_val)
    test_data, test_target = preprocessing(test_data, list_maker, maker_val, list_model, model_val, list_trans, trans_val, list_fuel, fuel_val)

    # decision tree
    print(train_data[0])
    print(test_data[0])
    decisionTree(train_data, train_target, test_data, test_target)

    # calculate correlation coefficient between price and feature
    cov_price(train_data, train_target)
    

[22.0, 519.0, 220000.0, 2003.0, 1396.0, 66.0, 0.0, 2.0, 5.0, 3.0]
[19.0, 484.0, 218000.0, 2001.0, 973.0, 43.0, 0.0, 2.0, 5.0, 3.0]
There is 0.5757944766712424 that the predict price is within 1000 eur of actual price

Correlation coefficient maker price: 0.139980100109
Correlation coefficient model price: 0.160801532351
Correlation coefficient mileage price: -0.103459401042
Correlation coefficient manufacture_year price: 0.10861649809
Correlation coefficient engine_displacement price: 0.130506951953
Correlation coefficient engine_power price: 0.191493812909
Correlation coefficient transmission price: 0.110734227529
Correlation coefficient door_count price: -0.0551736306567
Correlation coefficient seat_count price: -0.0307518213302
Correlation coefficient fuel_type price: 0.0326543028783


In [18]:
import sys
import numpy as np
from sklearn import tree

# Column indices of the discrete (non-numeric) fields in each data row.
MAKER = 0  # car maker
MODEL = 1  # car model
TRANS = 6  # transmission
FUEL  = 9  # fuel type

# Names of the discrete features, listed in the same order as
# discrete_feature_pos.
# NOTE(review): this list is not referenced anywhere in the visible code.
discrete_feature_name = ['maker', 'model', 'trans', 'fuel']
# Column indices of the discrete features; passed to the feature-building and
# preprocessing functions to select the columns that need encoding.
discrete_feature_pos = [ MAKER, MODEL, TRANS, FUEL ]

# function: getData
def getData(fp):
    """Read a comma-separated text file into a list of rows.

    Args:
        fp: readable text file object. It is read to the end but is not
            closed here; closing it is left to the caller.

    Returns:
        A list with one entry per non-empty line, each entry being the list
        of comma-separated string fields of that line.
    """
    rows = []
    for line in fp.read().split('\n'):
        # Drop a stray carriage return left over from '\r\n' line endings.
        line = line.rstrip('\r')
        # Skip empty lines instead of blindly discarding the last element:
        # this keeps the final record when the file lacks a trailing newline
        # and avoids producing bogus [''] rows for blank lines.
        if line:
            rows.append(line.split(','))
    return rows

# function: getListFeature
def getListFeature(train_data, features):
    """Collect the distinct values found in each of the given columns.

    Args:
        train_data: list of rows (indexable sequences of hashable values).
        features: column indices to inspect.

    Returns:
        One list per entry of features, holding that column's distinct
        values in order of first appearance in train_data.
    """
    # dict.fromkeys removes duplicates like set() but keeps first-seen order
    # (Python 3.7+), so the result does not change between runs with string
    # hash randomisation.
    return [list(dict.fromkeys(row[i] for row in train_data)) for i in features]

# function: getFeatureAvgPrice
def getFeatureAvgPrice(train_data, list_feature, features):
    """Compute the mean price for every distinct value of each feature.

    Args:
        train_data: rows whose last field is the price (anything float()
            accepts).
        list_feature: list_feature[k] lists the distinct values of column
            features[k].
        features: column indices, parallel to list_feature.

    Returns:
        avg_feature, where avg_feature[k][j] is the mean price of the rows
        whose column features[k] equals list_feature[k][j].

    Raises:
        ValueError: if a row holds a value missing from its list_feature
            entry.
        ZeroDivisionError: if a listed value never occurs in train_data.
    """
    # value -> slot tables replace a list.index() scan per row and column;
    # setdefault keeps the first slot of a repeated value, like list.index().
    slot_of = []
    for values in list_feature:
        table = {}
        for slot, value in enumerate(values):
            table.setdefault(value, slot)
        slot_of.append(table)

    num_feature = [[0.0] * len(values) for values in list_feature]
    sum_feature = [[0.0] * len(values) for values in list_feature]

    for data in train_data:
        price = float(data[-1])
        # Pair every column with its own position in `features`, so the
        # function depends only on its arguments and not on the module-level
        # discrete_feature_pos list.
        for k, column in enumerate(features):
            try:
                slot = slot_of[k][data[column]]
            except KeyError:
                raise ValueError('%r is not in list' % (data[column],)) from None
            num_feature[k][slot] += 1.0
            sum_feature[k][slot] += price

    return [[total / count for total, count in zip(totals, counts)]
            for totals, counts in zip(sum_feature, num_feature)]

# function: sortFeatureAvgPrice
def sortFeatureAvgPrice(avg):
    """Pair each average with its index and sort the pairs by average.

    For every inner list of avg, a list of [index, value] pairs is built and
    sorted by ascending value; pairs with equal values keep their index
    order. The result has one such sorted list per inner list of avg.
    """
    return [
        sorted(([idx, price] for idx, price in enumerate(prices)),
               key=lambda pair: pair[1])
        for prices in avg
    ]
    
# function: buildDiscreteFeatureVal
def buildDiscreteFeatureVal(train_data):
    """Rank the values of every discrete feature by their average price.

    The distinct values of the columns in discrete_feature_pos are
    collected, the average price per value is computed, and the values are
    reordered by ascending average.

    Returns:
        (list_feature, val_feature): per discrete feature, the values in
        ranked order and the matching average prices.
    """
    # distinct values per discrete column
    categories = getListFeature(train_data, discrete_feature_pos)

    # average price per distinct value, then [index, average] pairs sorted
    # by average
    averages = getFeatureAvgPrice(train_data, categories, discrete_feature_pos)
    ranked = sortFeatureAvgPrice(averages)

    list_feature = []
    val_feature = []
    for names, pairs in zip(categories, ranked):
        # each pair is [index into names, average price]
        list_feature.append([names[idx] for idx, _ in pairs])
        val_feature.append([price for _, price in pairs])

    return list_feature, val_feature

# function: getDiscreteFeatureVal
def getDiscreteFeatureVal(val, list_feature, val_feature):
    """Encode a discrete value as its position in list_feature.

    The index of val in list_feature is returned as a float. A value that is
    not in the list gets the midpoint len(list_feature) / 2 instead.
    val_feature is accepted but not used.
    """
    try:
        position = list_feature.index(val)
    except ValueError:
        # Unknown value: fall back to the middle of the index range.
        return len(list_feature) / 2.0
    return float(position)

# function: preprocessing
def preprocessing(origin_data, list_feature, val_feature, discrete_features):
    """Convert raw string rows to numeric rows and split off the price.

    The rows of origin_data are modified in place. Columns listed in
    discrete_features are replaced by the float from getDiscreteFeatureVal;
    a last column that is not discrete is taken as the price and removed
    from the row; every other column is converted with float().

    Returns:
        (origin_data, target): the same list of (now numeric) rows and the
        list of prices that were split off.
    """
    target = []
    for row in origin_data:
        # The row only shrinks on its final column, so this stays valid for
        # the whole inner loop.
        last = len(row) - 1
        for col in range(len(row)):
            if col in discrete_features:
                slot = discrete_features.index(col)
                row[col] = getDiscreteFeatureVal(row[col], list_feature[slot], val_feature[slot])
                continue
            if col != last:
                row[col] = float(row[col])
                continue
            # Non-discrete last column: record the price, then drop it.
            target.append(float(row[col]))
            row.pop()
    return origin_data, target

# function: cov_price
def cov_price(data, price):
    """Print the correlation coefficient between each feature and the price.

    Args:
        data: rows of numeric feature values; the first ten columns are
            used, in the order given by label below.
        price: the price of each row, in the same order as data.

    One line per feature is printed; nothing is returned.
    """
    label = ['maker', 'model', 'mileage', 'manufacture_year', 'engine_displacement', 'engine_power',
             'transmission', 'door_count', 'seat_count', 'fuel_type']
    for i, name in enumerate(label):
        # Read the column from the data argument. Reading the module-level
        # train_data here would ignore the argument and fail with NameError
        # when no such global exists.
        col_i = [row[i] for row in data]
        print("Correlation coefficient", name, "price:", np.corrcoef(col_i, price)[0][1])

# function: accuracy
def accuracy(test, predict, error_range):
    """Print the fraction of predictions within error_range of the true value.

    Args:
        test: actual values.
        predict: predicted values, indexable in step with test.
        error_range: largest absolute difference that still counts as a hit.

    The fraction is printed, not returned. An empty test is reported as 0.0
    rather than raising ZeroDivisionError.
    """
    total = len(test)
    hits = sum(1 for i in range(total) if abs(test[i] - predict[i]) <= error_range)
    # Guard the division so an empty test set does not crash the report.
    accu = float(hits) / float(total) if total else 0.0
    print('There is', accu, 'that the predict price is within', error_range, 'eur of actual price\n')
        
# function: decisionTree
def decisionTree(train_data, train_target, test_data, test_target):
    """Fit a decision tree regressor and report its accuracy on the test set.

    A DecisionTreeRegressor with default settings is fitted on the training
    rows and targets; its predictions for test_data are then passed to
    accuracy() together with test_target and an error range of 1000.
    """
    regressor = tree.DecisionTreeRegressor()
    fitted = regressor.fit(train_data, train_target)
    accuracy(test_target, fitted.predict(test_data), 1000)
    
if __name__ == '__main__':
    # open both input files up front so a missing file is reported before
    # any work is done; exit with a non-zero status on failure
    try:
        fp_train = open("train.csv", "r")
    except IOError:
        print('Error: "train.csv" not found!!\n')
        sys.exit(1)
    try:
        fp_test = open("test.csv", "r")
    except IOError:
        # do not leave the already opened training file dangling
        fp_train.close()
        print('Error: "test.csv", not found!!\n')
        sys.exit(1)

    # read the raw rows; the with-block closes both files once they are loaded
    with fp_train, fp_test:
        train_data = getData(fp_train)
        test_data = getData(fp_test)

    # build the value rankings from the training rows only, then encode the
    # training and the testing rows with them
    list_feature, val_feature = buildDiscreteFeatureVal(train_data)
    train_data, train_target = preprocessing(train_data, list_feature, val_feature, discrete_feature_pos)
    test_data, test_target = preprocessing(test_data, list_feature, val_feature, discrete_feature_pos)

    # decision tree
    decisionTree(train_data, train_target, test_data, test_target)

    # calculate correlation coefficient between price and feature
    cov_price(train_data, train_target)

There is 0.5763479903762417 that the predict price is within 1000 eur of actual price

Correlation coefficient maker price: 0.139980036187
Correlation coefficient model price: 0.160782252636
Correlation coefficient mileage price: -0.103459401042
Correlation coefficient manufacture_year price: 0.10861649809
Correlation coefficient engine_displacement price: 0.130506951953
Correlation coefficient engine_power price: 0.191493812909
Correlation coefficient transmission price: 0.110734227529
Correlation coefficient door_count price: -0.0551736306567
Correlation coefficient seat_count price: -0.0307518213302
Correlation coefficient fuel_type price: 0.0326543028783
