In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('./data/cars_train.csv')

In [3]:
test = pd.read_csv('./data/cars_test.csv')

In [4]:
def get_transmission_values(s):
    if s =='automatic':
        return 1
    elif s =='manual':
        return 0
    else:
        return np.NaN

In [5]:
def cylinders_to_string(s):
    if isinstance(s, str) and 'cylinders' in s:
        return int(s.split(' ')[0])
    else:
        return np.nan

In [6]:
y = train['price']
X = train.drop('price',axis=1)
X_test = test


In [7]:
test.head()

Unnamed: 0,Id,city,year,manufacturer,make,condition,cylinders,fuel,odometer,title_status,...,type,paint_color,lat,long,county_fips,county_name,state_fips,state_code,state_name,weather
0,974298,duluth,2006.0,ford,f-250 super duty,good,8 cylinders,gas,154400.0,clean,...,pickup,white,47.746524,-90.357742,27031.0,Cook,27.0,MN,Minnesota,43.0
1,1051884,kansascity,1987.0,chevrolet,,,,gas,,clean,...,,,38.373182,-93.776859,29083.0,Henry/Rives,29.0,MO,Missouri,52.0
2,684464,palmsprings,2010.0,jeep,liberty sport,,6 cylinders,gas,127722.0,clean,...,SUV,,33.741059,-116.356434,6065.0,Riverside,6.0,CA,California,59.0
3,1255387,sanmarcos,2003.0,chevrolet,tahoe,fair,8 cylinders,gas,,clean,...,SUV,white,30.026266,-98.133363,48209.0,Hays,48.0,TX,Texas,67.0
4,1195520,tampa,2006.0,lexus,gs 300,,,gas,,clean,...,,,27.8688,-82.7344,12103.0,Pinellas,12.0,FL,Florida,65.0


In [8]:
to_drop = ['Id','county_fips', 'county_name', 'state_fips', 
           'state_code','title_status', 'state_name', 'city', 'manufacturer', 
           'make', 'paint_color', 'drive', 'condition']

In [9]:
X.drop(to_drop, axis=1, inplace=True)

In [10]:
X['cylinders'] = X['cylinders'].apply(cylinders_to_string)
X['transmission'] = X['transmission'].apply(get_transmission_values)

In [11]:
X = pd.get_dummies(X, ['fuel', 'size', 'type'])
X.columns

Index(['year', 'cylinders', 'odometer', 'transmission', 'lat', 'long',
       'weather', 'fuel_diesel', 'fuel_electric', 'fuel_gas', 'fuel_hybrid',
       'fuel_other', 'size_compact', 'size_full-size', 'size_mid-size',
       'size_sub-compact', 'type_SUV', 'type_bus', 'type_convertible',
       'type_coupe', 'type_hatchback', 'type_mini-van', 'type_offroad',
       'type_other', 'type_pickup', 'type_sedan', 'type_truck', 'type_van',
       'type_wagon'],
      dtype='object')

In [12]:
X_test.drop(to_drop, axis=1, inplace=True)
X_test['cylinders'] = X_test['cylinders'].apply(cylinders_to_string)
X_test['transmission'] = X_test['transmission'].apply(get_transmission_values)
X_test = pd.get_dummies(X_test, ['fuel', 'size', 'type'])

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
scaler.mean_
X_transform = pd.DataFrame(scaler.transform(X))
X_test_transform = pd.DataFrame(scaler.transform(X_test))

In [14]:
X_transform
cols = [0, 1, 2, 3, 6]

In [15]:
def fillna_with_mean(df, cols):
    for col in cols:
        df[col] = df[col].fillna(0, inplace=True)

In [16]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()


#regressor.fit(X_transform, y)
X_transform.head()
X_transform = X_transform.fillna(0)
regressor.fit(X_transform, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
X_transform.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.094672,-1.27464,0.545504,0.370931,0.085072,1.006826,0.283505,-0.275509,-0.038009,0.353072,...,-0.210247,-0.150732,-0.120412,-0.054385,-0.115583,-0.27768,-0.432125,-0.290215,-0.122271,-0.12254
1,1.099313,0.0,0.0,0.370931,-0.177302,0.26504,-0.741734,-0.275509,-0.038009,0.353072,...,-0.210247,-0.150732,-0.120412,-0.054385,-0.115583,-0.27768,-0.432125,-0.290215,-0.122271,-0.12254
2,-0.658809,0.0,0.0,0.370931,0.787688,0.609267,-1.126198,-0.275509,-0.038009,0.353072,...,-0.210247,-0.150732,-0.120412,-0.054385,-0.115583,3.60127,-0.432125,-0.290215,-0.122271,-0.12254
3,-0.156488,1.195472,0.0,0.370931,-1.829894,-0.35191,1.693209,-0.275509,-0.038009,0.353072,...,-0.210247,-0.150732,-0.120412,-0.054385,-0.115583,-0.27768,-0.432125,-0.290215,-0.122271,-0.12254
4,-0.407649,0.0,0.0,0.370931,-1.609726,0.718078,1.436899,-0.275509,-0.038009,0.353072,...,-0.210247,-0.150732,-0.120412,-0.054385,-0.115583,-0.27768,-0.432125,-0.290215,-0.122271,-0.12254


In [18]:
X_test_transform = X_test_transform.fillna(0)

In [19]:
y_pred = regressor.predict(X_test_transform)

In [20]:
sub = pd.read_csv('./data/cars_sample_submission.csv.zip')

In [21]:
sub['price'] = y_pred

In [22]:
sub.to_csv('output_data/prueba8.csv', index=False)

In [24]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=1000, tol=1e-3)
sgd.fit(X_transform, y)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [25]:
y_pred_sgd = sgd.predict(X_test_transform)

In [None]:
sub['price'] = y_pred_sgd

In [None]:
sub.to_csv('output_data/prueba9.csv', index=False)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
regr = RandomForestRegressor(max_depth=5, random_state=42,
                             n_estimators=100)
regr.fit(X_transform, y)

In [None]:
y_pred_random = regr.predict(X_test_transform)

In [None]:
len(X_transform.columns)

In [None]:
sub['price'] = y_pred_random
sub.to_csv('output_data/prueba11.csv', index=False)

In [26]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

In [27]:
X, y = make_regression(n_features=29, random_state=42)
elasti = ElasticNet(random_state=42)
elasti.fit(X_transform, y) 
y_pred_elasti = elasti.predict(X_test_transform)

ValueError: Found input variables with inconsistent numbers of samples: [469992, 100]

In [None]:
sub['price'] = y_pred_elasti

In [None]:
sub.to_csv('output_data/prueba12.csv', index=False)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(X_transform, y)
y_pred_kneig = neigh.predict(X_test_transform)

In [None]:
sub['price'] = y_pred_kneig

In [None]:
sub.to_csv('output_data/prueba10.csv', index=False)