In [46]:
import pandas as pd
import seaborn as sbs
import numpy as np
import calendar
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

In [2]:
train  = pd.read_csv('tp_bike/trip_train.csv')
test = pd.read_csv('tp_bike/trip_test.csv')
trip = pd.read_csv('tp_bike/trip.csv')

In [3]:
del train['end_date']
del train['end_station_name']
del train['end_station_id']
del test['end_date']
del test['end_station_name']
del test['end_station_id']

In [4]:
train.head(3)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,187,Subscriber,94602
1,384043,636,7/28/2014 22:06,Market at 10th,67,417,Subscriber,94133
2,316176,334,6/9/2014 8:42,Market at Sansome,77,281,Subscriber,94107


In [5]:
test.head(3)

Unnamed: 0,id,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code
0,504737,10/18/2014 11:25,Embarcadero at Sansome,60,426,Customer,77009
1,530846,11/5/2014 13:00,Embarcadero at Folsom,51,454,Subscriber,94132
2,813140,6/18/2015 17:34,San Francisco Caltrain (Townsend at 4th),70,370,Subscriber,94107


In [6]:
fechaYhora = train['start_date'].str.split(' ')
fecha = fechaYhora.map(lambda x: x[0])
hora = fechaYhora.map(lambda x: x[1])
train = train.assign(date_start = fecha, time_start = hora)

In [7]:
train['date_start'] = pd.to_datetime(train['date_start'])
train['weekday'] = train['date_start'].map(lambda x: x.weekday_name)
train['hour_start'] = pd.to_datetime(train['time_start'], format = '%H:%M', errors = 'coerce').dt.hour

In [8]:
#idem para el test
fechaYhora = test['start_date'].str.split(' ')
fecha = fechaYhora.map(lambda x: x[0])
hora = fechaYhora.map(lambda x: x[1])
test = test.assign(date_start = fecha, time_start = hora)

test['date_start'] = pd.to_datetime(test['date_start'])
test['weekday'] = test['date_start'].map(lambda x: x.weekday_name)
test['hour_start'] = pd.to_datetime(test['time_start'], format = '%H:%M', errors = 'coerce').dt.hour

In [9]:
train['subscription_type'] = train.subscription_type.map({'Subscriber':1, 'Customer':0})
train['weekday'] = train.weekday.map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,\
                                      'Friday': 5, 'Saturday': 6, 'Sunday': 7})

In [10]:
test['weekday'] = test.weekday.map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7})
test['subscription_type'] = test.subscription_type.map({'Subscriber':1, 'Customer':0})

In [11]:
train['month_start'] = train['date_start'].map(lambda x: x.month)
test['month_start'] = test['date_start'].map(lambda x: x.month)

In [12]:
#cambio zip code a numerico
train.zip_code = train.zip_code.str.replace('nil','')
train.zip_code = pd.to_numeric(train.zip_code,errors='coerce')
test.zip_code = test.zip_code.str.replace('nil','')
test.zip_code = pd.to_numeric(test.zip_code,errors='coerce')


In [13]:
train.dtypes

id                             int64
duration                       int64
start_date                    object
start_station_name            object
start_station_id               int64
bike_id                        int64
subscription_type              int64
zip_code                     float64
date_start            datetime64[ns]
time_start                    object
weekday                        int64
hour_start                     int64
month_start                    int64
dtype: object

In [14]:
cols = ['id', 'start_station_id', 'bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start']
features = train[cols]
duration = train.duration

In [15]:
del train['duration']

In [16]:
test1 = test[cols]

In [17]:
features.head(3)

Unnamed: 0,id,start_station_id,bike_id,subscription_type,month_start,weekday,hour_start
0,907649,50,187,1,8,4,8
1,384043,67,417,1,7,1,22
2,316176,77,281,1,6,1,8


# Linear Regression

In [18]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train) #entreno el algoritmo

#prediccion

pred = reg.predict(x_test)

score = mean_squared_error(pred,y_test)
score

#calculo el error de mi prediccion

#error = (pred - y_test) ** 2

#otra forma mejor
# error = no.sqrt(metrics.mean_squared_error(pred, y_test))

#para que servia reg.score(x_test, y_test) 

948149745.27316415

In [19]:
prediccion1 = reg.predict(test1)

In [20]:
tripIdDuration = trip[['id','duration']]

In [21]:
target = test[['id']]

In [22]:
testTarget = pd.merge(tripIdDuration,target, how = "inner", on = "id")

In [23]:
testTarget.head(3)

Unnamed: 0,id,duration
0,4130,71
1,4498,126
2,4557,130


In [24]:
duration_real = testTarget['duration']

In [25]:
score_test = mean_squared_error(prediccion1,duration_real)
score_test

44298018.733980663

# Codigo para sacar afuera el csv para submit

In [26]:
ids = test.id

In [27]:
data = {'id': ids, 'duration': prediccion1}
submit1 = pd.DataFrame(data)

In [28]:
submit1.head(3)

Unnamed: 0,duration,id
0,4004.673128,504737
1,636.435005,530846
2,543.025317,813140


In [29]:
#submit1.to_csv('tp_bike/submit1.csv',index=False)

# Agregado de merge

In [30]:
train.head(3)

Unnamed: 0,id,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start
0,907649,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,187,1,94602.0,2015-08-27,8:36,4,8,8
1,384043,7/28/2014 22:06,Market at 10th,67,417,1,94133.0,2014-07-28,22:06,1,22,7
2,316176,6/9/2014 8:42,Market at Sansome,77,281,1,94107.0,2014-06-09,8:42,1,8,6


In [31]:
train.loc[train.zip_code.isnull(),'zip_code'] = 0

In [32]:
test.loc[test.zip_code.isnull(),'zip_code'] = 0

In [33]:
train['zip_code'].isnull()

0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
549931    False
549932    False
549933    False
549934    False
549935    False
549936    False
549937    False
549938    False
549939    False
549940    False
549941    False
549942    False
549943    False
549944    False
549945    False
549946    False
549947    False
549948    False
549949    False
549950    False
549951    False
549952    False
549953    False
549954    False
549955    False
549956    False
549957    False
549958    False
549959    False
549960    False
Name: zip_code, dtype: b

In [34]:
train.dtypes

id                             int64
start_date                    object
start_station_name            object
start_station_id               int64
bike_id                        int64
subscription_type              int64
zip_code                     float64
date_start            datetime64[ns]
time_start                    object
weekday                        int64
hour_start                     int64
month_start                    int64
dtype: object

In [35]:
cols = ['id', 'start_station_id', 'bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start','zip_code']
features = train[cols]

In [36]:
test2 = test[cols]

In [37]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train)
pred = reg.predict(test2)

In [38]:
score_test = mean_squared_error(pred,duration_real)
score_test

44891332.068091244

In [39]:
cols = ['id', 'start_station_id', 'bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start']
features = train[cols]

In [40]:
test2 = test[cols]

# SVR

In [41]:
clf = SVR(C=1.0, epsilon=0.2)
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)
clf.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [42]:
pred2 = clf.predict(test2)

In [44]:
mean_squared_error(pred2,duration_real)

43110036.489661917

# Random Forest Regressor

In [55]:
rfr = RandomForestRegressor(n_estimators = 100,
                            min_samples_leaf = 2,
                            random_state = 0)

In [56]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)
rfr.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [57]:
pred3 = rfr.predict(test2)

In [58]:
mean_squared_error(pred3,duration_real)

49135023.135949716

# Gradient Bossting Regressor

In [59]:
gbr = GradientBoostingRegressor()

In [60]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)
gbr.fit(x_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [67]:
pred4 = gbr.predict(test2)

In [68]:
mean_squared_error(pred4,duration_real)

44807225.158037588

# Ada Boost Regressor

In [63]:
abr = AdaBoostRegressor()

In [64]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)
abr.fit(x_train,y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [69]:
pred5 = abr.predict(test2)

In [70]:
mean_squared_error(pred5,duration_real)

67109665.307293624