In [1]:
import pandas as pd
import seaborn as sbs
import numpy as np
import calendar
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics

In [2]:
train  = pd.read_csv('tp_bike/trip_train.csv')
test = pd.read_csv('tp_bike/trip_test.csv')
trip = pd.read_csv('tp_bike/trip.csv')
weather = pd.read_csv('tp_bike/weather.csv')
station = pd.read_csv('tp_bike/station.csv')

In [3]:
del train['end_date']
del train['end_station_name']
del train['end_station_id']
del test['end_date']
del test['end_station_name']
del test['end_station_id']

In [4]:
train.head(3)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,187,Subscriber,94602
1,384043,636,7/28/2014 22:06,Market at 10th,67,417,Subscriber,94133
2,316176,334,6/9/2014 8:42,Market at Sansome,77,281,Subscriber,94107


In [5]:
test.head(3)

Unnamed: 0,id,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code
0,504737,10/18/2014 11:25,Embarcadero at Sansome,60,426,Customer,77009
1,530846,11/5/2014 13:00,Embarcadero at Folsom,51,454,Subscriber,94132
2,813140,6/18/2015 17:34,San Francisco Caltrain (Townsend at 4th),70,370,Subscriber,94107


In [6]:
fechaYhora = train['start_date'].str.split(' ')
fecha = fechaYhora.map(lambda x: x[0])
hora = fechaYhora.map(lambda x: x[1])
train = train.assign(date_start = fecha, time_start = hora)

In [7]:
train['date_start'] = pd.to_datetime(train['date_start'])
train['weekday'] = train['date_start'].map(lambda x: x.weekday_name)
train['hour_start'] = pd.to_datetime(train['time_start'], format = '%H:%M', errors = 'coerce').dt.hour

In [8]:
#idem para el test
fechaYhora = test['start_date'].str.split(' ')
fecha = fechaYhora.map(lambda x: x[0])
hora = fechaYhora.map(lambda x: x[1])
test = test.assign(date_start = fecha, time_start = hora)

test['date_start'] = pd.to_datetime(test['date_start'])
test['weekday'] = test['date_start'].map(lambda x: x.weekday_name)
test['hour_start'] = pd.to_datetime(test['time_start'], format = '%H:%M', errors = 'coerce').dt.hour

In [9]:
train['subscription_type'] = train.subscription_type.map({'Subscriber':1, 'Customer':0})
train['weekday'] = train.weekday.map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,\
                                      'Friday': 5, 'Saturday': 6, 'Sunday': 7})

In [10]:
test['weekday'] = test.weekday.map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7})
test['subscription_type'] = test.subscription_type.map({'Subscriber':1, 'Customer':0})

In [11]:
train['month_start'] = train['date_start'].map(lambda x: x.month)
test['month_start'] = test['date_start'].map(lambda x: x.month)

In [12]:
#cambio zip code a numerico
train.zip_code = train.zip_code.str.replace('nil','')
train.zip_code = pd.to_numeric(train.zip_code,errors='coerce')
test.zip_code = test.zip_code.str.replace('nil','')
test.zip_code = pd.to_numeric(test.zip_code,errors='coerce')


In [13]:
train.dtypes

id                             int64
duration                       int64
start_date                    object
start_station_name            object
start_station_id               int64
bike_id                        int64
subscription_type              int64
zip_code                     float64
date_start            datetime64[ns]
time_start                    object
weekday                        int64
hour_start                     int64
month_start                    int64
dtype: object

In [14]:
cols = ['id', 'start_station_id', 'bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start']
features = train[cols]
duration = train.duration

In [15]:
test1 = test[cols]

In [16]:
features.head(3)

Unnamed: 0,id,start_station_id,bike_id,subscription_type,month_start,weekday,hour_start
0,907649,50,187,1,8,4,8
1,384043,67,417,1,7,1,22
2,316176,77,281,1,6,1,8


# Linear Regression

In [17]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train) #entreno el algoritmo

#prediccion

pred = reg.predict(x_test)

score = mean_squared_error(pred,y_test)
score

#calculo el error de mi prediccion

#error = (pred - y_test) ** 2

#otra forma mejor
# error = no.sqrt(metrics.mean_squared_error(pred, y_test))

#para que servia reg.score(x_test, y_test) 

948149745.27316415

In [18]:
prediccion1 = reg.predict(test1)

In [19]:
tripIdDuration = trip[['id','duration']]

In [20]:
tripIdDuration

Unnamed: 0,id,duration
0,4576,63
1,4607,70
2,4130,71
3,4251,77
4,4299,83
5,4927,103
6,4500,109
7,4563,111
8,4760,113
9,4258,114


In [21]:
target = pd.DataFrame(test['id'])

In [22]:
test['id']

0         504737
1         530846
2         813140
3         897674
4         322830
5         487841
6         677808
7         704449
8         833587
9         420411
10        354777
11         17809
12        854667
13        673325
14        729900
15          4485
16        349394
17        784460
18        908431
19        805578
20        609571
21        269385
22        227320
23        658277
24        205479
25        429936
26        176953
27         64226
28        489619
29        831189
           ...  
119968     60831
119969    875546
119970    126578
119971    244611
119972    110019
119973    114911
119974    649284
119975    719648
119976    536952
119977    579153
119978    619244
119979    277802
119980    572524
119981    273279
119982    438379
119983    322903
119984    226880
119985     34690
119986    581506
119987    365644
119988     18373
119989    646921
119990    835970
119991    100540
119992    503083
119993    274799
119994    464786
119995    1728

In [23]:
la = pd.merge(tripIdDuration,target, how = "right",on='id')

In [24]:
la

Unnamed: 0,id,duration
0,4130,71
1,4498,126
2,4557,130
3,4386,134
4,4242,141
5,4550,163
6,4917,169
7,4841,197
8,4704,204
9,4824,207


In [25]:
testTarget = pd.merge(tripIdDuration,target, how = "right",on='id')

In [26]:
testTarget.head(3)

Unnamed: 0,id,duration
0,4130,71
1,4498,126
2,4557,130


In [27]:
duration_real = testTarget['duration']

In [28]:
score_test = mean_squared_error(prediccion1,duration_real)
score_test

44298018.733980663

# Codigo para sacar afuera el csv para submit

In [29]:
ids = test.id

In [30]:
data = {'id': ids, 'duration': prediccion1}
submit1 = pd.DataFrame(data)

In [31]:
submit1.head(3)

Unnamed: 0,duration,id
0,4004.673128,504737
1,636.435005,530846
2,543.025317,813140


In [32]:
#submit1.to_csv('tp_bike/submit1.csv',index=False)

# Agregado de merge

In [33]:
train.head(3)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start
0,907649,396,8/27/2015 8:36,Harry Bridges Plaza (Ferry Building),50,187,1,94602.0,2015-08-27,8:36,4,8,8
1,384043,636,7/28/2014 22:06,Market at 10th,67,417,1,94133.0,2014-07-28,22:06,1,22,7
2,316176,334,6/9/2014 8:42,Market at Sansome,77,281,1,94107.0,2014-06-09,8:42,1,8,6


In [34]:
train.loc[train.zip_code.isnull(),'zip_code'] = 0

In [35]:
test.loc[test.zip_code.isnull(),'zip_code'] = 0

In [36]:
train['zip_code'].isnull()

0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
549931    False
549932    False
549933    False
549934    False
549935    False
549936    False
549937    False
549938    False
549939    False
549940    False
549941    False
549942    False
549943    False
549944    False
549945    False
549946    False
549947    False
549948    False
549949    False
549950    False
549951    False
549952    False
549953    False
549954    False
549955    False
549956    False
549957    False
549958    False
549959    False
549960    False
Name: zip_code, dtype: b

In [37]:
train.dtypes

id                             int64
duration                       int64
start_date                    object
start_station_name            object
start_station_id               int64
bike_id                        int64
subscription_type              int64
zip_code                     float64
date_start            datetime64[ns]
time_start                    object
weekday                        int64
hour_start                     int64
month_start                    int64
dtype: object

In [38]:
cols = ['id', 'start_station_id', 'bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start','zip_code']
features = train[cols]

In [39]:
test2 = test[cols]

In [40]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train)
pred = reg.predict(test2)

In [41]:
score_test = mean_squared_error(pred,duration_real)
score_test

44891332.068091244

In [42]:
cols = ['id','start_station_id','bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start']
features = train[cols]

In [43]:
test2 = test[cols]

In [44]:
one_hot_encoding = pd.get_dummies(features['weekday'])
features = features.drop('weekday',axis=1)
features = features.join(one_hot_encoding)
features

Unnamed: 0,id,start_station_id,bike_id,subscription_type,month_start,hour_start,1,2,3,4,5,6,7
0,907649,50,187,1,8,8,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,384043,67,417,1,7,22,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,316176,77,281,1,6,8,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,618874,69,634,1,1,16,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,910977,67,607,1,8,15,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,522083,65,370,1,10,7,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,880809,64,443,1,8,17,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,488938,71,485,1,10,14,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,899522,62,603,1,8,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,737380,76,86,0,4,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [45]:
one_hot_encoding = pd.get_dummies(test2['weekday'])
test2 = test2.drop('weekday',axis=1)
test2 = test2.join(one_hot_encoding)
test2

Unnamed: 0,id,start_station_id,bike_id,subscription_type,month_start,hour_start,1,2,3,4,5,6,7
0,504737,60,426,0,10,11,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,530846,51,454,1,11,13,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,813140,70,370,1,6,17,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,897674,72,451,1,8,7,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,322830,69,603,1,6,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,487841,61,478,1,10,21,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,677808,67,505,1,3,18,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,704449,48,356,1,3,17,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,833587,67,401,1,7,11,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,420411,48,363,1,8,13,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [46]:
x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train)
pred = reg.predict(test2)

In [47]:
score_test = mean_squared_error(pred,duration_real)
score_test

44307558.49363292

In [48]:
predTrain = reg.predict(x_test)

In [49]:
mean_squared_error(predTrain,y_test)

948136066.56417203

# Mergeo

In [50]:
#Creo el dataframe con los zip code de cada cuidad para despues mergear
dato = { 'city' : ['San Jose','Palo Alto','San Francisco','Redwood City','Mountain View'],\
       'zip_code': [95113,94301,94107,94063,94041]}
cityZipCode = pd.DataFrame(dato)
cityZipCode

Unnamed: 0,city,zip_code
0,San Jose,95113
1,Palo Alto,94301
2,San Francisco,94107
3,Redwood City,94063
4,Mountain View,94041


In [51]:
train.shape

(549961, 13)

In [52]:
weather.shape

(3665, 24)

In [53]:
#Mergeo asi consigo el zip
stationZip = pd.merge(station,cityZipCode,on='city',how='outer')

In [54]:
stationZip.head(3)

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date,zip_code
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013,95113
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013,95113
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013,95113


In [55]:
stationZip['zip_code'].value_counts()

94107    35
95113    16
94063     7
94041     7
94301     5
Name: zip_code, dtype: int64

In [56]:
weather['date'] = pd.to_datetime(weather['date'])

In [57]:
# Mergeo con weather asi consigo la ciudad ademas del zip code
weatherCity = pd.merge(weather,cityZipCode,on='zip_code',how='inner')
weatherCity.head(3)

Unnamed: 0,date,max_temperature_f,mean_temperature_f,min_temperature_f,max_dew_point_f,mean_dew_point_f,min_dew_point_f,max_humidity,mean_humidity,min_humidity,...,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees,zip_code,city
0,2013-08-29,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,57.0,...,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107,San Francisco
1,2013-08-30,78.0,69.0,60.0,61.0,58.0,56.0,90.0,70.0,50.0,...,7.0,29.0,13.0,35.0,0,2.0,,291.0,94107,San Francisco
2,2013-08-31,71.0,64.0,57.0,57.0,56.0,54.0,93.0,75.0,57.0,...,10.0,26.0,15.0,31.0,0,4.0,,284.0,94107,San Francisco


In [58]:
weatherCity = weatherCity[['date','cloud_cover','events','city','zip_code','wind_dir_degrees']]

In [59]:
weatherCity['zip_code'].value_counts()

95113    733
94301    733
94107    733
94063    733
94041    733
Name: zip_code, dtype: int64

In [60]:
weatherCity.shape

(3665, 6)

In [61]:
stationZip.shape

(70, 8)

In [62]:
train.shape

(549961, 13)

In [63]:
stationZip

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date,zip_code
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013,95113
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013,95113
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013,95113
3,5,Adobe on Almaden,37.331415,-121.893200,19,San Jose,8/5/2013,95113
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013,95113
5,7,Paseo de San Antonio,37.333798,-121.886943,15,San Jose,8/7/2013,95113
6,8,San Salvador at 1st,37.330165,-121.885831,15,San Jose,8/5/2013,95113
7,9,Japantown,37.348742,-121.894715,15,San Jose,8/5/2013,95113
8,10,San Jose City Hall,37.337391,-121.886995,15,San Jose,8/6/2013,95113
9,11,MLK Library,37.335885,-121.885660,19,San Jose,8/6/2013,95113


In [64]:
stationZip = stationZip.rename( columns = { 'name'  : 'start_station_name' } )

In [65]:
stationZip = stationZip[['start_station_name','city']]

In [66]:
stationZip.head(3)

Unnamed: 0,start_station_name,city
0,San Jose Diridon Caltrain Station,San Jose
1,San Jose Civic Center,San Jose
2,Santa Clara at Almaden,San Jose


In [67]:
train2 = pd.merge(stationZip,train,on='start_station_name',how='inner')

In [68]:
testStation = pd.merge(stationZip,test,on='start_station_name',how='right')

In [69]:
testStation.shape

(119998, 13)

In [70]:
train2.shape

(540635, 14)

In [71]:
train2.head(3)

Unnamed: 0,start_station_name,city,id,duration,start_date,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start
0,San Jose Diridon Caltrain Station,San Jose,109338,372,11/27/2013 7:46,2,20,1,95377.0,2013-11-27,7:46,3,7,11
1,San Jose Diridon Caltrain Station,San Jose,740113,2408,4/24/2015 16:34,2,138,1,95113.0,2015-04-24,16:34,5,16,4
2,San Jose Diridon Caltrain Station,San Jose,61560,420,10/17/2013 8:46,2,657,1,94002.0,2013-10-17,8:46,4,8,10


In [72]:
weatherCity.head(3)

Unnamed: 0,date,cloud_cover,events,city,zip_code,wind_dir_degrees
0,2013-08-29,4.0,,San Francisco,94107,286.0
1,2013-08-30,2.0,,San Francisco,94107,291.0
2,2013-08-31,4.0,,San Francisco,94107,284.0


In [73]:
weatherCity = weatherCity.rename( columns = { 'date'  : 'date_start' } )

In [74]:
weatherCity.loc[weatherCity.events == 'rain', 'events'] = "Rain"
weatherCity.loc[weatherCity.events.isnull(), 'events'] = "Normal"

In [75]:
weatherCity['events'] = weatherCity.events.map({'Normal': 1, 'Rain': 2, 'Fog': 3, 'Fog-Rain': 4,\
                                      'Rain-Thunderstorm': 5})

In [76]:
weatherCity.dtypes

date_start          datetime64[ns]
cloud_cover                float64
events                       int64
city                        object
zip_code                     int64
wind_dir_degrees           float64
dtype: object

In [77]:
weatherCity['cloud_cover'].value_counts()

0.0    834
1.0    490
2.0    478
3.0    474
4.0    456
5.0    400
6.0    283
7.0    180
8.0     69
Name: cloud_cover, dtype: int64

In [78]:
weatherCity['wind_dir_degrees'].value_counts()

350.0    47
299.0    46
349.0    45
291.0    43
346.0    42
337.0    42
343.0    41
340.0    38
303.0    38
297.0    38
293.0    36
298.0    35
296.0    35
302.0    35
294.0    35
311.0    35
338.0    34
352.0    34
305.0    34
276.0    34
300.0    33
341.0    33
328.0    32
288.0    32
339.0    32
295.0    32
353.0    31
342.0    31
304.0    30
334.0    30
         ..
109.0     1
117.0     1
120.0     1
169.0     1
172.0     1
177.0     1
181.0     1
182.0     1
183.0     1
195.0     1
221.0     1
99.0      1
96.0      1
95.0      1
69.0      1
53.0      1
55.0      1
59.0      1
60.0      1
61.0      1
67.0      1
71.0      1
94.0      1
72.0      1
79.0      1
80.0      1
84.0      1
87.0      1
92.0      1
0.0       1
Name: wind_dir_degrees, dtype: int64

In [79]:
weatherCity.head(3)

Unnamed: 0,date_start,cloud_cover,events,city,zip_code,wind_dir_degrees
0,2013-08-29,4.0,1,San Francisco,94107,286.0
1,2013-08-30,2.0,1,San Francisco,94107,291.0
2,2013-08-31,4.0,1,San Francisco,94107,284.0


In [80]:
trainWeather = pd.merge(train2,weatherCity,on=['date_start','city'],how='inner')

In [81]:
testWeather = pd.merge(testStation,weatherCity,on=['date_start','city'],how='left')

In [82]:
testWeather.shape

(119998, 17)

In [83]:
trainWeather.shape

(540635, 18)

In [84]:
trainWeather.head(3)

Unnamed: 0,start_station_name,city,id,duration,start_date,start_station_id,bike_id,subscription_type,zip_code_x,date_start,time_start,weekday,hour_start,month_start,cloud_cover,events,zip_code_y,wind_dir_degrees
0,San Jose Diridon Caltrain Station,San Jose,109338,372,11/27/2013 7:46,2,20,1,95377.0,2013-11-27,7:46,3,7,11,4.0,1,95113,284.0
1,San Jose Diridon Caltrain Station,San Jose,109583,521,11/27/2013 10:23,2,239,1,94404.0,2013-11-27,10:23,3,10,11,4.0,1,95113,284.0
2,San Jose Diridon Caltrain Station,San Jose,109692,440,11/27/2013 12:46,2,235,1,95126.0,2013-11-27,12:46,3,12,11,4.0,1,95113,284.0


In [85]:
testWeather.head(3)

Unnamed: 0,start_station_name,city,id,start_date,start_station_id,bike_id,subscription_type,zip_code_x,date_start,time_start,weekday,hour_start,month_start,cloud_cover,events,zip_code_y,wind_dir_degrees
0,San Jose Diridon Caltrain Station,San Jose,795571,6/5/2015 10:04,2,152,1,94103.0,2015-06-05,10:04,5,10,6,2.0,1.0,95113.0,233.0
1,San Jose Diridon Caltrain Station,San Jose,895359,8/18/2015 17:37,2,250,1,97214.0,2015-08-18,17:37,2,17,8,2.0,1.0,95113.0,283.0
2,San Jose Diridon Caltrain Station,San Jose,58423,10/15/2013 8:16,2,234,1,94103.0,2013-10-15,8:16,2,8,10,0.0,1.0,95113.0,292.0


In [86]:
trainWeather = trainWeather.rename( columns = { 'zip_code_x'  : 'zip_code', 'zip_code_y' : 'zip_code_city' } )
testWeather = testWeather.rename( columns = { 'zip_code_x'  : 'zip_code', 'zip_code_y' : 'zip_code_city' } )

In [87]:
trainWeather.head(3)

Unnamed: 0,start_station_name,city,id,duration,start_date,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start,cloud_cover,events,zip_code_city,wind_dir_degrees
0,San Jose Diridon Caltrain Station,San Jose,109338,372,11/27/2013 7:46,2,20,1,95377.0,2013-11-27,7:46,3,7,11,4.0,1,95113,284.0
1,San Jose Diridon Caltrain Station,San Jose,109583,521,11/27/2013 10:23,2,239,1,94404.0,2013-11-27,10:23,3,10,11,4.0,1,95113,284.0
2,San Jose Diridon Caltrain Station,San Jose,109692,440,11/27/2013 12:46,2,235,1,95126.0,2013-11-27,12:46,3,12,11,4.0,1,95113,284.0


In [88]:
testWeather.head(3)

Unnamed: 0,start_station_name,city,id,start_date,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start,cloud_cover,events,zip_code_city,wind_dir_degrees
0,San Jose Diridon Caltrain Station,San Jose,795571,6/5/2015 10:04,2,152,1,94103.0,2015-06-05,10:04,5,10,6,2.0,1.0,95113.0,233.0
1,San Jose Diridon Caltrain Station,San Jose,895359,8/18/2015 17:37,2,250,1,97214.0,2015-08-18,17:37,2,17,8,2.0,1.0,95113.0,283.0
2,San Jose Diridon Caltrain Station,San Jose,58423,10/15/2013 8:16,2,234,1,94103.0,2013-10-15,8:16,2,8,10,0.0,1.0,95113.0,292.0


In [89]:
trainWeather.loc[trainWeather.cloud_cover.isnull(), 'cloud_cover'] = 0

In [90]:
trainWeather['cloud_cover'].value_counts()

5.0    91022
4.0    80924
1.0    68629
6.0    66757
3.0    61471
0.0    55946
2.0    55795
7.0    43923
8.0    16168
Name: cloud_cover, dtype: int64

In [91]:
trainWeather.dtypes

start_station_name            object
city                          object
id                             int64
duration                       int64
start_date                    object
start_station_id               int64
bike_id                        int64
subscription_type              int64
zip_code                     float64
date_start            datetime64[ns]
time_start                    object
weekday                        int64
hour_start                     int64
month_start                    int64
cloud_cover                  float64
events                         int64
zip_code_city                  int64
wind_dir_degrees             float64
dtype: object

In [92]:
#trainWeather['cloud_cover'] = trainWeather['cloud_cover']

In [93]:
test.shape

(119998, 12)

In [94]:
trainWeather.shape

(540635, 18)

In [95]:
stationZip.shape

(70, 2)

In [96]:
#trainCity = pd.merge(train,station,on='')

In [97]:
# train.groupby('start_station_name').count().sort_values('id')

In [98]:
# station.groupby('name').count()

In [99]:
# weather['zip_code'].value_counts()

In [100]:
# train.head(3)

In [101]:
trainWeather = trainWeather.fillna(0)
testWeather = testWeather.fillna(0)

# Testeo Machine Learning con otras columnas, weather, etc

In [102]:
testWeather.head(3)

Unnamed: 0,start_station_name,city,id,start_date,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start,cloud_cover,events,zip_code_city,wind_dir_degrees
0,San Jose Diridon Caltrain Station,San Jose,795571,6/5/2015 10:04,2,152,1,94103.0,2015-06-05,10:04,5,10,6,2.0,1.0,95113.0,233.0
1,San Jose Diridon Caltrain Station,San Jose,895359,8/18/2015 17:37,2,250,1,97214.0,2015-08-18,17:37,2,17,8,2.0,1.0,95113.0,283.0
2,San Jose Diridon Caltrain Station,San Jose,58423,10/15/2013 8:16,2,234,1,94103.0,2013-10-15,8:16,2,8,10,0.0,1.0,95113.0,292.0


In [103]:
testTarget.head(3)

Unnamed: 0,id,duration
0,4130,71
1,4498,126
2,4557,130


##  Fin del mergeo, Ahora se usara trainWeather y testWeather

In [104]:
cols = ['id','start_station_id','bike_id', 'subscription_type', 'month_start', 'weekday', 'hour_start','events',\
       'wind_dir_degrees','cloud_cover']
features = trainWeather[cols]
target = testWeather[cols]

In [105]:
target.head(3)

Unnamed: 0,id,start_station_id,bike_id,subscription_type,month_start,weekday,hour_start,events,wind_dir_degrees,cloud_cover
0,795571,2,152,1,6,5,10,1.0,233.0,2.0
1,895359,2,250,1,8,2,17,1.0,283.0,2.0
2,58423,2,234,1,10,2,8,1.0,292.0,0.0


In [106]:
durationMerge = trainWeather['duration']

In [107]:
features.head(3)

Unnamed: 0,id,start_station_id,bike_id,subscription_type,month_start,weekday,hour_start,events,wind_dir_degrees,cloud_cover
0,109338,2,20,1,11,3,7,1,284.0,4.0
1,109583,2,239,1,11,3,10,1,284.0,4.0
2,109692,2,235,1,11,3,12,1,284.0,4.0


In [108]:
features.dtypes

id                     int64
start_station_id       int64
bike_id                int64
subscription_type      int64
month_start            int64
weekday                int64
hour_start             int64
events                 int64
wind_dir_degrees     float64
cloud_cover          float64
dtype: object

In [109]:
x_train, x_test, y_train, y_test = train_test_split(features, durationMerge, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train)
pred = reg.predict(target)
score_test = mean_squared_error(pred,duration_real)
score_test

44861996.875287145

# Testeo sobre el mismo train

In [110]:
predTrain = reg.predict(x_test)

In [111]:
mean_squared_error(predTrain,y_test)

57545975.760059536

In [112]:
ids = testWeather['id']
data = {'id': ids, 'duration': pred}
submit7 = pd.DataFrame(data)

In [113]:
submit7.head(3)

Unnamed: 0,duration,id
0,1129.587143,795571
1,940.223859,895359
2,581.191302,58423


In [114]:
resul = pd.merge(submit7,testTarget,on='id')

In [115]:
mean_squared_error(resul['duration_x'],resul['duration_y'])

41724027.64296864

In [116]:
resul

Unnamed: 0,duration_x,id,duration_y
0,1129.587143,795571,262
1,940.223859,895359,815
2,581.191302,58423,376
3,678.516739,526102,371
4,4102.572118,22836,408
5,758.731726,835944,604
6,1138.168902,73600,850
7,382.899683,31133,541
8,751.649733,377827,327
9,1008.059347,380951,610


In [117]:
submit7.to_csv('tp_bike/submit7.csv',index=False)

In [118]:
def KBest(data,y,numK):
    columns = data.columns
    selector = SelectKBest(f_regression, k=numK)
    selector.fit_transform(data, y)
    labels = [columns[x] for x in selector.get_support(indices=True) if x]
    return pd.DataFrame(selector.fit_transform(data,y), columns=labels)

In [119]:
def fromModel(modelo,data):
    columns = data.columns
    model = SelectFromModel(modelo, prefit=True)
    labels = [columns[x] for x in model.get_support(indices=True) if x]
    X_new = model.transform(data)
    return pd.DataFrame(model.transform(data), columns=labels)

In [120]:
#def cvScore(modelo):
#    scores = cross_val_score(modelo, X_train, y_train, cv=15, n_jobs=1, scoring = 'mean_squared_error')
#    print (scores)

#  Aplico KBest para que me de las columnas "relevantes"

In [121]:
KBest(features,durationMerge,5)

Unnamed: 0,start_station_id,subscription_type,month_start,weekday,events
0,2.0,1.0,11.0,3.0,1.0
1,2.0,1.0,11.0,3.0,1.0
2,2.0,1.0,11.0,3.0,1.0
3,2.0,1.0,11.0,3.0,1.0
4,3.0,1.0,11.0,3.0,1.0
5,3.0,0.0,11.0,3.0,1.0
6,3.0,0.0,11.0,3.0,1.0
7,4.0,1.0,11.0,3.0,1.0
8,4.0,1.0,11.0,3.0,1.0
9,4.0,1.0,11.0,3.0,1.0


In [122]:
cols = ['start_station_id','subscription_type', 'month_start', 'weekday','events']
features = trainWeather[cols]
target = testWeather[cols]

In [123]:
x_train, x_test, y_train, y_test = train_test_split(features, durationMerge, test_size = 0.6, random_state = 0)

reg = linear_model.LinearRegression()

reg.fit(x_train, y_train)
pred = reg.predict(target)
score_test = mean_squared_error(pred,duration_real)
score_test

44818756.805438206

# Testeo sobre el mismo train

In [124]:
predTrain = reg.predict(x_test)

In [125]:
mean_squared_error(predTrain,y_test)

57517180.18737264

# Con menos columnas

In [126]:
duration_real

0            71
1           126
2           130
3           134
4           141
5           163
6           169
7           197
8           204
9           207
10          211
11          218
12          236
13          236
14          254
15          267
16          284
17          288
18          292
19          307
20          319
21          337
22          340
23          346
24          346
25          348
26          351
27          353
28          355
29          360
          ...  
119968      961
119969      532
119970    71035
119971      844
119972     1926
119973      672
119974     7542
119975      913
119976      727
119977     1207
119978     1931
119979      869
119980     9001
119981     1021
119982      406
119983     1251
119984      873
119985    13635
119986     1750
119987     2255
119988      558
119989      956
119990      925
119991      147
119992      661
119993     7503
119994      630
119995      161
119996      398
119997     6712
Name: duration, dtype: i

In [127]:
test

Unnamed: 0,id,start_date,start_station_name,start_station_id,bike_id,subscription_type,zip_code,date_start,time_start,weekday,hour_start,month_start
0,504737,10/18/2014 11:25,Embarcadero at Sansome,60,426,0,77009.0,2014-10-18,11:25,6,11,10
1,530846,11/5/2014 13:00,Embarcadero at Folsom,51,454,1,94132.0,2014-11-05,13:00,3,13,11
2,813140,6/18/2015 17:34,San Francisco Caltrain (Townsend at 4th),70,370,1,94107.0,2015-06-18,17:34,4,17,6
3,897674,8/20/2015 7:06,Civic Center BART (7th at Market),72,451,1,94582.0,2015-08-20,7:06,4,7,8
4,322830,6/13/2014 8:46,San Francisco Caltrain 2 (330 Townsend),69,603,1,95014.0,2014-06-13,8:46,5,8,6
5,487841,10/7/2014 21:41,2nd at Townsend,61,478,1,94115.0,2014-10-07,21:41,2,21,10
6,677808,3/11/2015 18:09,Market at 10th,67,505,1,94025.0,2015-03-11,18:09,3,18,3
7,704449,3/30/2015 17:29,Embarcadero at Vallejo,48,356,1,94536.0,2015-03-30,17:29,1,17,3
8,833587,7/5/2015 11:54,Market at 10th,67,401,1,94102.0,2015-07-05,11:54,7,11,7
9,420411,8/22/2014 13:30,Embarcadero at Vallejo,48,363,1,94114.0,2014-08-22,13:30,5,13,8


In [128]:
data = {'id': ids, 'duration': pred}
submit8 = pd.DataFrame(data)
submit8.head(3)

Unnamed: 0,duration,id
0,960.83631,795571
1,762.241671,895359
2,833.033775,58423


In [129]:
submit8.to_csv('tp_bike/submit8.csv',index=False)

In [130]:
rg = linear_model.LinearRegression().fit(x_train, y_train)
fromModel(rg,features)

Unnamed: 0,subscription_type
0,1
1,1
2,1
3,1
4,1
5,0
6,0
7,1
8,1
9,1


# SVR

In [131]:
#clf = SVR(C=1.0, epsilon=0.2)
#x_train, x_test, y_train, y_test = train_test_split(features, duration, test_size = 0.6, random_state = 0)
#clf.fit(x_train, y_train)

In [132]:
#pred2 = clf.predict(test2)