Import Library

In [1]:
import threading
import math
import pandas as pd
import matplotlib.pyplot as plt

Open Data

In [2]:
df = pd.read_csv('../../train.csv')
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


Take 75% of the Data

In [3]:
df_sample = df.sample(frac=0.75, random_state=42)
rows = df_sample.to_dict(orient="records")
rows

[{'id': 'id2793718',
  'vendor_id': 2,
  'pickup_datetime': '2016-06-08 07:36:19',
  'dropoff_datetime': '2016-06-08 07:53:39',
  'passenger_count': 1,
  'pickup_longitude': -73.98561096191406,
  'pickup_latitude': 40.735942840576165,
  'dropoff_longitude': -73.98033142089844,
  'dropoff_latitude': 40.76046752929688,
  'store_and_fwd_flag': 'N',
  'trip_duration': 1040},
 {'id': 'id3485529',
  'vendor_id': 2,
  'pickup_datetime': '2016-04-03 12:58:11',
  'dropoff_datetime': '2016-04-03 13:11:58',
  'passenger_count': 1,
  'pickup_longitude': -73.9783935546875,
  'pickup_latitude': 40.76435089111328,
  'dropoff_longitude': -73.99162292480467,
  'dropoff_latitude': 40.749858856201165,
  'store_and_fwd_flag': 'N',
  'trip_duration': 827},
 {'id': 'id1816614',
  'vendor_id': 2,
  'pickup_datetime': '2016-06-05 02:49:13',
  'dropoff_datetime': '2016-06-05 02:59:27',
  'passenger_count': 5,
  'pickup_longitude': -73.98905944824217,
  'pickup_latitude': 40.744388580322266,
  'dropoff_longitud

Thread Filtering

In [4]:
def threaded_filter(data_chunk, result_list, index, threshold):
    result_list[index] = [row for row in data_chunk if row["trip_duration"] > threshold]

In [5]:
data_to_filter = rows  
num_threads = 4
threshold_duration = 1000
chunk_size = math.ceil(len(data_to_filter) / num_threads)

filtered_chunks = [None] * num_threads
threads = []
for i in range(num_threads):
    chunk = data_to_filter[i * chunk_size : (i + 1) * chunk_size]
    t = threading.Thread(target=threaded_filter, args=(chunk, filtered_chunks, i, threshold_duration))
    threads.append(t)
    t.start()

for t in threads:
    t.join()

filtered_rows = []
for chunk in filtered_chunks:
    if chunk:
        filtered_rows.extend(chunk)

Called The Filtered Data

In [8]:
print(" | ".join([f"{row['trip_duration']}s" for row in filtered_rows]))


1040s | 4967s | 1252s | 1499s | 1017s | 2971s | 1027s | 4946s | 1167s | 1004s | 1107s | 1748s | 1268s | 1373s | 1417s | 1618s | 1321s | 1247s | 1408s | 1134s | 1370s | 1796s | 1282s | 1585s | 2980s | 1212s | 1123s | 1029s | 1586s | 1048s | 1062s | 2060s | 1404s | 2877s | 1305s | 2102s | 1724s | 1724s | 1214s | 1307s | 1279s | 1279s | 2343s | 1324s | 1065s | 2245s | 1016s | 1109s | 1106s | 1066s | 1229s | 1227s | 1176s | 1050s | 1255s | 1096s | 1257s | 1519s | 1327s | 2286s | 2311s | 2285s | 2667s | 1230s | 1450s | 5584s | 1345s | 1030s | 1688s | 1085s | 2949s | 1021s | 1322s | 1817s | 3075s | 1172s | 1636s | 2269s | 1081s | 1591s | 1402s | 1113s | 1109s | 2750s | 1261s | 2004s | 1222s | 1044s | 2229s | 1200s | 1384s | 1569s | 1555s | 1436s | 1371s | 1427s | 2072s | 1128s | 3540s | 2050s | 1420s | 1569s | 1653s | 2078s | 1081s | 1093s | 1006s | 1070s | 1721s | 2006s | 1485s | 1383s | 2376s | 1394s | 3109s | 2349s | 1602s | 2592s | 1127s | 4246s | 1103s | 1311s | 1669s | 1061s | 1828s | 

In [7]:
df_filtered = pd.DataFrame(filtered_rows)
df_filtered.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2793718,2,2016-06-08 07:36:19,2016-06-08 07:53:39,1,-73.985611,40.735943,-73.980331,40.760468,N,1040
1,id0140657,1,2016-05-12 17:43:38,2016-05-12 19:06:25,4,-73.789497,40.646675,-73.987137,40.759232,N,4967
2,id2151697,2,2016-03-17 21:10:43,2016-03-17 21:31:35,1,-73.988419,40.760006,-73.98053,40.78289,N,1252
3,id2169697,2,2016-03-22 14:08:02,2016-03-22 14:33:01,1,-73.955017,40.764462,-73.996811,40.71656,N,1499
4,id2225613,1,2016-03-29 23:32:47,2016-03-29 23:49:44,1,-73.971535,40.794827,-73.931709,40.858223,N,1017
