In [None]:
!pip install pyspark
!pip install haversine
!pip install seaborn

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StringType

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('FordGoBike')
    .getOrCreate()
)
# Display the session summary as the cell output.
spark

In [2]:
# Raw string literal: the Windows path contains backslashes that would
# otherwise be read as (invalid) escape sequences such as "\M" and "\D",
# which newer Python versions warn about. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific location — consider a
# configurable data directory.
path = r"G:\My Drive\Develhope\develhope-Data5-Team3\Data\Raw_FordGoBike"

# Read the CSV file(s) with a header row and let Spark infer the column types.
df = spark.read.csv(path, header=True, inferSchema=True)

# Summary statistics for every column.
df.describe().show()

+-------+------------------+-----------------+--------------------+----------------------+-----------------------+-----------------+--------------------+--------------------+---------------------+------------------+----------+------------------+-------------+-----------+
|summary|      duration_sec| start_station_id|  start_station_name|start_station_latitude|start_station_longitude|   end_station_id|    end_station_name|end_station_latitude|end_station_longitude|           bike_id| user_type| member_birth_year|member_gender|     pyment|
+-------+------------------+-----------------+--------------------+----------------------+-----------------------+-----------------+--------------------+--------------------+---------------------+------------------+----------+------------------+-------------+-----------+
|  count|            519700|           519700|              519700|                519700|                 519700|           519700|              519700|              519700|          

In [3]:
# List the column names of the raw trip DataFrame.
df.columns

['duration_sec',
 'start_time',
 'end_time',
 'start_station_id',
 'start_station_name',
 'start_station_latitude',
 'start_station_longitude',
 'end_station_id',
 'end_station_name',
 'end_station_latitude',
 'end_station_longitude',
 'bike_id',
 'user_type',
 'member_birth_year',
 'member_gender',
 'pyment']

In [17]:
# Flag rows whose end time is earlier than their start time.
starts_after_end = F.col('end_time') < F.col('start_time')
df_2 = df.withColumn('bigger', starts_after_end)



In [22]:
# Show only the rows flagged as starting after they end.
df_2.where(F.col('bigger')).show()

+------------+--------------------+--------------------+----------------+------------------+----------------------+-----------------------+--------------+-----------------+--------------------+---------------------+-------+----------+-----------------+-------------+-----------+------+
|duration_sec|          start_time|            end_time|start_station_id|start_station_name|start_station_latitude|start_station_longitude|end_station_id| end_station_name|end_station_latitude|end_station_longitude|bike_id| user_type|member_birth_year|member_gender|     pyment|bigger|
+------------+--------------------+--------------------+----------------+------------------+----------------------+-----------------------+--------------+-----------------+--------------------+---------------------+-------+----------+-----------------+-------------+-----------+------+
|         292|2017-11-05 04:59:...|2017-11-05 04:04:...|             121|Mission Playground|            37.7592103|           -122.4213392|   

## WEEK 1 - Calculate Haversine Distance

In [None]:
from haversine import haversine

def haversine_f(lat1, lon1, lat2, lon2):
    """Return the haversine distance in metres between two coordinates.

    Args:
        lat1, lon1: latitude and longitude of the first point.
        lat2, lon2: latitude and longitude of the second point.

    Returns:
        The distance as computed by ``haversine`` with ``unit='m'`` and
        ``normalize=True``, or ``None`` when any coordinate is missing.
    """
    # Spark passes SQL NULLs to Python UDFs as None; return NULL for such rows
    # instead of letting the haversine call fail on a missing coordinate.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None
    return haversine((lat1, lon1), (lat2, lon2), unit='m', normalize=True)

# Declare a numeric return type explicitly: without one, F.udf defaults to
# StringType and the distance column would hold strings rather than numbers.
haversine_udf = F.udf(haversine_f, FloatType())

In [None]:
# Distance between the start and end station of every trip, computed row by
# row with the haversine UDF.
trip_distance = haversine_udf(
    F.col('start_station_latitude'),
    F.col('start_station_longitude'),
    F.col('end_station_latitude'),
    F.col('end_station_longitude'),
)
df = df.withColumn('haversine_distance', trip_distance)

## WEEK 2

## TASK 2 - Calculate the trip cost

In [None]:
# start_time / end_time hold a full date and time (e.g. "2017-11-05 04:59:..."),
# so the time-only pattern 'HH:mm:ss' does not describe them. Calling
# to_timestamp without a pattern parses the default timestamp format and leaves
# columns that are already timestamps unchanged.
df = (
    df
    .withColumn('start_time', F.to_timestamp('start_time'))
    .withColumn('end_time', F.to_timestamp('end_time'))
    # Casting a timestamp to long gives whole seconds since the epoch, so the
    # difference is the trip duration in seconds.
    .withColumn('Diff_in_seconds', F.col('end_time').cast('long') - F.col('start_time').cast('long'))
    .withColumn('Diff_in_minutes', F.col('Diff_in_seconds') / 60)
    # The trip cost is charged at a rate of 0.35 per minute.
    .withColumn('Trip_cost', F.col('Diff_in_minutes') * 0.35)
)

TASK 3 - Calculate the total distance for each bike and list the top 10

In [None]:
# Total distance travelled per bike, largest first; print the top 10.
distance_per_bike = (
    df.groupBy("bike_id")
    .agg(F.sum("haversine_distance").alias("sum_distance"))
    .orderBy(F.col("sum_distance").desc())
)
distance_per_bike.show(10)

## WEEK 3

TASK 1

In [None]:
# Number of trips for each value of member_gender.
trips_by_gender = df.groupBy('member_gender').count()
trips_by_gender.show()

In [None]:
# Add 0/1 indicator columns for gender: 'F' is 1 where member_gender is
# 'Female', 'M' is 1 where it is 'Male'; every other value (including null)
# gives 0 in both columns.
# NOTE(review): `df3` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel — confirm which DataFrame was
# intended (the cells below also expect it to contain a 'trip(s)' column).
df11 = df3.withColumn('F', F.when(F.col('member_gender') == 'Female', 1).otherwise(0)).withColumn('M', F.when(F.col('member_gender') == 'Male', 1).otherwise(0))

# Preview the indicator columns next to the original gender value.
df11.select('F', 'M','member_gender').show(10)

In [None]:
# Ten start stations with the most trips, together with the number of female
# and male riders per station.
# - F.desc is used because a bare `desc` is only imported in a later cell, so
#   it would raise NameError when the notebook is run top to bottom.
# - limit(10) keeps the result a DataFrame; head(10) would return a plain list
#   of Row objects, which has no withColumn for the ratio step.
df4 = (
    df11.groupBy('start_station_id')
    .agg({'trip(s)': 'count', 'F': 'sum', 'M': 'sum'})
    .orderBy(F.desc('count(trip(s))'))
    .limit(10)
)

In [None]:
from pyspark.sql.functions import desc

# Per start station: count the trips and sum the F / M indicator columns, then
# keep the ten stations with the highest trip count.
station_totals = df11.groupBy('start_station_id').agg(
    {'trip(s)': 'count', 'F': 'sum', 'M': 'sum'}
)
df4 = station_totals.orderBy(desc('count(trip(s))')).limit(10)

In [None]:
# Share of female and male riders among all trips of each top station.
# Columns are referenced by name rather than by position: the order of the
# columns produced by a dict-based agg() is not something to rely on, so
# df4[1] / df4[3] could silently pick the wrong columns.
trip_count = F.col('count(trip(s))')
df5 = df4.withColumn('F_ratio', F.col('sum(F)') / trip_count)
#df5.show()
df6 = df5.withColumn('M_ratio', F.col('sum(M)') / trip_count)
df6.show(10)

TASK 2

In [None]:
# List the distinct user types present in the data.
user_types = df.select('user_type').distinct()
user_types.show()

In [None]:
# Percentage of all trips made by each user type, rounded to two decimals.
total_trips = df.count()
user_share = F.round(F.count('user_type') / total_trips * 100, 2)
df.groupBy('user_type').agg(user_share.alias('User Percentage')).show()

TASK 3

In [None]:
# Drop trips whose start-to-end distance is zero.
has_distance = F.col('haversine_distance') != 0
df3_3 = df.filter(has_distance)

In [None]:
# Rider age as an integer, taking 2017 as the reference year.
rider_age = (2017 - F.col('member_birth_year')).cast('int')
df3_3 = df3_3.withColumn('age', rider_age)

In [None]:
# Keep riders aged 18 to 100 inclusive, removing outlier values (age 100+).
# The lower bound is inclusive so that it agrees with the '18 to 35' age
# group below, which starts at age >= 18; a strict `> 18` would drop every
# 18-year-old before grouping.
# Rows with a null age fail both comparisons and are removed by this filter,
# so no separate dropna on 'age' is needed.
df3_3 = df3_3.where((F.col('age') >= 18) & (F.col('age') <= 100))


In [None]:
# Bucket riders into age groups. Conditions are checked in order and the first
# match wins; anything that matches none of them is labelled 'non-groupped'.
age = F.col('age')
age_group = (
    F.when((age >= 18) & (age < 35), '18 to 35')
    .when((age >= 35) & (age < 50), '35 to 50')
    .when((age >= 50) & (age < 65), '50 to 65')
    .when((age >= 65) & (age <= 100), '65 to 100')
    .otherwise('non-groupped')
)
df3_3 = df3_3.withColumn('age_group', age_group)

# Mean trip distance per age group, ordered by group label.
df3_3_ag = (
    df3_3.groupBy('age_group')
    .agg(F.mean('haversine_distance').alias('Age Group/Distance'))
    .orderBy('age_group')
)

In [None]:
# Total trip distance per age group, ordered by group label.
# This reassigns df3_3_ag, replacing the mean-based version built in the
# previous cell while keeping the same column alias.
total_distance = F.sum('haversine_distance').alias('Age Group/Distance')
df3_3_ag = df3_3.groupBy('age_group').agg(total_distance).orderBy('age_group')

In [None]:
# Sanity check: show any rows that did not fall into one of the age groups.
ungrouped = df3_3.filter(F.col('age_group') == 'non-groupped')
ungrouped.show()

In [None]:
# Save the cleaned Week 3 data as CSV with a header row, replacing any output
# already present at the target location.
week_3_writer = df3_3.write.option("header", True).mode('overwrite')
week_3_writer.csv('/workspaces/develhope-Data5-Team3/Data/week_3')

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Collect the per-age-group aggregate to the driver as a pandas DataFrame.
# NOTE(review): this rebinds `pd`, which the import cell above uses as the
# alias for the pandas module; until pandas is imported again, `pd` refers to
# this DataFrame. The plotting cell that follows reads the DataFrame via `pd`,
# so renaming it here requires updating that cell as well.
pd=df3_3_ag.toPandas()

# Preview the first rows of the collected result.
pd.head()

In [None]:
# Horizontal bar chart of trip distance per age group.
# `pd` here is the pandas DataFrame produced by toPandas() in the previous
# cell, not the pandas module.
sn.catplot(data=pd, x='Age Group/Distance', y="age_group", errorbar=("pi", 95), kind="bar", palette="ch:s=.25,rot=-.25")
# The haversine UDF is called with unit='m', so the distances are in metres;
# the axis label previously claimed kilometres.
plt.xlabel('Haversine Distance(m)', weight = 'bold')
plt.ylabel('Age Group', weight = 'bold')

In [None]:
# Mean trip distance for each individual age, sorted ascending by that mean.
mean_distance = F.mean('haversine_distance').alias('Avg_Distance')
df3_3_ng = df3_3.groupBy('age').agg(mean_distance).orderBy('Avg_Distance')

In [None]:
# Total trip distance for each individual age, sorted ascending.
# NOTE(review): the column keeps the alias 'Avg_Distance' although it now holds
# a sum, and this reassignment replaces the mean-based df3_3_ng from the
# previous cell — confirm which aggregate the plots below should use.
summed_distance = F.sum('haversine_distance').alias('Avg_Distance')
df3_3_ng = df3_3.groupBy('age').agg(summed_distance).orderBy('Avg_Distance')

In [None]:
# Print up to 100 rows of the per-age aggregate.
df3_3_ng.show(100)

In [None]:
# Collect the per-age aggregate to pandas for plotting.
df2 = df3_3_ng.toPandas()

# Scatter plot of the per-age distance with a fitted regression line.
sn.regplot(x="age", y="Avg_Distance", data=df2)
plt.xlabel('Age', weight='bold')
# The haversine UDF is called with unit='m', so the distances are in metres;
# the axis label previously claimed kilometres.
plt.ylabel('Haversine Distance(m)', weight='bold')

WEEK 4

# TASK 1

In [None]:
# Total trip cost for each user type.
cost_by_user_type = df.groupBy('user_type').agg(F.sum('Trip_cost'))
cost_by_user_type.show()

# TASK 2

In [None]:
# Label each trip with the part of the day in which it started:
# 05-11h Morning, 12-17h Afternoon, 18-20h Evening, everything else Night.
start_hour = F.hour('start_time')
part_of_day = (
    F.when((start_hour >= 5) & (start_hour < 12), 'Morning')
    .when((start_hour >= 12) & (start_hour < 18), 'Afternoon')
    .when((start_hour >= 18) & (start_hour < 21), 'Evening')
    .otherwise('Night')
)
df = df.withColumn('time_of_the_day', part_of_day)

In [None]:
# Number of rentals started in each part of the day.
rent_count = F.count('start_time').alias('total_rents')
df_time_of_day = df.groupBy('time_of_the_day').agg(rent_count)

df_time_of_day.show()

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Collect the per-period rental counts to pandas before plotting. The cell
# previously indexed `pd`, which is the pandas module at this point (it is
# re-imported just above), and df_time_of_day was never converted to pandas.
pd_time_of_day = df_time_of_day.toPandas()

# Bar chart of rentals per part of the day, in chronological order.
sn.barplot(x='time_of_the_day', y='total_rents', data=pd_time_of_day, order=['Morning','Afternoon','Evening','Night'])
plt.legend(['Morning: 5AM-12PM', 'Afternoon: 12PM-6PM','Evening: 6PM-9PM', 'Night: 9PM-5AM'])

In [None]:
# Number of rentals started in each hour of the day, ordered by hour.
rentals_per_hour = df.groupBy(F.hour('start_time').alias('hour')).agg(
    F.count('start_time').alias('total_count')
)
df_hour = rentals_per_hour.sort(F.hour('start_time'))
df_hour.show()

In [None]:
# Collect the hourly counts to pandas and draw them as a bar chart.
pd_2 = df_hour.toPandas()
sn.barplot(data=pd_2, x="hour", y='total_count')

# TASK 3

In [None]:
# List the current column names, including the columns added in earlier cells.
df.columns

In [None]:
# Rentals per start station and hour of day, ordered by station and hour.
# The pipeline is defined once and reused for both the preview and the
# pandas conversion.
df_station = (
    df.select(['start_station_name', 'start_time'])
    .groupBy(['start_station_name', F.hour('start_time').alias('hour')])
    .agg(F.count('start_time').alias('# of Rents'))
    .sort(['start_station_name', 'hour'])
)

# Preview the first 25 station/hour combinations.
df_station.show(25)

# Collect the full result to pandas for plotting.
pd_station = df_station.toPandas()

In [None]:
# Tall scatter plot: one row per start station, one column per hour of day,
# coloured by the number of rentals.
plt.figure(figsize=(5,20))
sn.scatterplot(x='hour', y='start_station_name', hue='# of Rents', data=pd_station)
plt.autoscale(enable=True)
# The x axis shows the hour and the stations are on the y axis; the x axis was
# previously labelled 'Station Names'.
plt.xlabel(xlabel='Hour')
plt.ylabel(ylabel='Station Names')
# The station names are the y tick labels here, so the small font goes there.
plt.yticks(fontsize=6)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)

In [None]:
# Wide scatter plot: rentals per start station, coloured by hour of day.
plt.figure(figsize=(35, 4))
sn.scatterplot(data=pd_station, x='start_station_name', y='# of Rents', hue='hour')
plt.autoscale(enable=True)
plt.xlabel('Station Names')
# Rotate and shrink the station names so they fit along the x axis.
plt.xticks(rotation=90, fontsize=6)
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0)

In [None]:
# Two-dimensional histogram of hour of day against rentals per station/hour.
plt.figure(figsize=(20, 10))

sn.histplot(data=pd_station, x="hour", y='# of Rents', kde=True)

# Place the legend outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0)