<a href="https://colab.research.google.com/github/AylinNaebzadeh/NYC-Yellow-Taxi-Trip-Data-Analysis/blob/main/nyc_yellow_taxi_trip_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Prerequisite: run `pip install -U -q PyDrive` first; PyDrive is needed to access Google Drive from Colab.

# Setup and helper functions

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import numpy as np
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore

In [None]:
def download_csv_file(link='https://drive.google.com/open?id=1Gepus8IDhEiXBzk3U0aQGlwgw99SRdLx',
                      destination='sample-nyc-data.csv'):
  """Download a Google Drive file to the local filesystem.

  Authenticates the current Colab user, builds a PyDrive client from the
  application-default credentials and saves the referenced file locally.

  Args:
    link: Google Drive share link carrying the file id in its ``id`` query
      parameter. Defaults to the sample NYC taxi data set.
    destination: Local path the file content is written to.

  Raises:
    ValueError: If ``link`` has no ``id`` query parameter.
  """
  # Local import keeps this cell self-contained.
  from urllib.parse import parse_qs, urlparse

  # Read the id from the query string so extra parameters in the link do not
  # break the extraction; fail before any authentication prompt is shown.
  file_ids = parse_qs(urlparse(link).query).get('id')
  if not file_ids:
    raise ValueError(f"No 'id' query parameter found in Drive link: {link!r}")
  file_id = file_ids[0]

  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(destination)


In [None]:
# def normalize_min_max(df):
    # max = df[col].loc[df[col].idxmax()]     
    # min = df[col].loc[df[col].idxmin()] 
    # normalized_col=(df[col] - min)/(max-min)

In [None]:
def create_histogram(df):
  """Show one log-scaled histogram per remaining column of ``df``.

  Timestamp, coordinate, flag and surcharge columns listed in
  ``del_columns`` are left out of the plots. The input frame is not
  modified, and listed columns that are absent are ignored.

  Args:
    df: DataFrame whose remaining columns are plotted.
  """
  del_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime",
               "pickup_longitude", "pickup_latitude",
               "RatecodeID", "store_and_fwd_flag",
               "dropoff_longitude", "dropoff_latitude",
               "improvement_surcharge", "tolls_amount",
               "tip_amount", "mta_tax", "extra", "pickup_date", "dropoff_date", "drop_month", "pick_month"]
  # drop() returns a new frame, so the caller keeps all of its columns;
  # errors="ignore" tolerates frames that lack some of the derived columns.
  plot_frame = df.drop(columns=del_columns, errors="ignore")
  for col in plot_frame.columns:
    plt.hist(plot_frame[col], edgecolor='white')
    plt.xlabel(col)
    # Log scale keeps sparsely populated bins visible next to dominant ones.
    plt.yscale("log")
    plt.show()

In [None]:
def del_missing_value(df):
    """Plot and return per-column missing-value statistics.

    Despite its name, this function removes nothing from ``df``; it only
    reports how many values are missing in each column.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name, sorted by descending missing count,
        with 'Total' (number of nulls) and 'Percent' (share of nulls, 0-100).
    """
    null_mask = df.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    # mean() of the boolean mask is the null fraction; scale it so that the
    # 'Percent' column and axis label really are percentages.
    percent = (null_mask.mean() * 100).reindex(total.index)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    f, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
    # Rotate after drawing, with a numeric angle, so the labels created by
    # the bar plot are the ones that get rotated.
    ax.tick_params(axis='x', labelrotation=90)
    plt.xlabel('df', fontsize=15)
    plt.ylabel('Percent of missing values', fontsize=15)
    plt.title('Percent missing data by feature', fontsize=15)
    return missing_data

In [None]:
def pre_processing(df):
  """Drop duplicate rows from ``df`` in place and chart its missing values.

  Args:
    df: DataFrame to clean; it is modified directly.

  Returns:
    The same ``df`` object, now without duplicate rows.
  """
  # In-place on purpose: the caller's own frame loses the duplicates
  # (the first occurrence of each row is the one that is kept).
  df.drop_duplicates(keep='first', inplace=True)
  # Reporting only; this call removes nothing from df.
  del_missing_value(df)
  return df

In [None]:
def add_features(df, max_speed_kmh=200):
  """Return a copy of ``df`` extended with date, duration and speed features.

  Added columns: ``pickup_date``, ``dropoff_date``, ``pick_month``,
  ``drop_month``, ``trip_duration_hour``, ``trip_duration_min`` and ``speed``.
  Trips with zero duration, and trips whose speed is not below
  ``max_speed_kmh`` (or is NaN), are dropped. The input frame is not modified.

  Args:
    df: Trip DataFrame with ``tpep_pickup_datetime``,
      ``tpep_dropoff_datetime`` and ``trip_distance`` columns.
    max_speed_kmh: Exclusive upper bound on ``speed`` for a trip to be kept.

  Returns:
    A new DataFrame holding the retained trips and the added columns.

  Side effects:
    Sets the pandas display option ``display.max_rows`` to ``None``.
  """
  # Factor kept exactly as originally written.
  # NOTE(review): presumably converts trip_distance from miles to km, making
  # speed km/h -- confirm against the data dictionary.
  miles_to_km = 1.6093435

  # Work on a copy so the caller's frame keeps its original columns/dtypes.
  trips = df.copy()
  trips["tpep_pickup_datetime"] = pd.to_datetime(trips["tpep_pickup_datetime"])
  trips["tpep_dropoff_datetime"] = pd.to_datetime(trips["tpep_dropoff_datetime"])
  trips["pickup_date"] = trips["tpep_pickup_datetime"].dt.date
  trips["dropoff_date"] = trips["tpep_dropoff_datetime"].dt.date
  trips["pick_month"] = trips["tpep_pickup_datetime"].dt.month
  trips["drop_month"] = trips["tpep_dropoff_datetime"].dt.month

  duration_seconds = (trips["tpep_dropoff_datetime"]
                      - trips["tpep_pickup_datetime"]).dt.total_seconds()
  trips["trip_duration_hour"] = duration_seconds / 3600
  trips["trip_duration_min"] = duration_seconds / 60

  # Zero-duration trips would divide by zero below. The explicit copy makes
  # the following column assignment write to an independent frame instead of
  # a slice (which is what triggers SettingWithCopyWarning).
  trips = trips[trips["trip_duration_min"] != 0].copy()
  trips["speed"] = trips["trip_distance"] * miles_to_km / trips["trip_duration_hour"]

  # Kept from the original implementation: lifts the row limit of pandas'
  # display for the rest of the session.
  pd.set_option('display.max_rows', None)

  # Copy again so callers can add columns to the result without warnings.
  return trips[trips["speed"] < max_speed_kmh].copy()

In [None]:
def calculate_min_max_var_avg(data):
    """Summarise every column of ``data`` by min, variance, mean and max.

    Args:
        data: DataFrame to summarise.

    Returns:
        DataFrame with one row per input column (as reported by
        ``data.min()``) and the columns 'min', 'var', 'avg', 'max'.
        Non-numeric columns get NaN for 'var' and 'avg'.
    """
    # The row index comes from min(), matching the original column-by-column
    # construction; later assignments align on that index.
    stats = data.min().to_frame(name="min")
    # numeric_only=True: variance and mean are undefined for text/date
    # columns, and recent pandas raises instead of silently skipping them.
    stats["var"] = data.var(numeric_only=True)
    stats["avg"] = data.mean(numeric_only=True)
    stats["max"] = data.max()
    return stats

In [None]:
def convert_df_to_csv(df, path='extended_data.csv'):
  """Write ``df`` to a CSV file, including its index.

  Args:
    df: DataFrame to export.
    path: Destination file; defaults to 'extended_data.csv' in the current
      working directory.
  """
  df.to_csv(path)

In [None]:
def create_boxplot(data):
  """
  Draw one box plot per selected numeric trip column, each in its own figure.

  The plots only visualise the spread of each column; no outliers are
  computed or removed here.
  """
  plotted_columns = ['VendorID', 'passenger_count', 'trip_distance', 'payment_type',
                     'fare_amount', 'trip_duration_min', 'speed', 'total_amount', 'tip_amount']
  # Selecting up front raises KeyError before any figure is created when one
  # of the expected columns is missing.
  subset = data[plotted_columns]
  for name in subset.columns:
    plt.figure()
    data.boxplot([name])

In [None]:
def create_correlation_matrix(data_df):
  """Label-encode selected trip columns and show their correlation heatmap.

  Prints the number of distinct values of every encoded column, then plots
  the pairwise correlations rounded to two decimals. ``data_df`` itself is
  not modified.

  Args:
    data_df: DataFrame containing the ten columns listed in ``selected``.

  Raises:
    KeyError: If any of the expected columns is missing.
  """
  selected = ['VendorID', 'passenger_count', 'RatecodeID', 'trip_distance', 'payment_type',
              'fare_amount', 'trip_duration_hour', 'speed', 'total_amount', 'tip_amount']
  # Explicit copy: the encoded values are written to an independent frame,
  # not to a selection of the caller's frame.
  data = data_df[selected].copy()

  label_encoder = LabelEncoder()
  # NOTE(review): LabelEncoder replaces each value with an integer code, so
  # the heatmap correlates those codes, not the raw magnitudes -- confirm
  # this is the intended analysis.
  for column in selected:
      data[column] = label_encoder.fit_transform(data[column])
      print(f" dataframe {column} uniques: {len(data[column].unique())} ")

  cor = data.corr().round(2)
  plt.figure(figsize=(10,8),linewidth=10,edgecolor="#04253a" )
  sns.heatmap(cor, annot=True, cmap="Blues")
  plt.show()

In [None]:
def calculate_Zscore(data):
  """Add a population z-score column for every eligible numeric column.

  For each numeric column that is not listed in ``del_columns`` a new column
  ``<name>_zscore`` holding ``(x - mean) / std`` (with ``ddof=0``) is added
  to ``data`` in place. Existing ``*_zscore`` columns are never scored
  themselves, so calling the function again only refreshes the scores.

  Args:
    data: DataFrame to extend; it is modified directly.

  Returns:
    The same ``data`` object, for convenience.
  """
  del_columns = ["VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime",
                 "pickup_longitude", "pickup_latitude",
                 "RatecodeID", "store_and_fwd_flag",
                 "dropoff_longitude", "dropoff_latitude",
                 "improvement_surcharge", "tolls_amount",
                 "tip_amount", "mta_tax", "extra", "pickup_date", "dropoff_date", "drop_month", "pick_month"]
  excluded = set(del_columns)
  # Snapshot the targets first: the loop below adds columns to ``data``.
  targets = [col for col in data.columns
             if col not in excluded
             and not str(col).endswith('_zscore')
             and pd.api.types.is_numeric_dtype(data[col])]
  for col in targets:
    # A constant column has std 0 and therefore yields NaN scores.
    data[col + '_zscore'] = (data[col] - data[col].mean()) / data[col].std(ddof=0)
  return data

In [None]:
def calculate_BoxWhiskerScore(data):
  """Add a population z-score column for every eligible numeric column.

  NOTE(review): despite the name, no box-and-whisker (quartile/IQR) score is
  computed. The function adds ``<name>_zscore`` columns holding
  ``(x - mean) / std`` (``ddof=0``), and unlike ``calculate_Zscore`` it also
  scores ``VendorID``. Confirm whether an IQR-based score was intended.

  Columns listed in ``del_columns``, non-numeric columns and existing
  ``*_zscore`` columns are skipped, so repeated calls only refresh scores.

  Args:
    data: DataFrame to extend; it is modified directly.

  Returns:
    The same ``data`` object, for convenience.
  """
  del_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime",
                 "pickup_longitude", "pickup_latitude",
                 "RatecodeID", "store_and_fwd_flag",
                 "dropoff_longitude", "dropoff_latitude",
                 "improvement_surcharge", "tolls_amount",
                 "tip_amount", "mta_tax", "extra", "pickup_date", "dropoff_date", "drop_month", "pick_month"]
  excluded = set(del_columns)
  # Snapshot the targets first: the loop below adds columns to ``data``.
  targets = [col for col in data.columns
             if col not in excluded
             and not str(col).endswith('_zscore')
             and pd.api.types.is_numeric_dtype(data[col])]
  for col in targets:
    # A constant column has std 0 and therefore yields NaN scores.
    data[col + '_zscore'] = (data[col] - data[col].mean()) / data[col].std(ddof=0)
  return data

In [None]:
def main():
  """Run the pipeline: download, load, add features, score and export.

  Files written to the working directory: 'sample-nyc-data.csv' (download),
  'extended_data.csv' (data with added features) and
  'extended_data_with_BoxWhiskerScore.csv' (data with score columns).
  """
  download_csv_file()
  df = pd.read_csv('sample-nyc-data.csv', low_memory=False)
  # print(df.describe())
  # pre_processing(df)
  df = add_features(df)
  convert_df_to_csv(df)
  # print(df.shape)

  # The scoring step below adds columns in place, so give it an independent
  # frame rather than a filtered slice of df; writing to a slice is what
  # raises SettingWithCopyWarning.
  filtered_df = df[df["trip_duration_min"] != 0].copy()

  # Optional analyses -- uncomment the ones to run:
  # create_histogram(filtered_df)
  # stats = calculate_min_max_var_avg(filtered_df)
  # stats.to_csv('min_max_avg_var.csv')
  # create_boxplot(filtered_df)
  # create_correlation_matrix(filtered_df)
  # calculate_Zscore(filtered_df)
  # filtered_df.to_csv('extended_data_with_Zscore.csv')

  calculate_BoxWhiskerScore(filtered_df)
  filtered_df.to_csv('extended_data_with_BoxWhiskerScore.csv')
  

In [None]:
# Entry point: runs the whole pipeline (download, feature engineering, CSV exports).
main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
