[Reference](https://erickleppen.medium.com/how-to-find-outliers-in-data-using-python-d39366ef262b)

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import plotly.express as px

# load the data into a dataframe
df = pd.read_csv('uber.csv')

# check the first 5 rows
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not render it; move it to its own cell if the preview is wanted.
df.head()

# drop the unnecessary columns (pickup/dropoff coordinates)
df = df.drop(
    labels=[
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ],
    axis='columns',
)

In [2]:
# Histogram of fare_amount: shows how the fares are distributed
fig = px.histogram(data_frame=df, x="fare_amount")
fig.show()

In [3]:
# Box plot of fare_amount
fig = px.box(data_frame=df, y='fare_amount')
fig.show()

In [4]:
# Scatter plot of fare_amount (y) against passenger_count (x)
fig = px.scatter(
    x=df["passenger_count"],
    y=df["fare_amount"],
)
fig.show()

In [5]:
# IQR-based outlier detection
def find_outliers_IQR(df):
    """Select the values of ``df`` that fall outside the 1.5*IQR fences.

    A value counts as an outlier when it is strictly below
    ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``, where ``Q1`` and
    ``Q3`` are the 0.25 and 0.75 quantiles of ``df``.

    ``df`` is indexed with the resulting boolean mask, so a Series input
    yields only the outlying entries, while a DataFrame input keeps its
    shape with every non-outlying cell replaced by NaN.
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread

    is_outlier = (df < low_fence) | (df > high_fence)
    return df[is_outlier]

In [6]:
# Flag fare_amount outliers with the IQR rule and report their count and range
outliers = find_outliers_IQR(df["fare_amount"])
print(f"number of outliers: {len(outliers)}")
print(f"max outlier value: {outliers.max()}")
print(f"min outlier value: {outliers.min()}")
outliers

In [7]:
# Run the same IQR check on two columns at once (each column gets its own fences)
outliers = find_outliers_IQR(df.loc[:, ["passenger_count", "fare_amount"]])
outliers

In [9]:
def drop_outliers_IQR(df):
    """Return ``df`` with its IQR outliers removed and the index reset.

    A value is an outlier when it is strictly below ``Q1 - 1.5*IQR`` or
    strictly above ``Q3 + 1.5*IQR`` (``Q1``/``Q3`` = 0.25/0.75 quantiles).

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data to filter. Fences are computed per column for a
        DataFrame.

    Returns
    -------
    pandas.DataFrame
        The surviving rows. ``reset_index()`` is called without
        ``drop=True``, so the previous index is kept as an extra column.
        Because of the ``dropna()`` step, rows that already contained
        missing values are removed as well, and for a DataFrame any row
        with an outlier in at least one column is dropped entirely.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1
    not_outliers = df[~((df < (q1 - 1.5 * IQR)) | (df > (q3 + 1.5 * IQR)))]
    # Use the local filtered data. The previous code read a global named
    # `outliers` here, which returned the wrong rows (or raised NameError
    # when that global had not been defined).
    outliers_dropped = not_outliers.dropna().reset_index()
    return outliers_dropped

In [10]:
# Limits at mean +/- 3 standard deviations of fare_amount
fare_mean = df['fare_amount'].mean()
fare_std = df['fare_amount'].std()

upper_limit = fare_mean + 3 * fare_std
print(upper_limit)
lower_limit = fare_mean - 3 * fare_std
print(lower_limit)

In [11]:
# Cap fare_amount to [lower_limit, upper_limit]: values above upper_limit
# become upper_limit, values below lower_limit become lower_limit, and
# everything in between is left unchanged. Series.clip does the same clamp
# as the previous nested np.where.
df['fare_amount'] = df['fare_amount'].clip(lower=lower_limit, upper=upper_limit)

In [12]:
# Summary statistics for fare_amount (kept as a one-column DataFrame)
df.describe().loc[:, ['fare_amount']]

In [13]:
def impute_outliers_IQR(df):
    """Replace IQR outliers in ``df`` with the mean and return a NumPy array.

    The IQR fences are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``. ``upper`` and
    ``lower`` are the largest and smallest values that are not beyond those
    fences; anything strictly above ``upper`` or strictly below ``lower``
    is replaced by ``df.mean()``.

    Note that the mean is taken over the input as given, so the outliers
    themselves contribute to the replacement value. The result comes from
    ``np.where`` and is therefore a plain ndarray (the pandas index is not
    carried over).
    """
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)
    spread = third_quartile - first_quartile

    # Extreme values that are still within the fences.
    within_high_fence = ~(df > (third_quartile + 1.5 * spread))
    within_low_fence = ~(df < (first_quartile - 1.5 * spread))
    upper = df[within_high_fence].max()
    lower = df[within_low_fence].min()

    # Both tails are imputed with the same value, so one combined mask suffices.
    needs_imputing = (df > upper) | (df < lower)
    return np.where(needs_imputing, df.mean(), df)

In [14]:
# Replace fare_amount outliers with the column mean, then summarise the column
df['fare_amount'] = impute_outliers_IQR(df['fare_amount'])
df.describe().loc[:, 'fare_amount']