In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the trip records from the CSV that sits next to the notebook and
# preview the first rows.
csv_path = "UberDataset.csv"
dataset = pd.read_csv(csv_path)
dataset.head(5)

In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the shape has to be printed explicitly to appear alongside info().
print(dataset.shape)

# Column names, non-null counts and dtypes.
dataset.info()

In [4]:
# Replace missing PURPOSE entries with the placeholder "NOT"; other columns
# are left untouched.
purpose_fill = {'PURPOSE': "NOT"}
dataset.fillna(value=purpose_fill, inplace=True)

In [5]:
# Convert both timestamp columns to datetimes. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
for date_col in ('START_DATE', 'END_DATE'):
    dataset[date_col] = pd.to_datetime(dataset[date_col], errors='coerce')

In [6]:
# NOTE(review): `datetime` is not used in the visible cells; the import is
# kept in case other code relies on it.
from datetime import datetime

# Derive the calendar date and the hour of day from each trip's start time.
start_index = pd.DatetimeIndex(dataset['START_DATE'])
dataset['date'] = start_index.date
dataset['time'] = start_index.hour

# Bucket the start hour into parts of the day.
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bin is (0, 10] and trips starting in hour 0 would get NaN (and then be
# removed by the later dropna). include_lowest=True makes the first bin
# include 0, so every hour 0-23 receives a label; hour 0 is labelled
# 'Morning'.
dataset['day-night'] = pd.cut(x=dataset['time'],
                              bins=[0, 10, 15, 19, 24],
                              labels=['Morning', 'Afternoon', 'Evening', 'Night'],
                              include_lowest=True)

In [7]:
# Remove every row that has a missing value in any column.
dataset.dropna(axis=0, how='any', inplace=True)

# Remove exact duplicate rows, keeping the first occurrence of each.
dataset.drop_duplicates(keep='first', inplace=True)

In [None]:
# Boolean mask over the columns: True where the dtype is 'object'.
obj = dataset.dtypes.eq('object')
object_cols = obj.index[obj].tolist()

# Number of distinct values in each object-typed column.
unique_values = {col: dataset[col].unique().size for col in object_cols}
unique_values

In [None]:
# Side-by-side count plots of trip CATEGORY and trip PURPOSE.
fig, (ax_category, ax_purpose) = plt.subplots(1, 2, figsize=(10, 5))

# The column is named through `x=` explicitly: countplot's first positional
# parameter is `data`, so a bare positional Series is not guaranteed to be
# used as the x variable. This also matches the data=/x= call style used by
# the PURPOSE-by-CATEGORY plot in this notebook.
sns.countplot(data=dataset, x='CATEGORY', ax=ax_category)
ax_category.tick_params(axis='x', labelrotation=90)

sns.countplot(data=dataset, x='PURPOSE', ax=ax_purpose)
ax_purpose.tick_params(axis='x', labelrotation=90)

fig.tight_layout()
plt.show()

In [None]:
# Number of trips per part of the day.
# The column is named through `x=` explicitly: countplot's first positional
# parameter is `data`, so a bare positional Series is not guaranteed to be
# used as the x variable.
sns.countplot(data=dataset, x='day-night')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Trip counts per PURPOSE, with one bar colour per CATEGORY.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x='PURPOSE', hue='CATEGORY', data=dataset, ax=ax)
# Turn the purpose labels vertical.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [12]:
# One-hot encode the two categorical columns and swap them for the encoded
# indicator columns.
object_cols = ['CATEGORY', 'PURPOSE']
OH_encoder = OneHotEncoder(sparse_output=False)
encoded_values = OH_encoder.fit_transform(dataset[object_cols])

# Build the encoded frame on the same row index as `dataset` so the concat
# below lines up row for row; column names come from the fitted encoder.
OH_cols = pd.DataFrame(
    encoded_values,
    index=dataset.index,
    columns=OH_encoder.get_feature_names_out(input_features=object_cols),
)

df_final = dataset.drop(columns=object_cols)
dataset = pd.concat([df_final, OH_cols], axis=1)



In [None]:
# Keep only the float/int columns and compute their pairwise correlations.
numeric_frame = dataset.select_dtypes(include=[float, int])
numeric_cols = numeric_frame.columns
corr_matrix = numeric_frame.corr()

# Annotated heatmap of the correlation matrix.
heatmap_style = dict(cmap='BrBG', annot=True, fmt='.2f', linewidths=2)
plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, **heatmap_style)
plt.show()

In [None]:
# Label every trip with the (abbreviated) month name of its start date.
dataset['MONTH'] = pd.DatetimeIndex(dataset['START_DATE']).month
month_label = {1.0: 'Jan', 2.0: 'Feb', 3.0: 'Mar', 4.0: 'April',
               5.0: 'May', 6.0: 'June', 7.0: 'July', 8.0: 'Aug',
               9.0: 'Sep', 10.0: 'Oct', 11.0: 'Nov', 12.0: 'Dec'}
dataset["MONTH"] = dataset.MONTH.map(month_label)

# Number of trips per month and the longest trip (max MILES) per month.
mon = dataset.MONTH.value_counts(sort=False)
max_miles = dataset.groupby('MONTH', sort=False)['MILES'].max()

# Align the counts to the groupby result by month label instead of pairing
# them by position via `.values`: the two results are computed independently,
# and a positional pairing would silently attach counts to the wrong month if
# their orderings ever differed.
# NOTE(review): the "MONTHS" column holds the per-month trip count and
# "VALUE COUNT" holds the per-month maximum MILES; names kept as-is.
df = pd.DataFrame({"MONTHS": mon.reindex(max_miles.index),
                   "VALUE COUNT": max_miles})

p = sns.lineplot(data=df)
p.set(xlabel="MONTHS", ylabel="VALUE COUNT")

In [None]:
# Make sure START_DATE is datetime-typed, then derive the weekday name.
dataset['START_DATE'] = pd.to_datetime(dataset['START_DATE'])
dataset['DAY'] = dataset['START_DATE'].dt.day_name()

# Trips per weekday, in the order value_counts returns them.
day_label = dataset['DAY'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=day_label.index, y=day_label.values, ax=ax)
ax.set_xlabel('DAY')
ax.set_ylabel('COUNT')
ax.set_title('Frequency of Each Day in the Dataset')
plt.show()

In [None]:
# Box plot of the MILES column over all remaining trips.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=dataset['MILES'], ax=ax)
ax.set_xlabel('MILES')
ax.set_title('Box Plot of MILES')
plt.show()

In [None]:
# Same box plot, restricted to trips shorter than 100 miles.
under_100 = dataset['MILES'] < 100
filtered_data = dataset.loc[under_100]

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=filtered_data['MILES'], ax=ax)
ax.set_xlabel('MILES')
ax.set_title('Box Plot of MILES (Less than 100)')
plt.show()

In [None]:
# Histogram (with a KDE overlay) of MILES for trips shorter than 40 miles.
short_trip_miles = dataset.loc[dataset['MILES'] < 40, 'MILES']

plt.figure(figsize=(10, 6))
sns.histplot(short_trip_miles, kde=True)

# Adding labels and a title.
# histplot is called without `stat`, so the bars show counts per bin; the
# y axis is labelled accordingly (it was previously labelled 'Density').
# Pass stat='density' instead if a normalised histogram is wanted.
plt.xlabel('MILES')
plt.ylabel('Count')
plt.title('Distribution of MILES (Less than 40)')
plt.show()