In [None]:
#import os
#os.chdir("Applied Data Science\project2\code")
#os.getcwd()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
#from scipy.stats import chi2_contingency

In [None]:
# Read the input table from its Feather file (path is relative to the
# notebook's working directory) and display it as the cell output.
df = pd.read_feather('data/clean.feather')
df

In [None]:
# Size of the loaded table as a (rows, columns) tuple.
df.shape

In [None]:
# Pairwise correlation (DataFrame.corr default method) between the selected
# numeric columns, drawn as a heatmap and written to plots/correlation.png.
corr_attr = ['trip_distance', 'fare_amount', 'tip_amount', 'duration']
corr = df[corr_attr].corr()

# Diverging palette with the colour scale centred on zero correlation.
diverging_cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=diverging_cmap, square=True, center=0)

# The figure is saved without a title (the title call was left disabled).
plt.savefig('plots/correlation.png')
plt.show()

In [None]:
# Histograms of the untransformed continuous variables on a 2x2 grid.
# Each entry is (column, panel title, x-axis label); panels are filled in
# row-major order (top-left, top-right, bottom-left, bottom-right).
raw_hist_panels = [
    ('trip_distance', "Distribution of Trip distance", 'Trip distance (mile)'),
    ('tip_amount', "Distribution of Tip amount", 'Tip amount (USD)'),
    ('duration', "Distribution of Duration", 'Duration (min)'),
    ('revenue', "Distribution of Revenue", 'Revenue (USD/hour)'),
]

fig = plt.figure(figsize=(10, 7))
for position, (column, title, x_label) in enumerate(raw_hist_panels, start=1):
    plt.subplot(2, 2, position)
    sns.distplot(df[column], kde=False, label=column, color="blue")
    plt.title(title)
    plt.xlabel(x_label)
    # With kde=False (and norm_hist left at its default) distplot draws raw
    # bin counts, not a normalised density, so the axis is labelled as counts.
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Normal Q-Q plots of the untransformed continuous variables on a 2x2 grid.
# Each entry is (column, panel title); panels are filled in row-major order.
raw_qq_panels = (
    ('trip_distance', "Normal Q-Q Plot for Trip distance"),
    ('tip_amount', "Normal Q-Q Plot for Tip amount"),
    ('duration', "Normal Q-Q Plot for Duration"),
    ('revenue', "Normal Q-Q Plot for Revenue"),
)

fig = plt.figure(figsize=(20, 10))
for position, (column, title) in enumerate(raw_qq_panels, start=1):
    plt.subplot(2, 2, position)
    # plot=plt makes probplot draw onto the currently active subplot.
    stats.probplot(df[column], dist="norm", plot=plt)
    # Set the panel's title and axis labels after probplot has drawn.
    plt.title(title)
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel("Sample Quantiles")

plt.tight_layout()
plt.show()

# Transform

In [None]:
def check_if_0(x):
    """Return a small positive stand-in for zero, otherwise the value itself.

    Parameters
    ----------
    x : number
        Value to inspect.

    Returns
    -------
    number
        ``0.00001`` when ``x == 0``; ``x`` unchanged in every other case
        (including negative values and NaN, which never compare equal to 0).
    """
    return 0.00001 if x == 0 else x

# Replace exact-zero tips with a tiny positive value. The Box-Cox transform
# applied to this column further down only accepts strictly positive data.
df['tip_amount'] = df['tip_amount'].map(check_if_0)

In [None]:
from sklearn.model_selection import train_test_split

# Draw a fixed-size training sample of 200,000 rows; every remaining row goes
# to the test split. The fixed random_state keeps the shuffle reproducible.
train, test = train_test_split(df, train_size=200000, random_state=60, shuffle=True)

# Rebind to the frames returned by reset_index instead of resetting in place.
# The split results are row selections of `df`; mutating them in place keeps
# that link, and the later column assignments on `train`/`test` can then raise
# pandas' SettingWithCopyWarning. The non-inplace call returns new,
# independent frames with a clean 0..n-1 index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Persist both splits as Feather files, training split first.
for split_frame, destination in (
    (train, 'data/train.feather'),
    (test, 'data/test.feather'),
):
    split_frame.to_feather(destination)

In [None]:
# Fit a Box-Cox transform to each continuous column of the training split.
# Each column is overwritten in `train` with its transformed values, and the
# fitted lambda is stored in `lambda_list` in the same order as
# `continuous_col`, so it can be reused on the test split.
# Note: this cell modifies `train` in place, so running it twice transforms
# the already-transformed columns again.
continuous_col = ['trip_distance', 'fare_amount', 'tip_amount','duration', 'revenue']
lambda_list = []
for column in continuous_col:
    # lmbda=None asks boxcox to estimate lambda and return it alongside the
    # transformed values.
    transformed, fitted_lambda = stats.boxcox(train[column], lmbda=None)
    train[column] = transformed
    lambda_list.append(fitted_lambda)


In [None]:
# Histograms of the Box-Cox transformed training columns on a 2x2 grid.
# Each entry is (column, panel title, x-axis label); panels are filled in
# row-major order.
# NOTE(review): the x-axis labels still quote the original units although the
# plotted values are on the transformed scale - confirm this is intended.
train_hist_panels = [
    ('trip_distance', "Distribution of transformed Trip distance", 'Trip distance (mile)'),
    ('tip_amount', "Distribution of transformed Tip amount", 'Tip amount (USD)'),
    ('duration', "Distribution of transformed Duration", 'Duration (min)'),
    ('revenue', "Distribution of transformed Revenue", 'Revenue (USD/hour)'),
]

fig = plt.figure(figsize=(10, 7))
for position, (column, title, x_label) in enumerate(train_hist_panels, start=1):
    plt.subplot(2, 2, position)
    # The series label is the column name for every panel (the revenue panel
    # previously used "income", unlike the other histogram cells).
    sns.distplot(train[column], kde=False, label=column, color="blue")
    plt.title(title)
    plt.xlabel(x_label)
    # With kde=False (and norm_hist left at its default) distplot draws raw
    # bin counts, not a normalised density, so the axis is labelled as counts.
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Normal Q-Q plots of the Box-Cox transformed training columns on a 2x2 grid.
# Each entry is (column, panel title); panels are filled in row-major order.
train_qq_panels = (
    ('trip_distance', "Normal Q-Q Plot for transformed Trip distance"),
    ('tip_amount', "Normal Q-Q Plot for transformed Tip amount"),
    ('duration', "Normal Q-Q Plot for transformed Duration"),
    ('revenue', "Normal Q-Q Plot for transformed Revenue"),
)

fig = plt.figure(figsize=(20, 10))
for position, (column, title) in enumerate(train_qq_panels, start=1):
    plt.subplot(2, 2, position)
    # plot=plt makes probplot draw onto the currently active subplot.
    stats.probplot(train[column], dist="norm", plot=plt)
    # Set the panel's title and axis labels after probplot has drawn.
    plt.title(title)
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel("Sample Quantiles")

plt.tight_layout()
plt.show()

In [None]:
# Transform the test split with the lambdas fitted on the training split, so
# both splits share exactly the same Box-Cox parameters. `lambda_list` is
# indexed by position and must still be the plain list built during fitting.
for position, column in enumerate(continuous_col):
    # With an explicit lmbda, boxcox returns only the transformed values.
    test[column] = stats.boxcox(test[column], lmbda=lambda_list[position])



In [None]:
# Histograms of the Box-Cox transformed test columns on a 2x2 grid.
# Each entry is (column, panel title, x-axis label); panels are filled in
# row-major order.
# NOTE(review): the x-axis labels still quote the original units although the
# plotted values are on the transformed scale - confirm this is intended.
test_hist_panels = [
    ('trip_distance', "Distribution of transformed Trip distance", 'Trip distance (mile)'),
    ('tip_amount', "Distribution of transformed Tip amount", 'Tip amount (USD)'),
    ('duration', "Distribution of transformed Duration", 'Duration (min)'),
    ('revenue', "Distribution of transformed Revenue", 'Revenue (USD/hour)'),
]

fig = plt.figure(figsize=(10, 7))
for position, (column, title, x_label) in enumerate(test_hist_panels, start=1):
    plt.subplot(2, 2, position)
    sns.distplot(test[column], kde=False, label=column, color="blue")
    plt.title(title)
    plt.xlabel(x_label)
    # With kde=False (and norm_hist left at its default) distplot draws raw
    # bin counts, not a normalised density, so the axis is labelled as counts.
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Normal Q-Q plots of the Box-Cox transformed test columns on a 2x2 grid.
# Each entry is (column, panel title); panels are filled in row-major order.
# The first title previously read "tansformed trip distance"; it now matches
# the wording used for the training-split Q-Q plots.
test_qq_panels = (
    ('trip_distance', "Normal Q-Q Plot for transformed Trip distance"),
    ('tip_amount', "Normal Q-Q Plot for transformed Tip amount"),
    ('duration', "Normal Q-Q Plot for transformed Duration"),
    ('revenue', "Normal Q-Q Plot for transformed Revenue"),
)

fig = plt.figure(figsize=(20, 10))
for position, (column, title) in enumerate(test_qq_panels, start=1):
    plt.subplot(2, 2, position)
    # plot=plt makes probplot draw onto the currently active subplot.
    stats.probplot(test[column], dist="norm", plot=plt)
    # Set the panel's title and axis labels after probplot has drawn.
    plt.title(title)
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel("Sample Quantiles")

plt.tight_layout()
plt.show()


In [None]:
# Export the transformed splits in both CSV (without the index column) and
# Feather form; the training split is written first, CSV before Feather.
for split_frame, csv_path, feather_path in (
    (train, 'data/train_scaled.csv', 'data/train_scaled.feather'),
    (test, 'data/test_scaled.csv', 'data/test_scaled.feather'),
):
    split_frame.to_csv(csv_path, index=False)
    split_frame.to_feather(feather_path)


In [None]:
# Arrange the fitted Box-Cox lambdas as a one-row table with one column per
# transformed feature, and display it.
# Note: this rebinds `lambda_list` from a list to a DataFrame. The cell that
# applies the lambdas to the test split indexes `lambda_list` by position, so
# it has to run before this one.
lambda_list = pd.DataFrame(data=[lambda_list], columns=continuous_col)
lambda_list

In [None]:
# Store the fitted lambdas next to the data files.
# NOTE(review): unlike the other CSV exports in this notebook, this call does
# not pass index=False, so the row index is written as an extra first column -
# confirm that whatever reads this file expects it.
lambda_list.to_csv('data/lambda_list.csv')