In [1]:
# Core data-handling libraries.
import pandas as pd
import numpy as np
from pathlib import Path
# Directory (relative to the notebook) from which the input CSV files are read.
path = Path('../input/telematics')
# Plotting libraries.
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Use seaborn's 'darkgrid' style for subsequently drawn figures.
sns.set_style('darkgrid')
import random
import os
# Profiling library used to build the ProfileReport of `sample`.
import pandas_profiling as pp

In [2]:
# Notebook-wide plotting defaults, set in a single call.
matplotlib.rcParams.update({
    'figure.figsize': (12, 6),  # default figure size, in inches
    'font.size': 15,            # default font size for plot text
})

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Seed Python's built-in RNG so draws from `random` are repeatable.
random.seed(0)

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# List the entries of the input directory to see which files are available.
os.listdir(path)

In [4]:
# Load the two CSV files from the input directory into DataFrames.
trips_csv = path.joinpath('sample_trips.csv')
summary_csv = path.joinpath('simulated_summary_total.csv')
sample = pd.read_csv(trips_csv)
simulated = pd.read_csv(summary_csv)

In [5]:
# Print a concise summary of the trips frame (columns, non-null counts, dtypes).
sample.info()

In [6]:
# Preview the first rows of the trips data.
sample.head()

In [7]:
# Number of rows recorded for each distinct trip_nb value, most frequent first.
rows_per_trip = sample['trip_nb'].value_counts()
print(rows_per_trip)

In [8]:
# Bar chart of how many rows each trip contributes.
# The per-trip counts are computed once and reused for both axes, so the x
# positions and bar heights are guaranteed to come from the same tally.
trip_counts = sample['trip_nb'].value_counts()
ax = sns.barplot(x=trip_counts.index, y=trip_counts, palette=sns.color_palette('viridis'))
plt.xticks(rotation=45)  # trip labels are rotated to limit overlap
ax.set_title('trip_id vs frequency')
ax.set_xlabel('trips')
ax.set_ylabel('frequency')
plt.show()

In [9]:
# Summary statistics for the two coordinate columns, separated by a blank line.
latitude_summary = sample['latitude'].describe()
longitude_summary = sample['longitude'].describe()
print(
    'Statistical description of Latitude Column',
    latitude_summary,
    '',  # empty item -> blank line between the two summaries
    'Statistical description of Longitude Column',
    longitude_summary,
    sep='\n',
)

In [10]:
# dtype of every column, as a plain Python list (in column order).
sample.dtypes.tolist()

In [11]:
# Percentage of missing values in each column of interest.
n = len(sample)

# Vectorised null counts: Series.isnull().sum() counts in one pass instead of
# iterating the Series element by element with the builtin sum().
trip_nb_missing = int(sample['trip_nb'].isnull().sum())
local_dtm_missing = int(sample['local_dtm'].isnull().sum())
latitude_missing = int(sample['latitude'].isnull().sum())
longitude_missing = int(sample['longitude'].isnull().sum())

missing_by_column = {
    'trip_nb': trip_nb_missing,
    'local_dtm': local_dtm_missing,
    'latitude': latitude_missing,
    'longitude': longitude_missing,
}

for column_name, missing_count in missing_by_column.items():
    # An empty frame has nothing missing; report 0 rather than divide by zero.
    pct_missing = (missing_count / n) * 100 if n else 0.0
    print(f'Percentage of Missing Values in {column_name} column {pct_missing} %')

In [12]:
# Rows that repeat an earlier row in every column (first occurrence is not flagged).
is_repeat = sample.duplicated()
duplicatedRows = sample.loc[is_repeat]
print('Duplicated Rows in the Dataset are :')
print(duplicatedRows)

In [13]:
# Show every row recorded at each of two specific raw timestamps.
for raw_stamp in ('20MAY17:11:53:50', '22MAY17:17:01:33'):
    df = sample[sample['local_dtm'] == raw_stamp]
    print(df)

In [14]:
# Share of rows flagged as duplicates, as a percentage of all rows.
duplicate_pct = (len(duplicatedRows) / n) * 100
print('Percentage of Duplicated Rows :{:.2f}%'.format(duplicate_pct))

In [15]:
# Check the cardinality of each column: the number of distinct values it holds
# (missing values are not counted by nunique() by default).
sample.nunique()

In [17]:
# Look at one `local_dtm` value (row label 1) to see its format.
sample.loc[1, 'local_dtm']

In [18]:
def date_preprocess(string):
    """Convert a raw timestamp string into a pandas Timestamp.

    The input is split at its first colon into a date part and a time part
    (e.g. '20MAY17:11:53:50' -> '20MAY17' and '11:53:50'); the two parts are
    joined with a space and handed to ``pd.to_datetime``, which infers the
    format.

    Parameters
    ----------
    string : str
        Raw timestamp containing at least one colon.

    Returns
    -------
    The value returned by ``pd.to_datetime`` for the re-joined string.

    Raises
    ------
    ValueError
        If ``string`` contains no colon (the split yields a single piece).
    """
    day_part, clock_part = string.split(':', maxsplit=1)
    return pd.to_datetime(f'{day_part} {clock_part}')

In [19]:
# Parse the raw `local_dtm` strings into pandas datetimes.
# Guarded so the cell can be re-run safely: once the column is datetime-typed,
# applying date_preprocess again would fail because Timestamp objects have no
# `.split` method.
if not pd.api.types.is_datetime64_any_dtype(sample['local_dtm']):
    sample['local_dtm'] = sample['local_dtm'].apply(date_preprocess)

In [20]:
# Preview the first rows again, after `local_dtm` has been processed.
sample.head()

In [21]:
# Covariance matrix of the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns (e.g. the datetime `local_dtm`) are excluded explicitly:
# recent pandas versions no longer drop them silently and raise instead.
numeric_for_cov = sample.select_dtypes(include='number')
cov_matrix = numeric_for_cov.cov()
cmap = plt.cm.RdBu
# Covariances are not bounded to [-1, 1], so the colour scale is centred on 0
# and sized from the data rather than clipped to a fixed correlation-style range.
sns.heatmap(cov_matrix, linewidths=0.2, center=0, square=True, cmap=cmap, linecolor='white', annot=True, fmt='.2f')
plt.title('Covariance Matrix')
plt.show()

In [22]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns (e.g. the datetime `local_dtm`) are excluded explicitly:
# recent pandas versions no longer drop them silently and raise instead.
numeric_for_corr = sample.select_dtypes(include='number')
corr_matrix = numeric_for_corr.corr()
cmap = plt.cm.RdBu
# Correlations lie in [-1, 1], so the colour scale is pinned to that range.
sns.heatmap(corr_matrix, linewidths=0.2, vmax=1.0, vmin=-1., square=True, cmap=cmap, linecolor='white', annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [23]:
# Summary statistics of the trips frame.
sample.describe()

In [24]:
# Linear (Pearson) and rank-based (Spearman) correlation between the coordinates.
from scipy.stats import pearsonr, spearmanr

long_values = sample['longitude'].to_numpy()
lat_values = sample['latitude'].to_numpy()

# Each test returns (statistic, p-value); only the statistic is reported.
corr, _ = pearsonr(long_values, lat_values)
print('Pearsons correlation between latitude and longitude: %.3f' % corr)

corr, _ = spearmanr(long_values, lat_values)
print('Spearmans correlation between latitude and longitude: %.3f' % corr)

In [25]:
# Skewness of each numeric column.
# Restricted to numeric columns: recent pandas versions raise on the datetime
# `local_dtm` column instead of silently skipping it.
sample.select_dtypes(include='number').skew()

In [26]:
# Kurtosis of each numeric column.
# Restricted to numeric columns: recent pandas versions raise on the datetime
# `local_dtm` column instead of silently skipping it.
sample.select_dtypes(include='number').kurtosis()

In [27]:
# Build a pandas-profiling report for the trips frame; as the cell's last
# expression, the resulting report object is what the notebook displays.
pp.ProfileReport(sample)