In [1]:
#!/usr/bin/env python

'''

__author__ = "Bijan Vafaei"
__copyright__ = "Copyright 2019, Data Science Assessment"
__credits__ = 
__license__ = 
__version__ = "1.0.2"
__maintainer__ = 
__email__ = "bvafaei@epsteinglobal.com"
__status__ = "Prototype"

'''

# Importing required libraries

import os
import sys
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score




In [2]:
# Reading the Data from csv file and loading into a pandas dataframe

df_parking_citations = pd.DataFrame(data = pd.read_csv('input/parking_citations.corrupted.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Data Preparation and Cleansing

# Dropping Rows With Missing Values
# Description: 
    # Drop the datapoint with Null value in the following 11 columns. These columns  have less than 40'000 null value
    # Effecting less than 1% of the whole training and validation data points

df_parking_citations = df_parking_citations.dropna(     axis = 'index', 
                                                        subset = 
                                                                ['Issue time', 
                                                                 'RP State Plate', 
                                                                 'Body Style', 
                                                                 'Color', 
                                                                 'Location',
                                                                 'Route',
                                                                 'Agency', 
                                                                 'Violation Description', 
                                                                 'Fine amount', 
                                                                 'Latitude', 
                                                                 'Longitude']
                                                  )

In [4]:
# Data Preparation and Cleansing

# Dropping Columns Meter Id, Marked Time, and VIN due to high percentage of Missing Values

df_parking_citations = df_parking_citations.drop(   columns = 
                                                                 ['Meter Id',
                                                                  'Marked Time',
                                                                  'VIN'                                            
                                                                 ]
                                                )

In [5]:
# Data Preparation and Cleansing

# Replacing Plate Expiry Date with a valid constant value
# Filling the null values with the constant of 2020-12 assuming the States have 3 years expiry date program
    
df_parking_citations['Plate Expiry Date'].fillna(value= 202012.00, inplace= True )

In [6]:
# Data Preparation and Cleansing

# Primary Key (Ticket number) contains a letter D at the end of them which 
# shows that those ticket number were deleted or meant to be deleted
# Remove those letter for the porpuse of this analytics

df_parking_citations['Ticket number'] = df_parking_citations['Ticket number'].astype(str).str.replace('D', '')

In [7]:
# Data Preparation and Cleansing

# Change the data type from object to time stamp for Issue Date
# TODO: with Issue Date we can identify day of week, weekend or weekdays, and holiday to expand 

df_parking_citations['Issue Date'] =  pd.to_datetime(df_parking_citations['Issue Date'], format='%Y-%m-%dT%H:%M:%S')

In [8]:
# Representation Transformation

# Create the additional variable 'Time of Day' to identify the ticket issuance time of day: 
# Key: AM: Morning ,  PM: Afternoon 

df_parking_citations['Time of Day'] =  [ 'AM' if x < 1200 else 'PM' for x in df_parking_citations['Issue time']]

In [11]:
# Identifying the Categorical Variables 

categorical_feature_mask = df_parking_citations.dtypes==object


# Filter categorical columns using mask and turn it into a list

categorical_cols = df_parking_citations.columns[categorical_feature_mask].tolist()

# Removing Make from the list of categorical columns to avoid labeling it
categorical_cols.remove('Make')

In [12]:
# Pre-Processing
# Representation Transformation: Converting categorical features to numeric representation (Encoding Categorical Variables)


# Define the encoder object

le = LabelEncoder()


df_encode = df_parking_citations


# Apply the lebel encoder on the categorical feature columns

df_encode[categorical_cols] = df_encode[categorical_cols].apply(lambda col: le.fit_transform(col))


In [14]:
# Pre_processing
#Normalization of the 'Fine amount'

x= df_parking_citations['Fine amount']

# Normalization to using Min-Max Scaler
df_encode['Fine amount'] = (x-x.min())/(x.max()-x.min())

In [15]:
# Index selection: The uncorrupted subset of the input goes for training and testing
# The  data-points with the Make feature will be used for training and testing

df_train_test = df_encode.dropna(axis = 'index', subset = ['Make'])



# Separating the data-points that the Make feature has been accidentally deleted and need to be predicted

df_predict = df_encode[df_encode['Make'].isnull()]


In [16]:
# Aggregation: 
# Identifying the top 25 common Make using groupby and aggregation methods

df_common_make = df_train_test[['Ticket number','Make']].groupby(['Make']) \
                                                            .agg('count') \
                                                            .sort_values('Ticket number', ascending = False) \
                                                            .head(25)\
                                                            .reset_index()\
                                                            
l_common_make = df_common_make['Make'].tolist()

df_train_test ['Common make'] = [1 if x in l_common_make else 0 for x in df_train_test['Make']]


df_predict['Common make'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [17]:
# Pre-processing
# Split data into training and testing sets

X_train_test = df_train_test.drop(['Issue Date','Make', 'Common make'],axis =1)
y_train_test = df_train_test['Common make']

# Returning labeled and prediction features for training and test
# Library from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_test,y_train_test, random_state = 200, test_size=0.2)

In [18]:
# Returning labeled and prediction features for the subset of the input that requires the Make feature to be predicted

X_predict = df_predict.drop(['Issue Date','Make', 'Common make'],axis =1)
y_predict = df_predict['Common make']

In [19]:
# KNN Model

# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 100)

# Fit the classifier to the data
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=100, p=2,
           weights='uniform')

In [None]:
# Checking the K-fold Cross Validation instead of train-test_split method

kfold = KFold(n_splits=5, random_state=None)

#train model with cv of 5 
cv_scores = cross_val_score(knn, X_train_test, y_train_test, cv=kfold)

In [278]:
# Prediction of new observation

y1_predict = knn.predict(X1_predict)

array([0], dtype=int64)

In [279]:
# calculating the probability of the new observation being among top 25 common Make
proba_predict1 = knn.predict_proba(X1_predict)

In [None]:
# The probability of the new observation being made by one of the top 25 common Make
Common_make_proba_predict1 = proba_predict1.tolist()