In [None]:
# The PowerPoint presentation file (Data Analysis on Pricing Pecision.pptx)
# is attached for an overview of the analysis.

In [None]:
library(xlsx)
library(dplyr)
library(readxl)
library(tidyr)
library(caTools)
library(corrplot)
library(caret)
library(randomForest)

In [None]:
# Load the raw data set. The workbook is read through an explicit path so the
# session's working directory is left untouched.
data_dir <- "C:/Users/Barath/Downloads"
Test_csv <- read_excel(file.path(data_dir, "Test.csv.xlsx"))

In [None]:
# Encode change_reason_pricing as numeric factor codes in a new column
# (reason_for_pricing), then replace the NA values of every numeric column
# with that column's mean.
Test_csv <- Test_csv %>%
  mutate(reason_for_pricing = as.numeric(as.factor(change_reason_pricing))) %>%
  mutate(
    across(where(is.numeric), ~ replace_na(.x, mean(.x, na.rm = TRUE)))
  ) %>%
  as.data.frame()

# Quick per-column overview of the cleaned data
summary(Test_csv)

In [None]:
# Keep only the columns needed for the analysis; the columns listed below are
# discarded and the remaining ones stay in their original order.
Test_csv <- Test_csv[
  ,
  !(names(Test_csv) %in% c(
    "order_id_new", "order_try_id_new", "device_name", "driver_app_version",
    "rider_app_version", "order_try_state", "calc_created", "b_state",
    "order_state", "prediction_price_type", "device_token",
    "driver_device_uid_new", "entered_by", "change_reason_pricing",
    "ticket_id_new"
  )),
  drop = FALSE
]

# Count the NA values still present in the data
sum(is.na(Test_csv))

In [None]:
# Plot the pairwise correlation matrix of the remaining columns
Test_csv %>%
  cor() %>%
  corrplot()

In [None]:
# Draw one box per column to see whether outliers are present. The data frame
# is passed directly; re-selecting every column by name was redundant.
boxplot(Test_csv)

In [None]:
## IF OUTLIER TREATMENT IS NOT NEEDED, COMMENT OUT THE WHOLE CELL


#Outlier Treatment
#' Flag IQR-based outliers in a numeric vector
#'
#' A value is flagged when it lies more than `k` interquartile ranges below
#' the first quartile or above the third quartile.
#'
#' @param x A numeric vector. Missing values are ignored when the quartiles
#'   are computed and are never flagged as outliers.
#' @param k Multiplier applied to the interquartile range to build the
#'   fences. Defaults to 1.5.
#' @return An unnamed logical vector the same length as `x`; `TRUE` marks an
#'   outlier.
detect_outlier <- function(x, k = 1.5) {
  # names = FALSE keeps the "25%"/"75%" labels from leaking into the result
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  lower_fence <- quartiles[1] - k * spread
  upper_fence <- quartiles[2] + k * spread
  # Missing entries (and an all-NA input) yield FALSE instead of NA, so the
  # result can be used directly as a row filter
  !is.na(x) & (x > upper_fence | x < lower_fence)
}

#' Drop the rows that contain outliers in the selected columns
#'
#' Columns are processed one after another, so the quartiles of each column
#' are computed on the rows that survived the previous columns.
#'
#' @param dataframe A data frame.
#' @param columns Names of the columns to screen; defaults to every column.
#' @return `dataframe` without the rows flagged by `detect_outlier()`.
remove_outlier <- function(dataframe, columns = names(dataframe)) {
  for (col in columns) {
    keep <- !detect_outlier(dataframe[[col]])
    # drop = FALSE keeps a data frame even when it has a single column
    dataframe <- dataframe[keep, , drop = FALSE]
  }
  # Return the filtered data itself instead of printing the whole frame
  dataframe
}

# Screen every column of the data for outliers and keep the surviving rows
Price_Data <- remove_outlier(Test_csv, columns = names(Test_csv))




In [None]:
# Price_Data <- Test_csv      # Uncomment the line if not performing outlier treatment
# Set the target (metered_price) aside as the response vector
upfront_y <- Price_Data[["metered_price"]]
# Remove the target variable from the predictors by name, so the result does
# not depend on metered_price being the first column
Price_Data <- Price_Data[, names(Price_Data) != "metered_price", drop = FALSE]

In [None]:
# Split the data into training and test sets (p = 0.7 goes to training),
# with a fixed seed so the partition is reproducible
set.seed(123)
split <- createDataPartition(upfront_y, times = 1, p = 0.7, list = FALSE)

# Response values for each partition
trainy <- upfront_y[split]
testy <- upfront_y[-split]

# Predictors for each partition, with the matching response appended as the
# last column (named trainy / testy)
train <- cbind(Price_Data[split, ], trainy)
test <- cbind(Price_Data[-split, ], testy)

In [None]:
# Model 1: random forest on every predictor in the training frame,
# evaluated with 10-fold cross-validation
trControl <- trainControl(method = "cv", number = 10, search = "grid")

model1 <- train(
  trainy ~ .,
  data = train,
  method = "rf",
  trControl = trControl,
  tuneGrid = NULL
)
model1



In [None]:
# Model 2: random forest on an explicit subset of predictors (per the original
# note: without upfront_price, predicted_distance and predicted_duration),
# evaluated with 10-fold cross-validation
trControl <- trainControl(method = "cv", number = 10, search = "grid")

model2 <- train(
  trainy ~ duration + distance + gps_confidence + dest_change_number +
    eu_indicator + overpaid_ride_ticket + fraud_score + reason_for_pricing,
  data = train,
  method = "rf",
  trControl = trControl,
  tuneGrid = NULL
)
model2

In [None]:
# Score the test set with model1 and preview the first predictions
prediction1 <- predict(model1, newdata = test)
head(prediction1)

In [None]:
# Score the test set with model2 and preview the first predictions
prediction2 <- predict(model2, newdata = test)
head(prediction2)


In [None]:
# Line chart of actual vs predicted values for model1 on the test set:
# the green line is the observed test$testy, the blue line is prediction1.
plot(test$testy,type="l",col="green")
lines(prediction1,type="l",col="blue")

In [None]:
# Line chart of actual vs predicted values for model2 on the test set:
# the green line is the observed test$testy, the blue line is prediction2.
plot(test$testy,type="l",col="green")
lines(prediction2,type="l",col="blue")


In [None]:
# Predicted vs observed scatter plot for model1. The predictions computed for
# the test set (prediction1) are reused instead of being recomputed; points on
# the red identity line are predicted exactly.
plot(prediction1, testy, xlab = "Predicted Value", ylab = "Observed Value")
abline(a = 0, b = 1, col = "red", lwd = 2)

In [None]:
# Predicted vs observed scatter plot for model2. The predictions computed for
# the test set (prediction2) are reused instead of being recomputed; points on
# the red identity line are predicted exactly.
plot(prediction2, testy, xlab = "Predicted Value", ylab = "Observed Value")
abline(a = 0, b = 1, col = "red", lwd = 2)

In [None]:
# Final result for model1: upfront, metered (observed) and predicted price for
# every test-set row. `test` is the hold-out data frame created by the
# train/test split; select() alone fixes the output column order.
final_data1 <- cbind(test, prediction1) %>%
  rename(metered_price = testy, predicted_price = prediction1) %>%
  select(upfront_price, metered_price, predicted_price)

In [None]:
# Final result for model2: upfront, metered (observed) and predicted price for
# every test-set row. `test` is the hold-out data frame created by the
# train/test split; select() alone fixes the output column order.
final_data2 <- cbind(test, prediction2) %>%
  rename(metered_price = testy, predicted_price = prediction2) %>%
  select(upfront_price, metered_price, predicted_price)

In [None]:
# Export both result tables to a single workbook, one sheet per model.
# The second call appends its sheet to the file created by the first.
result_path <- path.expand("~/Result5.xlsx")
write.xlsx(final_data1, result_path, sheetName = "model1")
write.xlsx(final_data2, result_path, sheetName = "model2", append = TRUE)