In [51]:
# importing modules and relevant data set

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Visualization
import seaborn as sns
import os

# Load the raw bid-level data.
df = pd.read_csv('auction.csv')
# The file can carry an "Unnamed: 0" column (it looks like a positional index
# written out by an earlier export). It is not a feature of the data, so drop
# it when present; errors='ignore' keeps this a no-op for a clean file.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,auctionid,bid,bidtime,bidder,bidderrate,openbid,price,item,auction_type
0,1638843936,500.0,0.478368,kona-java,181.0,500.0,1625.0,Cartier wristwatch,7 day auction
1,1638843936,800.0,0.826389,doc213,60.0,500.0,1625.0,Cartier wristwatch,7 day auction
2,1638843936,600.0,3.761123,zmxu,7.0,500.0,1625.0,Cartier wristwatch,7 day auction
3,1638843936,1500.0,5.226377,carloss8055,5.0,500.0,1625.0,Cartier wristwatch,7 day auction
4,1638843936,1600.0,6.570625,jdrinaz,6.0,500.0,1625.0,Cartier wristwatch,7 day auction


##### The variables involved in the data set are as follows:

* **auctionid:** unique id for each auction
* **bid:** proxy bid placed by bidder
* **bidtime:** time the bid is placed in relation to total auction length
* **bidder:** bidder username
* **bidderrate:** bidder's rating on eBay
* **openbid:** opening bid set by the seller
* **price:** closing price of the auction
* **item:** name of item
* **auction_type:** type of auction (3day, 5day, 7day)




##### Checking the dataset for null values and addressing said null values

In [3]:
# Count the missing entries in each column (`isna` is the same check as `isnull`).
df.isna().sum()

auctionid        0
bid              0
bidtime          0
bidder          16
bidderrate      11
openbid          0
price            0
item             0
auction_type     0
dtype: int64

In [4]:
# Shape of the subset of rows missing the bidder rating and/or the bidder name.
print(
    'Shape of entries with null values: ',
    df.loc[df[['bidderrate', 'bidder']].isnull().any(axis=1)].shape,
)

Shape of entries with null values:  (27, 9)


#### Replace null values (null bidder username --> 'Private', null bidderrate --> column mean (32))

In [5]:
# Impute missing values by assigning the filled column back to the frame.
# `df.col.fillna(..., inplace=True)` works on an intermediate Series: pandas 2.x
# emits a chained-assignment warning for it, and with Copy-on-Write enabled the
# original frame is left unchanged.
df['bidder'] = df['bidder'].fillna('Private')    # placeholder for missing usernames
df['bidderrate'] = df['bidderrate'].fillna(32)   # 32 is the column mean quoted in the heading above

##### Taking a look at total unique values in each attribute

In [5]:
# Number of distinct non-null values per column.
df.nunique(axis=0, dropna=True)

auctionid         628
bid              1858
bidtime         10400
bidder           3387
bidderrate        301
openbid           109
price             318
item                3
auction_type        3
dtype: int64

##### The total number of auctions is the most valuable piece of data, and we will try to derive more information from it. We would also like to find out whether each type of auction caters only to a specific kind of item

In [52]:
# Secondary dataframe with one row per auction (indexed by auctionid), derived
# from the bid-level dataset: number of bids plus opening and closing prices.
by_auction = df.groupby('auctionid')
auctions_df = pd.DataFrame({
    'bid_count': by_auction['bid'].count(),
    'opening_price': by_auction['openbid'].mean(),
    'closing_price': by_auction['price'].mean(),
})

In [65]:
def get_item(auctionid_input):
    """Return the item recorded for the given auction id.

    Looks the id up in the module-level ``df`` and returns the first distinct
    ``item`` value among the matching rows. Raises ``IndexError`` when no row
    has that auction id.
    """
    matching_items = df.loc[df['auctionid'] == auctionid_input, 'item']
    return matching_items.unique()[0]
def get_auction_type(auctionid_input):
    """Return the auction type recorded for the given auction id.

    Looks the id up in the module-level ``df`` and returns the first distinct
    ``auction_type`` value among the matching rows. Raises ``IndexError`` when
    no row has that auction id.
    """
    matching_types = df.loc[df['auctionid'] == auctionid_input, 'auction_type']
    return matching_types.unique()[0]

# Enrich the per-auction summary with profit figures, the item sold and the
# auction type.

# Move auctionid out of the index only if it is not already a column, so that
# re-running this cell does not insert a stray 'index' column.
if 'auctionid' not in auctions_df.columns:
    auctions_df = auctions_df.reset_index()

auctions_df['profit'] = auctions_df['closing_price'] - auctions_df['opening_price']
# Profit as a percentage of the opening price. The earlier closing/opening
# ratio reported 100% for an auction that closed at its opening price, i.e.
# one with zero profit.
auctions_df['profit(%)'] = (auctions_df['profit'] / auctions_df['opening_price']) * 100
auctions_df['item'] = auctions_df['auctionid'].apply(get_item)
auctions_df['auction_type'] = auctions_df['auctionid'].apply(get_auction_type)

In [66]:
# Export the raw bids and the per-auction summary as two sheets of one workbook.
# The context manager saves and closes the file even if a write fails;
# `ExcelWriter.save()` was removed in pandas 2.0, and the sheet name is passed
# by keyword because the positional form is deprecated.
with pd.ExcelWriter('auction.xlsx') as writer:
    df.to_excel(writer, sheet_name='raw', index=False)
    auctions_df.to_excel(writer, sheet_name='auctions', index=False)

###### TODO:
- auctions_df: new column ['total bidder'] quantifying number of unique bidders involved
- new dataset bidders_df ['total bids', 'average bids', 'won_bids']