# Fix Data

importing all the needed libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as p
import math
import re

%matplotlib inline


Reading the file from the csv file

In [None]:
gigs = pd.read_csv('gigs_data.csv', encoding='latin-1')
print(gigs.head())
len(gigs)

Removing the duplicates from the data frame. Duplicates might cause problems in the future, and we don't need them.

In [None]:
gigs.drop_duplicates(inplace=True)

The price of each gig was in NIS, when converting to utf-8, it added a weird symbol that we had to remove.
In addition to that, we saved the data in a list, and the csv file converted it to a string. We had to undo it, and transform it back to a list of floats.

In [None]:
# remove the weird 'âª' character from the prices column 
gigs['prices'] = gigs['prices'].str.replace('âª', '')

def convert_prices(prices_str):
    '''Converts a string of prices to a list of floats'''
    prices_str = prices_str.strip('[]').split(', ')
    prices_float = []
    for price in prices_str:
        price = price.replace(',', '').replace("'", "")
        prices_float.append(float(price))
    return prices_float

gigs['prices'] = gigs['prices'].apply(convert_prices)

gigs.head()


While crawling, the browser of one of us was in Hebrew, which cause some letter to be in utf-8 Hebrew version. We had to make a dictionery for each utf-8 Hebrew sign, and then convert it to a normal date format.

In [None]:
# Convert member_since column (3 first letter of month-last 2 digit of year, Ex: Sep-19) to epoch time
def convert_member_since(member_since_str):
    '''Converts a string of member_since to epoch time'''
    member_since_str = member_since_str.split('-')
    if (len(member_since_str) == 1):
        return None
    month = member_since_str[0]
    year = member_since_str[1]
    month_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05',
                 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10',
                 'Nov': '11', 'Dec': '12', 'éåð': '06', 'àå÷': '10', 'éðå': '01', 'àôø': '04', 'îøõ': '03', 'ðåá': '11', 'ñôè':'09', 'îàé':'05', "éåì":'07', "ãöî":'12', "ôáø": '02', "àåâ": '08'}
    month = month_dict[month]
    member_since_str = '20' + year + '-' + month + '-01'
    return pd.to_datetime(member_since_str)

gigs['member_since'] = gigs['member_since'].apply(convert_member_since)

gigs.head()

The recisions were also stored in a list. But the csv format made it a string of string inside a list. 
We had to convert it to a normal list of integers, and replace the word "Unlimited" with np.inf (infinite).

In [None]:
def convert_revisions(revisions_str):
    '''Converts a string of revisions to a list of floats'''
    revisions_str = revisions_str.strip('[]').split(', ')
    revisions_float = []
    for revision in revisions_str:
        revision = revision.replace(',', '').replace("'", "")
        if revision == 'Unlimited':
            revision = np.inf
            revisions_float.append(float(revision))
        else:
            if revision == '':
                revision = 0
            revisions_float.append(int(revision))
    return revisions_float

gigs['revisions'] = gigs['revisions'].apply(convert_revisions)

gigs.head()

The delivery times were also converted to a string beacause of the csv file format.
In addition to that we had to remove the word "day" or "days" from the end of each number.

In [None]:
gigs['delivery_times'] = gigs['delivery_times'].str.replace('day', '')
gigs['delivery_times'] = gigs['delivery_times'].str.replace("s", "")
gigs['delivery_times'] = gigs['delivery_times'].str.strip()

def convert_delivery_time(delivery_time_str):
    '''Converts a string of prices to a list of floats'''
    delivery_time_str = delivery_time_str.strip('[]').split(', ')
    delivery_times_int = []
    for price in delivery_time_str:
        price = price.replace(',', '').replace("'", "")
        delivery_times_int.append(int(price))
    return delivery_times_int

gigs['delivery_times'] = gigs['delivery_times'].apply(convert_delivery_time)

gigs.head()


We made a dictionery that converts each level to a number.

In [10]:
def convert_level(level_str):
    level_conversion = {'Top Rated Seller': 3, 'Level 2 Seller': 2, 'Level 1 Seller': 1, 'New Seller': 0, '0': 0}
    return level_conversion[level_str]

gigs['seller_level'] = gigs['seller_level'].apply(convert_level)

gigs.head()

KeyError: '0'

In [None]:
# save new dataframe to csv called gigs_data_cleaned.df as pickle
gigs.to_pickle('gigs_data_cleaned.df')