# Call me maybe

#### Mohammed Elalj, William Nouet, David Pei, Henri Tilloy

In [82]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from datetime import datetime
import re
from collections import OrderedDict
from sklearn import svm, preprocessing, cross_validation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import math
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

### Clean dataset

In [2]:
def clean(file):
    ads_iphone = pd.read_csv(file)
    
    ads_iphone["PRICE"] = ads_iphone["PRICE"].apply(lambda x: int(re.sub("[^0-9]", "", x)))
    ads_iphone["PRICE"] = ads_iphone["PRICE"].apply(fixPrice)
    ads_iphone["MODEL"] = ads_iphone["TITLE"].apply(getPhoneModel)
    ads_iphone["SIZE"] = ads_iphone["TITLE"].apply(getPhoneSize)
    ads_iphone["UPDATE_DATE"] = ads_iphone["UPDATE_DATE"].apply(convertDateUpdate)
    ads_iphone["POSTED_DATE"] = ads_iphone["POSTED_DATE"].apply(convertDatePosted)
    ads_iphone["SOLD_BY"] = ads_iphone["SOLD_BY"].apply(convertDateSoldby)
    
    long_model = []
    for i in range(len(ads_iphone)):
        if (str(ads_iphone['MODEL'][i]) != 'No') & (str(ads_iphone['SIZE'][i]) != 'No'):
            long_model.append(str(ads_iphone['MODEL'][i]) + ' ' + str(ads_iphone['SIZE'][i]))
        else:
            long_model.append(str(ads_iphone['MODEL'][i]))
    ads_iphone['MODEL SIZE'] = long_model

    ads_iphone.to_csv('data_iphone_cleaned.csv')

### Functions used to clean the dataset

In [3]:
#account for bad PRICE (for example, '100-200' is counted as '100200')
def fixPrice(x):
    if len(str(x)) == 5 or len(str(x)) == 6:
        #take the max of the two values
        digits = [int(i) for i in str(x)]
        digits.reverse()
        max_num = str(digits[2]) + str(digits[1]) + str(digits[0])
        #print index, max_num
        return max_num
    else:
        return x

def getPhoneModel(x):
    models = ["iphone6plus", "iphone6", "iphone5s", "iphone5c" , "iphone5", "iphone4s", "iphone4", "iphone3gs", "iphone3g", "iphone2g"]
    for model in models:
        if re.search(model, x.lower().replace(" ", "")): 
            return model
    return 'No'

def getPhoneSize(x):
    sizes = ["128", "64", "32", "16", "8"]
    for size in sizes:
        if re.search(size + "g", x.lower().replace(" ", "")): 
            return size
    return 'No'

def convertDatePosted(x):
    try:
        return datetime.strptime(x, "%Y-%m-%d  %I:%M%p").strftime("%Y-%m-%d %H:%M")
    except ValueError:
        return None

def convertDateUpdate(x):
    try:
        return datetime.strptime(x, "%Y-%m-%d %H:%M").strftime("%Y-%m-%d %H:%M")
    except ValueError:
        return None

def convertDateSoldby(x):
    try:
        return datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f").strftime("%Y-%m-%d %H:%M")
    except ValueError:
        return 'No'

### Get everything

In [4]:
def get_everything(file):
    # Load data frame
    df = pd.read_csv(file)
    # Get the cities scrapped 
    cities = np.unique(df['LOCATION'])
    # Get the number of phones per city
    phones_per_location = {}
    for city in cities:
        phones_per_location[city] = len(df[df.LOCATION == city])
        
    # Get the date of the first scrap
    date_first_scrap = df['UPDATE_DATE'][0].split(' ')[0]
    # Define the date format
    date_format = "%Y-%m-%d %H:%M"
    # Make the scrapped date fit datetime
    scrapped_date = datetime.strptime(str(date_first_scrap) + ' 23:59', date_format)
    
    # Define a dictionary to get the number of phones sold per hour
    phones_sold = {}
    # Define a dictionary to get the number of phones sold per location
    phones_sold_per_location = {}
    # Define a dictionary to get the price of phones per location
    price_phones_per_location = {}
    # Define a dictionary to get the price of phones sold per location
    price_phones_sold_per_location = {}
    
    # Go through each city
    for i in range(len(df)):
        location = df['LOCATION'][i]
        sold_date = df['SOLD_BY'][i]
        price = df['PRICE'][i]
        model = df['MODEL'][i]
        
        # Check if this location had been found already or not
        if location not in price_phones_per_location.keys():
            price_phones_per_location[location] = price
        else:
            price_phones_per_location[location] += price
        
        # Check if the phone has been sold
        if sold_date != 'No':
            # Fit datetime 
            sold_date = datetime.strptime(str(sold_date), date_format)
            diff = sold_date - scrapped_date
            days = diff.days
            seconds = diff.seconds
            # Get the number of hours between the scrapped date and the sold date
            hours = days * 24 + seconds // 3600            
            # Minus 1 hour to be coherent 
            hours = (abs(hours) - 1)
            # Check if this hour had been found already or not
            if str(hours) not in phones_sold.keys():
                phones_sold[str(hours)] = 1
            else:
                phones_sold[str(hours)] += 1
                
            # Check if this location had been found already or not
            if location not in phones_sold_per_location.keys():
                phones_sold_per_location[location] = 1
            else:
                phones_sold_per_location[location] += 1
                
            # Check if this location had been found already or not
            if location not in price_phones_sold_per_location.keys():
                price_phones_sold_per_location[location] = price
            else:
                price_phones_sold_per_location[location] += price
    
    # Get the average price of phones sold or not per location
    for city in cities:
        price_phones_per_location[city] = price_phones_per_location[city]/phones_per_location[city]
        if city in phones_sold_per_location.keys():
            price_phones_sold_per_location[city] = price_phones_sold_per_location[city]/phones_sold_per_location[city]

    return phones_per_location, phones_sold_per_location, phones_sold, price_phones_per_location, price_phones_sold_per_location

### Get nb of phones (sold or not) and their price per city and model

In [5]:
def get_everything_df(file):
    # Load data frame
    df = pd.read_csv(file)
    # Get the cities scrapped 
    cities = np.unique(df['LOCATION'])

    # List the models of iPhones with their size
    #models = ['No', 'iphone3gs 8', 'iphone4', 'iphone4 8', 'iphone4 16', 'iphone4 32', 'iphone4 64', 'iphone4 128', 'iphone4s', 'iphone4s 8', 'iphone4s 16', 'iphone4s 32', 'iphone4s 64', 'iphone4s 128', 'iphone5', 'iphone5 8', 'iphone5 16', 'iphone5 32', 'iphone5 64', 'iphone5 128', 'iphone5s', 'iphone5s 16', 'iphone5s 32', 'iphone5s 64', 'iphone5s 128', 'iphone5c', 'iphone5c 8', 'iphone5c 16', 'iphone5c 32', 'iphone5c 64', 'iphone5c 128', 'iphone6', 'iphone6 16', 'iphone6 32', 'iphone6 64', 'iphone6 128', 'iphone6plus', 'iphone6plus 16', 'iphone6plus 64', 'iphone6plus 128']
    models = pd.Series(df['MODEL'].values.ravel()).unique()
    data = pd.DataFrame({'MODEL': models})
    
    # Dataframe of price of iphones on Craigslist
    df_price = pd.DataFrame()
    df_price = df_price.append(data)
    # Dataframe of price of iphones sold
    df_price_sold = pd.DataFrame()
    df_price_sold = df_price_sold.append(data)
    # Dataframe of number of iphones on Craigslist
    df_nb = pd.DataFrame()
    df_nb = df_nb.append(data)
    # Dataframe of number of iphones sold
    df_nb_sold = pd.DataFrame()
    df_nb_sold = df_nb_sold.append(data)
    
    # Go through each location scrapped
    for city in cities:
        model_price = OrderedDict()
        model_nb = OrderedDict()
        model_sold_price = OrderedDict()
        model_sold_nb = OrderedDict()
        for key in models:
            model_price.setdefault(key, 0)
            model_nb.setdefault(key, 0)
            model_sold_price.setdefault(key, 0)
            model_sold_nb.setdefault(key, 0)
        
        for i in range(len(df)):
            location = df['LOCATION'][i]
            if location == city:
                model = df['MODEL SIZE'][i]
                sold_date = df['SOLD_BY'][i]
                price = df['PRICE'][i]
                
                if model != 'No':
                    model_nb[model] += 1
                    model_price[model] += price
                
                if sold_date != 'No':
                    model_sold_nb[model] += 1
                    model_sold_price[model] += price
        
        average_price = []
        average_price_sold = []
        for nb, price, nb_sold, sold_price in zip(model_nb.values(),model_price.values(), model_sold_nb.values(), model_sold_price.values()):
            if nb != 0:
                average_price.append(price/nb)
            else:
                average_price.append('NaN')
            if nb_sold != 0:
                average_price_sold.append(sold_price/nb_sold)
            else:
                average_price_sold.append('NaN')
        
        
        # Fill the dataframes per city
        df_price[city] = average_price
        df_price_sold[city] = average_price_sold
        df_nb[city] = model_nb.values()
        df_nb_sold[city] = model_sold_nb.values()
        
    return df_price, df_price_sold, df_nb, df_nb_sold

### Graph phones sold

In [6]:
def graph_phones_sold(phones_sold):
    plt.plot(phones_sold.keys(), phones_sold.values())
    plt.xlabel('Hour')
    plt.ylabel('Number of phones sold')
    plt.title('Phones sold per hour')
    plt.legend(loc = 'best') 
    plt.show()

### Clean Data

In [61]:
#file = 'iPhone_2015-04-28.csv'
#file1 = '3days.csv'
file2 = '7days.csv'
# Clean dataset
#clean(file1)
clean(file2)
#file = 'data_iphone_cleaned.csv'

TypeError: buffer size mismatch

### Main

In [None]:
#models = ['iphone3gs 8', 'iphone4', 'iphone4 8', 'iphone4 16', 'iphone4 32', 'iphone4 64', 'iphone4 128', 'iphone4s', 'iphone4s 8', 'iphone4s 16', 'iphone4s 32', 'iphone4s 64', 'iphone4s 128', 'iphone5', 'iphone5 8', 'iphone5 16', 'iphone5 32', 'iphone5 64', 'iphone5 128', 'iphone5s', 'iphone5s 16', 'iphone5s 32', 'iphone5s 64', 'iphone5s 128', 'iphone5c', 'iphone5c 8', 'iphone5c 16', 'iphone5c 32', 'iphone5c 64', 'iphone5c 128', 'iphone6', 'iphone6 16', 'iphone6 32', 'iphone6 64', 'iphone6 128', 'iphone6plus', 'iphone6plus 16', 'iphone6plus 64', 'iphone6plus 128']
models = pd.Series(df['MODEL'].values.ravel()).unique()
# Get everything
phones_per_location, phones_sold_per_location, phones_sold, price_phones_per_location, price_phones_sold_per_location = get_everything(file)
#print phones_sold
#print phones_per_location
#print phones_sold_per_location
#print price_phones_per_location
#print price_phones_sold_per_location

# Get dataframes of number of phones (sold) and their price per city and model
df_price, df_price_sold, df_nb, df_nb_sold = get_everything_df(file)

# Plot the number of phones sold per hour
#graph_phones_sold(phones_sold)

# Machine Learning

### Preparing Data

In [64]:
pd.set_option('display.max_rows', 3000)

data_3days = pd.read_csv('3days_cleaned.csv',index_col=0)
df_3days = pd.DataFrame()
df_3days['PRICE'] = data_3days['PRICE']
df_3days['LOCATION'] = data_3days['LOCATION']
df_3days['MODEL'] = data_3days['MODEL']
df_3days['SIZE'] = data_3days['SIZE']
df_3days['SOLD_BY'] = data_3days['SOLD_BY']
df_3days = df_3days.drop(df_3days[df_3days.LOCATION == 'sandiego'].index) #sandiego had a LOT of spam ads

data_week = pd.read_csv('7days_cleaned.csv',index_col=0)
df_week = pd.DataFrame()
df_week['PRICE'] = data_week['PRICE']
df_week['LOCATION'] = data_week['LOCATION']
df_week['MODEL'] = data_week['MODEL']
df_week['SIZE'] = data_week['SIZE']
df_week['SOLD_BY'] = data_week['SOLD_BY']
#df = df.drop(df[<some boolean condition>].index)
df_week = df_week.drop(df_week[df_week.LOCATION == 'sandiego'].index) #sandiego had a LOT of spam ads

def transform(df):
    #Normalize PRICE
    df['PRICE'] = (df['PRICE'] - df['PRICE'].mean())/df['PRICE'].std()
    #df['PRICE'] = df['PRICE'].apply(math.log)
    
    #Label Encode LOCATION
    encoderL = preprocessing.LabelEncoder()
    encoderL.fit(df['LOCATION'])
    df['LOCATION'] = encoderL.transform(df['LOCATION'])
    
    #Label Encode MODEL
    encoderM = preprocessing.LabelEncoder()
    encoderM.fit(df['MODEL'])
    df['MODEL'] = encoderM.transform(df['MODEL'])
    
    #Label Encode SIZE
    encoderS = preprocessing.LabelEncoder()
    encoderS.fit(df['SIZE'])
    df['SIZE'] = encoderS.transform(df['SIZE'])
    
    #Label Encode SOLD_BY target_label
    def convertToYes(x):
        if x != 'No':
            return 'Yes'
        else:
            return 'No'
        
    df['SOLD_BY'] = df['SOLD_BY'].apply(convertToYes)
    encoderS = preprocessing.LabelEncoder()
    encoderS.fit(df['SOLD_BY'])
    df['SOLD_BY'] = encoderS.transform(df['SOLD_BY'])
    
    return df

print df_week
df_3days = transform(df_3days)
df_week = transform(df_week)
print df_3days
print df_week

       PRICE       LOCATION        MODEL SIZE           SOLD_BY
0        450  San Francisco      iphone6   No  2015-04-30 00:06
1        325  San Francisco     iphone5s   No  2015-04-30 00:06
2        320  San Francisco     iphone5s   No                No
3        275  San Francisco      iphone5   32                No
4        580  San Francisco      iphone6   64  2015-04-30 00:06
5        200  San Francisco      iphone5   16  2015-05-02 15:52
6        550  San Francisco      iphone6   No                No
7        280  San Francisco      iphone5   16                No
8        350  San Francisco     iphone5s   32  2015-05-02 15:52
9        450  San Francisco      iphone6   16  2015-04-30 00:06
10       700  San Francisco  iphone6plus   64  2015-05-01 09:42
11       350  San Francisco     iphone5s   32  2015-05-02 15:52
12       380  San Francisco     iphone5s   64                No
13       600  San Francisco      iphone6   16                No
14       600  San Francisco      iphone6

### Create Test and Training Set

In [79]:
def predict(df):
    #classifiers and their respective results
    clf = svm.SVC()
    clf2 = svm.SVC()
    rf = RandomForestClassifier(n_estimators=100)
    ab = AdaBoostClassifier(clf2, algorithm='SAMME')
    #ab = AdaBoostClassifier(clf2, n_estimators=100, learning_rate=0.5, algorithm='SAMME',random_state=1)
    lr = LogisticRegression()
    gnb = KNeighborsClassifier()
    clf_results = []
    rf_results = []
    ab_results = []
    lr_results = []
    gnb_results = []
    feature_importance = []

    #Go through each fold of the CV
    cv = cross_validation.KFold(len(df), n_folds=10, shuffle=True)
    for train_indices, test_indices in cv:
        test = pd.DataFrame(columns=["PRICE", "LOCATION", "MODEL", "SIZE", "SOLD_BY"])
        train = df
        for i in test_indices:
            try:
                test = test.append(df.iloc[i])
                train = train.drop([i])
            except ValueError:
                continue
        Y_train = train["SOLD_BY"]
        X_train = train.drop("SOLD_BY", 1)
        Y_test = test["SOLD_BY"]
        X_test = test.drop("SOLD_BY", 1)

        #SVM
        clf.fit(X_train, Y_train)
        clf_score = clf.score(X_test, Y_test)
        clf_results.append(clf_score)
        #all_predictions.append(clf.predict(X_test))

        #RandomForestClassifier
        rf.fit(X_train, Y_train)
        rf_score = rf.score(X_test, Y_test)
        rf_results.append(rf_score)
        feature_importance.append(rf.feature_importances_)
        #all_predictions.append(rf.predict(X_test))

        #Adaboosted SVM
        ab.fit(X_train, Y_train)
        ab_score = ab.score(X_test, Y_test)
        ab_results.append(ab_score)
        #all_predictions.append(ab.predict(X_test))

        #Logistic Regression
        lr.fit(X_train, Y_train)
        lr_score = lr.score(X_test, Y_test)
        lr_results.append(lr_score)
        #all_predictions.append(lr.predict(X_test))
        
        #Gaussian Naive Bayes
        gnb.fit(X_train, Y_train)
        gnb_score = gnb.score(X_test, Y_test)
        gnb_results.append(gnb_score)
        
    return clf_results, rf_results, ab_results, lr_results, gnb_results, feature_importance

In [80]:
def average_feature_importance(feature_importance):
    result = []
    count = 0
    price_avg = 0
    location_avg = 0
    model_avg = 0
    size_avg = 0
    for x in feature_importance:
        price_avg += x[0]
        location_avg += x[1]
        model_avg += x[2]
        size_avg += x[3]
        count += 1
    price_avg = price_avg/count
    location_avg = location_avg/count
    model_avg = model_avg/count
    size_avg = size_avg/count
    result = [price_avg, location_avg, model_avg, size_avg]
    return result

In [83]:
print "3 DAYS PREDICTIONS"
clf_results, rf_results, ab_results, lr_results, gnb_results, feature_importance = predict(df_3days)
print "SVM"
#print clf_results
print np.mean(clf_results)
print "Random Forest Classifier"
#print rf_results
print np.mean(rf_results)
print "Adaboosted (SVM)"
#print ab_results
print np.mean(ab_results)
print "Logistic Regression"
#print lr_results
print np.mean(lr_results)
print "Gaussian Naive Bayes"
#print gnb_results
print np.mean(gnb_results)
print "Feature Importance for RFC (greatest to least): PRICE, LOCATION, MODEL, SIZE"
avg = average_feature_importance(feature_importance)
print "PRICE: %f" % avg[0]
print "LOCATION: %f" % avg[1]
print "MODEL: %f" % avg[2]
print "SIZE: %f" % avg[3]

print ""
print "WEEK PREDICTIONS"
clf_results, rf_results, ab_results, lr_results, gnb_results, feature_importance = predict(df_week)
print "SVM"
#print clf_results
print np.mean(clf_results)
print "Random Forest Classifier"
#print rf_results
print np.mean(rf_results)
print "Adaboosted (SVM)"
#print ab_results
print np.mean(ab_results)
print "Logistic Regression"
#print lr_results
print np.mean(lr_results)
print "Gaussian Naive Bayes"
#print gnb_results
print np.mean(gnb_results)
print "Feature Importance for RFC (greatest to least): PRICE, LOCATION, MODEL, SIZE"
avg = average_feature_importance(feature_importance)
print "PRICE: %f" % avg[0]
print "LOCATION: %f" % avg[1]
print "MODEL: %f" % avg[2]
print "SIZE: %f" % avg[3]
    

3 DAYS PREDICTIONS
SVM
0.842557768924
Random Forest Classifier
0.884811155378
Adaboosted (SVM)
0.841760956175
Logistic Regression
0.841760956175
Gaussian Naive Bayes
0.832999203187
Feature Importance for RFC (greatest to least): PRICE, LOCATION, MODEL, SIZE
PRICE: 0.455977
LOCATION: 0.303214
MODEL: 0.131821
SIZE: 0.108987

WEEK PREDICTIONS
SVM
0.628813833226
Random Forest Classifier
0.651597177332
Adaboosted (SVM)
0.640202811894
Logistic Regression
0.635799127343
Gaussian Naive Bayes
0.623297780651
Feature Importance for RFC (greatest to least): PRICE, LOCATION, MODEL, SIZE
PRICE: 0.447888
LOCATION: 0.328800
MODEL: 0.137114
SIZE: 0.086198


In [68]:
print "Baseline"
print pd.Series(df_week['SOLD_BY'].values.ravel()).value_counts()
print pd.Series(df_3days['SOLD_BY'].values.ravel()).value_counts()

Baseline
0    1742
1     979
dtype: int64
0    2112
1     397
dtype: int64


In [69]:
print 1742/(1742+979)
print 2112/(2112+397)

0.640205806689
0.841769629334


In [70]:
df_3days.to_csv('alex3days.csv')
df_week.to_csv('alex7days.csv')