# Import / Preprocessing

In [72]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing

In [73]:
# Load the raw training and test sets from the CSV files next to the notebook.
training, testing = (
    pd.read_csv(csvPath) for csvPath in ("./train.csv", "./test.csv")
)

In [74]:
# Names of the 116 categorical feature columns: cat1 .. cat116
catCols = ["cat%d" % i for i in range(1, 117)]

# Names of the 14 continuous feature columns: cont1 .. cont14
contCols = ["cont%d" % i for i in range(1, 15)]

In [75]:
# Map each categorical column name to its own fitted LabelEncoder.
# Every encoder is fitted on the training and test values of that column
# stacked together, so it knows the labels that occur in either frame.
leDict = {
    colName: preprocessing.LabelEncoder().fit(
        pd.concat([training[colName], testing[colName]])
    )
    for colName in catCols
}

In [76]:
# Replace the categorical columns of both frames with their encoded values,
# looking up the fitted encoder for each column by the column's name.
for frame in (training, testing):
    frame[catCols] = frame[catCols].apply(
        lambda col: leDict[col.name].transform(col)
    )

In [79]:
def splitDF(df):
    """Randomly split ``df`` into two halves by row position.

    A random permutation of the row positions is drawn with
    ``np.random.permutation``; the first ``ceil(n / 2)`` positions form the
    training half and the remaining rows form the testing half. Rows keep
    their original relative order inside each half.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Any index is supported because rows are selected by
        position, not by index label.

    Returns
    -------
    tuple
        ``(testing, training)`` -- note the order: the testing half comes first.
    """
    numRows = df.shape[0]

    # create a random permutation of the row positions
    shuffled = np.random.permutation(numRows)

    # the first half (rounded up) of the permutation becomes the training set
    numTraining = int(math.ceil(numRows / 2))

    # Build a positional boolean mask. Matching the positions against
    # df.index labels would only be correct for a default 0..n-1 index.
    isTraining = np.zeros(numRows, dtype=bool)
    isTraining[shuffled[:numTraining]] = True

    training = df[isTraining]
    testing = df[~isTraining]

    # double check shape
    print(testing.shape)
    print(training.shape)

    return (testing, training)

# SGDRegressor

In [56]:
# Print the module docstring of the running interpreter session.
print(__doc__)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# NOTE(review): numpy is already imported in the first cell, and SVR / plt are
# not referenced in the cells visible here -- confirm whether they are still needed.
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

Automatically created module for IPython interactive environment


## First we need to scale all of the data

In [81]:
# Separate the feature columns from the target before scaling.
trFeatures = training.drop(["loss"], axis=1)
# `testing` gains a "loss" column in the prediction cell further down.
# Dropping it here when present (errors="ignore") keeps this cell re-runnable:
# otherwise the scaler would be handed one more column than it was fitted on.
teFeatures = testing.drop(["loss"], axis=1, errors="ignore")

scaler = StandardScaler()
scaler.fit(trFeatures)  # Don't cheat - fit only on training data
trScale = scaler.transform(trFeatures)
teScale = scaler.transform(teFeatures)  # apply same transformation to test data

In [87]:
# Show the shape of the scaled training matrix, first in full and then
# without its leading column (the slice that is passed to the model).
for scaledView in (trScale, trScale[:, 1:]):
    print(scaledView.shape)

(188318, 131)
(188318, 130)


In [88]:
from sklearn import linear_model

# SGD is a stochastic optimizer; pin the seed so the fitted coefficients (and
# therefore the submission) are identical on every Restart & Run All.
clf = linear_model.SGDRegressor(random_state=42)

# The leading column of the scaled matrices is skipped for both fit and predict.
# NOTE(review): presumably that column is the row identifier ("id") -- confirm.
clf_model = clf.fit(trScale[:, 1:], training["loss"])

# Store the predictions on the test frame so they can be exported with the ids.
testing["loss"] = clf_model.predict(teScale[:, 1:])

TODO:
- Read in actual training/testing data
- Run SGD on full data

In [91]:
# Write only the "id" and "loss" columns. index=False keeps the DataFrame's row
# index out of the file; otherwise it is written as an extra unnamed first
# column (which shows up as "Unnamed: 0" when the CSV is read back).
testing[["id","loss"]].to_csv("./submission1.csv", index=False)

In [63]:
# List the column labels of the test frame.
testing.columns

Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
       'cat10',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'loss'],
      dtype='object', length=131)

In [27]:
# Shape of the test frame with the "loss" column removed.
# NOTE(review): the saved output carries execution count 27, lower than the
# data-loading cells (72+), so it appears to be stale -- re-run to refresh.
testing.drop("loss", axis=1).shape

(94159, 132)

In [28]:
# Shape of the training frame with the "loss" column removed.
training.drop("loss", axis=1).shape

(94159, 131)

In [29]:
# Shape (rows, columns) of the full training frame, target included.
training.shape

(94159, 132)

In [30]:
# Shape (rows, columns) of the full test frame.
testing.shape

(94159, 133)

In [36]:
# Preview the first rows of the training frame without the "loss" column.
# NOTE(review): the saved output shows an "Unnamed: 0" column, which looks like
# a row index that was written to CSV and read back -- confirm and re-run.
training.drop("loss", axis=1).head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
4,0,1,0,1,0,0,0,0,1,1,...,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606
9,0,1,0,0,1,1,0,0,1,0,...,0.310061,0.713343,0.469223,0.3026,0.67135,0.8351,0.863052,0.879347,0.822493,0.294523
15,0,0,0,0,1,1,0,0,0,0,...,0.281143,0.570733,0.547756,0.80438,0.44352,0.63026,0.385085,0.377003,0.51666,0.340325
16,0,1,1,0,0,0,0,0,1,1,...,0.525831,0.411902,0.593548,0.31796,0.38846,0.48889,0.457203,0.447145,0.301535,0.205651
17,0,0,0,0,0,1,0,0,0,0,...,0.551723,0.688705,0.437192,0.67263,0.83505,0.59334,0.678924,0.665644,0.684242,0.407411
