In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model, svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
import sklearn.metrics as sklm
from dateutil.relativedelta import relativedelta

%matplotlib inline

In [15]:
# Load the customer data keyed by CustomerID and discard the columns that are
# not used as model inputs further down.
unused_columns = ['Unnamed: 0','dummy','AddressLine1','FirstName','LastName',
                  'StateProvinceName','PhoneNumber','BirthDate','PostalCode']
work_cust = pd.read_csv('work_cust.csv', index_col='CustomerID').drop(unused_columns, axis = 1)
work_cust.head().T

CustomerID,11000,11001,11002,11003,11004
City,Rockhampton,Seaford,Hobart,North Ryde,Wollongong
CountryRegionName,Australia,Australia,Australia,Australia,Australia
Education,Bachelors,Bachelors,Bachelors,Bachelors,Bachelors
Occupation,Professional,Professional,Professional,Professional,Professional
Gender,M,M,M,F,F
MaritalStatus,M,S,M,S,S
HomeOwnerFlag,1,0,1,0,1
NumberCarsOwned,0,1,1,1,4
NumberChildrenAtHome,0,3,3,0,5
TotalChildren,2,3,3,0,5


In [16]:
# Label: the AveMonthSpend column, copied into a NumPy array
labels = np.array(work_cust.loc[:, 'AveMonthSpend'])
labels.shape

(16404,)

In [17]:
# Encode categorical data
def encode_string(cat_features):
    """One-hot encode a single categorical column.

    The values are first mapped to integer codes with a LabelEncoder, and the
    codes are then expanded with a OneHotEncoder. Both encoders are fitted on
    the data passed in, so the column layout depends on the values present in
    ``cat_features``.

    Returns the dense array produced by ``.toarray()`` on the encoder output.
    """
    integer_codes = preprocessing.LabelEncoder().fit_transform(cat_features)

    # OneHotEncoder expects a 2-D input: one column holding the integer codes
    code_column = integer_codes.reshape(-1,1)
    return preprocessing.OneHotEncoder().fit_transform(code_column).toarray()

categorical_columns = ['Education','Occupation',
                       'Gender','MaritalStatus','HomeOwnerFlag']

# One-hot encode CountryRegionName first, then each remaining categorical
# column, and join the encoded blocks side by side in that order.
encoded_blocks = [encode_string(work_cust[name])
                  for name in ['CountryRegionName'] + categorical_columns]
Features = np.concatenate(encoded_blocks, axis = 1)

print(Features.shape)

(16404, 22)


In [18]:
# Add numerical features to the right of the one-hot encoded columns.
# NOTE: this cell appends to Features, so running it a second time without
# rebuilding Features first adds the numeric columns again.
numeric_columns = ['NumberCarsOwned',
                   'NumberChildrenAtHome',
                   'TotalChildren',
                   'YearlyIncome','Age']
numeric_block = np.array(work_cust[numeric_columns])
Features = np.concatenate([Features, numeric_block], axis = 1)

print(Features.shape)

(16404, 27)


The code in the cell below performs the following processing:
1. The row indices are randomly partitioned with the `train_test_split` function from the `model_selection` package of scikit-learn, holding out 30% of the cases for testing. 
2. The first element of the returned list (`indx[0]`) contains the indices of the training cases. 
3. The second element of the returned list (`indx[1]`) contains the indices of the test cases. 

In [19]:
## Randomly sample cases to create independent training and test data
# train_test_split is called without a random_state, so the NumPy global
# generator is seeded here instead.
nr.seed(9988)
indx = ms.train_test_split(range(Features.shape[0]), test_size = 0.3)
train_rows, test_rows = indx

X_train, y_train = Features[train_rows,:], np.ravel(labels[train_rows])
X_test, y_test = Features[test_rows,:], np.ravel(labels[test_rows])

In [20]:
# Standardise the trailing numeric columns. The scaler is fitted on the
# training rows only and then reused, unchanged, for the test rows.
# NOTE(review): the shapes printed above (22 one-hot columns, 27 in total)
# suggest the numeric columns start at index 22, so the 23: slice leaves
# column 22 (NumberCarsOwned) unscaled -- confirm. The later cell that scales
# the scoring data applies this same scaler to the same slice, so the two
# would have to be changed together.
scaled_cols = slice(23, None)
scaler = preprocessing.StandardScaler()
X_train[:,scaled_cols] = scaler.fit_transform(X_train[:,scaled_cols])
X_test[:,scaled_cols] = scaler.transform(X_test[:,scaled_cols])

#### Construct the linear regression model

In [21]:
# Least-squares linear model fitted on the training split. No separate
# intercept term is estimated (fit_intercept = False) -- presumably because
# every categorical is fully one-hot encoded; confirm.
lin_mod = linear_model.LinearRegression(fit_intercept = False)
lin_mod.fit(X = X_train, y = y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [27]:
# Load the customers to be scored
test_csv_path = './FinalExam-Test/AW_test.csv'
ads_test = pd.read_csv(test_csv_path)
ads_test.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,18988,,Courtney,A,Baker,,8727 Buena Vista Ave.,,Fremont,California,...,1/5/1945,Bachelors,Management,F,S,0,2,0,5,86931
1,29135,,Adam,C,Allen,,3491 Cook Street,,Haney,British Columbia,...,10/4/1964,Bachelors,Skilled Manual,M,M,1,2,2,4,100125
2,12156,,Bonnie,,Raji,,359 Pleasant Hill Rd,,Burbank,California,...,1/12/1934,Graduate Degree,Management,F,M,1,2,0,4,103985
3,13749,,Julio,C,Alonso,,8945 Euclid Ave.,,Burlingame,California,...,9/22/1958,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161
4,27780,,Christy,A,Andersen,,"42, boulevard Tremblay",,Dunkerque,Nord,...,3/19/1965,High School,Manual,F,M,1,1,2,2,21876


In [28]:
# Calculate age of customer based on birthdate
def calculate_age(end, as_of=None):
    """Return the age in whole years for a birth date, as a string.

    Parameters
    ----------
    end : anything accepted by ``pd.to_datetime``
        The birth date (for example the text held in the BirthDate column).
    as_of : anything accepted by ``pd.to_datetime``, optional
        Reference date on which the age is measured. The default, None, means
        the moment of the call, which is the original behaviour. Pass a fixed
        date to get the same ages on every run.

    Returns
    -------
    str
        Completed years between ``end`` and the reference date, formatted as
        decimal text; callers convert it to an integer.
    """
    reference = pd.to_datetime('now' if as_of is None else as_of)
    elapsed = relativedelta(reference, pd.to_datetime(end))
    return '{}'.format(elapsed.years)


# Derive Age from BirthDate; calculate_age returns text, so the result is
# converted to integer in the same step.
ads_test['Age'] = ads_test['BirthDate'].apply(calculate_age).astype('int64')

In [29]:
# Keep only the columns that are turned into model inputs below
model_input_columns = ['CountryRegionName','Education','Occupation','Gender',
                       'MaritalStatus','HomeOwnerFlag','NumberCarsOwned',
                       'NumberChildrenAtHome','TotalChildren','YearlyIncome',
                       'Age']
ads_totest = ads_test[model_input_columns]

ads_totest.head()

Unnamed: 0,CountryRegionName,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,United States,Bachelors,Management,F,S,0,2,0,5,86931,73
1,Canada,Bachelors,Skilled Manual,M,M,1,2,2,4,100125,54
2,United States,Graduate Degree,Management,F,M,1,2,0,4,103985,84
3,United States,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161,60
4,France,High School,Manual,F,M,1,1,2,2,21876,53


In [30]:
# Encode categorical data
# (this repeats the encode_string definition from earlier in the notebook)
def encode_string(cat_features):
    """One-hot encode a single categorical column.

    The values are first mapped to integer codes with a LabelEncoder, and the
    codes are then expanded with a OneHotEncoder. Both encoders are fitted on
    the data passed in, so the column layout depends on the values present in
    ``cat_features``.

    Returns the dense array produced by ``.toarray()`` on the encoder output.
    """
    integer_codes = preprocessing.LabelEncoder().fit_transform(cat_features)

    # OneHotEncoder expects a 2-D input: one column holding the integer codes
    code_column = integer_codes.reshape(-1,1)
    return preprocessing.OneHotEncoder().fit_transform(code_column).toarray()

categorical_columns = ['Education','Occupation',
                       'Gender','MaritalStatus','HomeOwnerFlag']


def _encode_like_training(train_values, new_values):
    """One-hot encode ``new_values`` using the category levels of ``train_values``.

    Fitting fresh encoders on the scoring data only reproduces the training
    column layout when every training category also occurs in the scoring
    data. Here the columns are always the sorted unique values of
    ``train_values`` (the same ordering ``np.unique`` gives), so the result
    lines up with the training features whichever values the scoring data
    happens to contain.

    Raises
    ------
    ValueError
        If ``new_values`` contains a value that is absent from ``train_values``.
    """
    levels = np.unique(train_values)
    codes = pd.Categorical(new_values, categories=levels).codes

    # Categorical marks anything outside `levels` with the code -1
    unseen = codes < 0
    if unseen.any():
        raise ValueError('values not present in the training data: {}'.format(
            pd.unique(np.asarray(new_values)[unseen]).tolist()))

    one_hot = np.zeros((len(codes), len(levels)))
    one_hot[np.arange(len(codes)), codes] = 1.0
    return one_hot


# CountryRegionName first, then the remaining categorical columns -- the same
# order in which the training features were assembled from work_cust.
Features = np.concatenate(
    [_encode_like_training(work_cust[col], ads_totest[col])
     for col in ['CountryRegionName'] + categorical_columns],
    axis = 1)

print(Features.shape)

(500, 22)


In [31]:
# Add numerical features to the right of the one-hot encoded columns.
# NOTE: this cell appends to Features, so running it a second time without
# rebuilding Features first adds the numeric columns again.
numeric_columns = ['NumberCarsOwned',
                   'NumberChildrenAtHome',
                   'TotalChildren',
                   'YearlyIncome','Age']
numeric_block = np.array(ads_totest[numeric_columns])
Features = np.concatenate([Features, numeric_block], axis = 1)

print(Features.shape)

(500, 27)


In [32]:
# Work on a copy so that Features itself stays unscaled: with a plain alias the
# in-place scaling below would also modify Features, and re-running this cell
# would standardise the same columns a second time.
# NOTE: from here on X_test holds the scoring data, replacing the hold-out
# split that was stored under the same name earlier.
X_test = Features.copy()
X_test[:,23:] = scaler.transform(X_test[:,23:])

In [33]:
# Model predictions for every row currently held in X_test
scores = lin_mod.predict(X = X_test)
print(scores)

[ 43.0078125  106.89453125  49.29882812  88.84375     60.85913086
  43.3046875   95.84375    127.51171875 103.17578125  56.5546875
  59.45507812  51.15234375  72.796875    46.96386719  38.04589844
  51.63671875  86.08984375  73.3203125  112.265625    59.62109375
  68.921875    76.0546875  149.7421875   85.171875    55.5546875
  75.61132812  86.83984375 117.30957031  77.4765625   62.43164062
  69.25390625  80.81054688  39.91601562  72.20703125 106.23828125
 104.73730469 149.4375      92.68359375  59.1875      87.33203125
  46.9375      81.          82.82128906  48.99609375  58.4140625
  76.21484375  61.83190918  87.26171875 116.80078125  80.6875
  81.97070312  95.5703125   81.55273438  64.90820312  47.09082031
  76.2265625   57.91308594  75.44628906  64.61523438  70.41015625
  46.3984375   65.796875    91.04394531  83.484375    44.36132812
  81.2734375   82.41796875 133.44335938  66.46875    107.04394531
  85.96484375  68.81640625  93.36035156  44.69140625  65.91894531
  82.59765625 115

In [34]:
# One predicted AveMonthSpend per row, indexed by CustomerID, written out as CSV
result_regression = pd.DataFrame(data = scores,
                                 index = ads_test['CustomerID'],
                                 columns = ['AveMonthSpend'])
result_regression.to_csv('result_regression.csv')