In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model, svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
import sklearn.metrics as sklm
from dateutil.relativedelta import relativedelta

%matplotlib inline

In [2]:
# Load the customer table keyed by CustomerID, then discard the unnamed CSV
# index column together with the name, address, phone and birth-date fields.
unused_columns = ['Unnamed: 0','AddressLine1','FirstName','LastName',
                  'StateProvinceName','PhoneNumber','BirthDate','PostalCode']
work_cust = pd.read_csv('work_customer.csv', index_col='CustomerID')
work_cust = work_cust.drop(unused_columns, axis = 1)
work_cust.head().T

CustomerID,11000,11001,11002,11003,11004
Title,,,,,
MiddleName,V,L,,,
Suffix,,,,,
AddressLine2,,,,,
City,Rockhampton,Seaford,Hobart,North Ryde,Wollongong
CountryRegionName,Australia,Australia,Australia,Australia,Australia
Education,Bachelors,Bachelors,Bachelors,Bachelors,Bachelors
Occupation,Professional,Professional,Professional,Professional,Professional
Gender,M,M,M,F,F
MaritalStatus,M,S,M,S,S


In [3]:
# Regression target: the AveMonthSpend column as a standalone NumPy array
labels = work_cust['AveMonthSpend'].values.copy()
labels.shape

(16404,)

In [4]:
# Encode categorical data
def encode_string(cat_features, categories=None):
    """One-hot encode a 1-D collection of category values.

    Parameters
    ----------
    cat_features : array-like
        Category value for every row (strings or numbers).
    categories : array-like, optional
        Full set of categories that defines the output columns (duplicates
        are ignored). When omitted, the distinct values of ``cat_features``
        are used, which is the original behaviour. Pass the training column
        here when encoding new data so both matrices share one column layout.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(cat_features), n_categories)`` holding a
        single 1.0 per row; columns follow the sorted order of the categories.

    Raises
    ------
    ValueError
        If ``cat_features`` contains a value that is not in ``categories``.
    """
    values = np.asarray(cat_features).ravel()

    # np.unique returns the distinct values sorted, which fixes the column order.
    source = values if categories is None else np.asarray(categories).ravel()
    levels = np.unique(source)

    known = np.isin(values, levels)
    if not known.all():
        unseen = np.unique(values[~known]).tolist()
        raise ValueError(
            'cat_features contains values not present in categories: {}'.format(unseen))

    # Position of each value inside the sorted levels is its column index.
    codes = np.searchsorted(levels, values)
    encoded = np.zeros((values.shape[0], levels.shape[0]))
    encoded[np.arange(values.shape[0]), codes] = 1.0
    return encoded

# Nominal columns that are one-hot encoded after the country column
categorical_columns = ['Education','Occupation',
                       'Gender','MaritalStatus','HomeOwnerFlag']

# Encode the country first, then each remaining nominal column, and place the
# resulting one-hot blocks side by side in that order.
encoded_blocks = [encode_string(work_cust[name])
                  for name in ['CountryRegionName'] + categorical_columns]
Features = np.concatenate(encoded_blocks, axis = 1)

print(Features.shape)

(16404, 22)


In [5]:
# Append the numeric customer attributes to the right of the one-hot columns
numeric_columns = ['NumberCarsOwned',
                   'NumberChildrenAtHome',
                   'TotalChildren',
                   'YearlyIncome','Age']
Features = np.hstack([Features, work_cust[numeric_columns].values])

print(Features.shape)

(16404, 27)


In [6]:
## Hold out 30% of the rows, picked at random, as an independent test set
nr.seed(9988)  # no random_state is passed below, so the split relies on this global seed
row_ids = range(Features.shape[0])
indx = ms.train_test_split(row_ids, test_size = 0.3)
train_rows, test_rows = indx
X_train, X_test = Features[train_rows,:], Features[test_rows,:]
y_train, y_test = np.ravel(labels[train_rows]), np.ravel(labels[test_rows])

In [7]:
# Standardise the trailing columns with statistics taken from the training
# rows only, then apply that same fitted scaler to both splits.
# NOTE(review): the cells above print 22 one-hot columns and 27 columns in
# total, so the five numeric columns start at index 22; slicing from 23 leaves
# index 22 (NumberCarsOwned) unscaled. The scoring cell further down reuses
# this scaler with the same slice, so the offset must be changed in both places.
scaled_cols = slice(23, None)
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[:, scaled_cols])
for split in (X_train, X_test):
    split[:, scaled_cols] = scaler.transform(split[:, scaled_cols])

In [8]:
# Least-squares linear regression without an intercept term, fitted on the training split
lin_mod = linear_model.LinearRegression(fit_intercept = False)
lin_mod.fit(X = X_train, y = y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [9]:
# Load the customers to be scored and preview the first five rows
ads_test = pd.read_csv('AW_test.csv')
ads_test.head(n = 5)

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,18988,,Courtney,A,Baker,,8727 Buena Vista Ave.,,Fremont,California,...,1/5/1945,Bachelors,Management,F,S,0,2,0,5,86931
1,29135,,Adam,C,Allen,,3491 Cook Street,,Haney,British Columbia,...,10/4/1964,Bachelors,Skilled Manual,M,M,1,2,2,4,100125
2,12156,,Bonnie,,Raji,,359 Pleasant Hill Rd,,Burbank,California,...,1/12/1934,Graduate Degree,Management,F,M,1,2,0,4,103985
3,13749,,Julio,C,Alonso,,8945 Euclid Ave.,,Burlingame,California,...,9/22/1958,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161
4,27780,,Christy,A,Andersen,,"42, boulevard Tremblay",,Dunkerque,Nord,...,3/19/1965,High School,Manual,F,M,1,1,2,2,21876


In [10]:
# Calculate age of customer based on birthdate
def calculate_age(end, as_of=None):
    """Return the completed years between a birth date and a reference date.

    Parameters
    ----------
    end : str or datetime-like
        Birth date; parsed with ``pd.to_datetime``.
    as_of : str or datetime-like, optional
        Date at which the age is measured. Defaults to the current time,
        which is the original behaviour; pass a fixed date to get ages that
        do not change with the day the notebook is run.

    Returns
    -------
    str
        The whole number of years formatted as a decimal string (the caller
        converts the column to an integer dtype afterwards).
    """
    reference = pd.to_datetime('now') if as_of is None else pd.to_datetime(as_of)
    r = relativedelta(reference, pd.to_datetime(end))
    return '{}'.format(r.years)


# Derive each customer's age from the birth date and store it as int64
# (calculate_age returns the year count as text, hence the cast).
ads_test['Age'] = ads_test['BirthDate'].apply(calculate_age).astype('int64')

In [12]:
# Keep only the columns that feed the model
model_columns = ['CountryRegionName','Education','Occupation','Gender',
                 'MaritalStatus','HomeOwnerFlag','NumberCarsOwned',
                 'NumberChildrenAtHome','TotalChildren','YearlyIncome',
                 'Age']
ads_totest = ads_test.loc[:, model_columns]

ads_totest.head().T

Unnamed: 0,0,1,2,3,4
CountryRegionName,United States,Canada,United States,United States,France
Education,Bachelors,Bachelors,Graduate Degree,Graduate Degree,High School
Occupation,Management,Skilled Manual,Management,Skilled Manual,Manual
Gender,F,M,F,M,F
MaritalStatus,S,M,M,M,M
HomeOwnerFlag,0,1,1,1,1
NumberCarsOwned,2,2,2,0,1
NumberChildrenAtHome,0,2,0,0,2
TotalChildren,5,4,4,4,2
YearlyIncome,86931,100125,103985,127161,21876


In [13]:
# Encode categorical data
def encode_string(cat_features, categories=None):
    """One-hot encode a 1-D collection of category values.

    Parameters
    ----------
    cat_features : array-like
        Category value for every row (strings or numbers).
    categories : array-like, optional
        Full set of categories that defines the output columns (duplicates
        are ignored). When omitted, the distinct values of ``cat_features``
        are used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(cat_features), n_categories)`` holding a
        single 1.0 per row; columns follow the sorted order of the categories.

    Raises
    ------
    ValueError
        If ``cat_features`` contains a value that is not in ``categories``.
    """
    values = np.asarray(cat_features).ravel()

    # np.unique returns the distinct values sorted, which fixes the column order.
    source = values if categories is None else np.asarray(categories).ravel()
    levels = np.unique(source)

    known = np.isin(values, levels)
    if not known.all():
        unseen = np.unique(values[~known]).tolist()
        raise ValueError(
            'cat_features contains values not present in categories: {}'.format(unseen))

    # Position of each value inside the sorted levels is its column index.
    codes = np.searchsorted(levels, values)
    encoded = np.zeros((values.shape[0], levels.shape[0]))
    encoded[np.arange(values.shape[0]), codes] = 1.0
    return encoded

categorical_columns = ['Education','Occupation',
                       'Gender','MaritalStatus','HomeOwnerFlag']

# Take the category levels from the training table (work_cust) rather than
# from this smaller scoring set, so each one-hot column sits at the position
# the fitted model was trained with even when a level does not occur here.
Features = np.concatenate(
    [encode_string(ads_totest[col], categories = work_cust[col])
     for col in ['CountryRegionName'] + categorical_columns], axis = 1)

print(Features.shape)

(500, 22)


In [14]:
# Append the numeric columns to the right of the one-hot block
numeric_columns = ['NumberCarsOwned',
                   'NumberChildrenAtHome',
                   'TotalChildren',
                   'YearlyIncome','Age']
Features = np.hstack([Features, ads_totest[numeric_columns].values])

print(Features.shape)

(500, 27)


In [15]:
# Work on a copy: plain assignment would make X_test an alias of Features, so
# the in-place scaling below would also overwrite Features, and re-running
# this cell would scale the already-scaled values a second time.
X_test = Features.copy()
# Same column slice the scaler was fitted on in the training cell.
X_test[:,23:] = scaler.transform(X_test[:,23:])

In [16]:
# Predicted AveMonthSpend for every row of the scoring matrix
scores = lin_mod.predict(X = X_test)
print(scores)

[ 42.84375    106.94726562  49.07910156  88.77539062  60.90332031
  43.23339844  95.91796875 127.38671875 103.19238281  56.57617188
  59.53613281  51.17480469  72.73535156  47.00390625  37.99560547
  51.5390625   86.01171875  73.24023438 112.3203125   59.671875
  68.82275391  76.0859375  149.71289062  85.10351562  55.46875
  75.42724609  86.77148438 117.36181641  77.5546875   62.21191406
  69.2109375   80.79638672  39.81787109  72.02441406 106.24200439
 104.6875     149.4609375   92.6015625   59.23046875  87.265625
  46.88867188  80.78759766  82.74804688  48.79296875  58.4375
  76.28662109  61.75683594  87.24023438 116.72802734  80.70898438
  82.01367188  95.59570312  81.47021484  64.99609375  47.02636719
  76.17382812  57.81982422  75.42724609  64.68066406  70.47644043
  46.4453125   65.69921875  91.07617188  83.54980469  44.421875
  81.09375     82.46484375 133.4609375   66.52148438 106.82666016
  86.          68.72753906  93.29248047  44.75097656  65.98583984
  82.54345703 115.86816

In [17]:
# One predicted AveMonthSpend per customer, indexed by CustomerID
result_regression = pd.DataFrame(scores, index = ads_test['CustomerID'],columns=['AveMonthSpend'])
result_regression.to_csv('ResultRegression.csv')

# Only a cell's final expression is displayed, so the preview goes last.
result_regression.head()