In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [20]:
train_target = train['Segmentation']
train = train.drop(['Segmentation'], axis=1)

In [21]:
print(train.shape)
print(test.shape)

(8068, 10)
(2627, 10)


In [22]:
train.head(2)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4


In [23]:
train.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [24]:
train.nunique()

ID                 8068
Gender                2
Ever_Married          2
Age                  67
Graduated             2
Profession            9
Work_Experience      15
Spending_Score        3
Family_Size           9
Var_1                 7
dtype: int64

In [25]:
train.Var_1.fillna('Cat_10', inplace=True)
train.Var_1 = train['Var_1'].apply(lambda x:int(str(x).split('_')[1]))

test.Var_1.fillna('Cat_10', inplace=True)
test.Var_1 = test['Var_1'].apply(lambda x:int(str(x).split('_')[1]))

In [26]:
Profession = {
    'Healthcare':0,
    'Engineer':1,
    'Lawyer':2,
    'Entertainment':3,
    'Artist':4,
    'Executive':5,
    'Doctor':6,
    'Homemaker':7,
    'Marketing':8,
    np.nan:10
}
Gender = {
    'Male':0,
    'Female':1,
    np.nan:10
}
Ever_Married = {
    'Yes':0,
    'No':1,
    np.nan:10
}
Graduated = {
    'Yes':0,
    'No':1,
    np.nan:10
}
Spending_Score = {
    'Low':0,
    'Average':1,
    'High':2,
    np.nan:10
}
to_change = [Gender, Ever_Married, Graduated, Spending_Score, Profession] 

for i in to_change:
  train = train.replace(i)
  test = test.replace(i)

In [27]:
train.nunique()

ID                 8068
Gender                2
Ever_Married          3
Age                  67
Graduated             3
Profession           10
Work_Experience      15
Spending_Score        3
Family_Size          10
Var_1                 8
dtype: int64

In [28]:
train.dtypes

ID                   int64
Gender               int64
Ever_Married         int64
Age                  int64
Graduated            int64
Profession           int64
Work_Experience    float64
Spending_Score       int64
Family_Size        float64
Var_1                int64
dtype: object

In [29]:
# train_target = train_target.replace(
#     {
#         'A':0,
#         'B':1,
#         'C':2,
#         'D':3
#     }
# )

# Missing Value Analysis

In [30]:
train.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

In [31]:
test.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

In [32]:
train = train.replace({10:np.nan})
test = test.replace({10:np.nan})

In [33]:
from fancyimpute import KNN
train = pd.DataFrame(KNN(k = 3).fit_transform(train), columns = test.columns)
test = pd.DataFrame(KNN(k = 3).fit_transform(test), columns = test.columns)

Imputing row 1/8068 with 0 missing, elapsed time: 9.766
Imputing row 101/8068 with 0 missing, elapsed time: 9.768
Imputing row 201/8068 with 0 missing, elapsed time: 9.769
Imputing row 301/8068 with 1 missing, elapsed time: 9.770
Imputing row 401/8068 with 0 missing, elapsed time: 9.772
Imputing row 501/8068 with 0 missing, elapsed time: 9.773
Imputing row 601/8068 with 0 missing, elapsed time: 9.774
Imputing row 701/8068 with 0 missing, elapsed time: 9.776
Imputing row 801/8068 with 0 missing, elapsed time: 9.777
Imputing row 901/8068 with 0 missing, elapsed time: 9.779
Imputing row 1001/8068 with 0 missing, elapsed time: 9.780
Imputing row 1101/8068 with 0 missing, elapsed time: 9.782
Imputing row 1201/8068 with 0 missing, elapsed time: 9.782
Imputing row 1301/8068 with 0 missing, elapsed time: 9.783
Imputing row 1401/8068 with 1 missing, elapsed time: 9.785
Imputing row 1501/8068 with 0 missing, elapsed time: 9.786
Imputing row 1601/8068 with 1 missing, elapsed time: 9.787
Imputing 

In [34]:
train.nunique()

ID                 8068
Gender                2
Ever_Married         84
Age                  67
Graduated            56
Profession          127
Work_Experience     869
Spending_Score        3
Family_Size         341
Var_1                73
dtype: int64

In [35]:
for i in train.columns:
  train[i] = train[i].apply(lambda x: int(round(x)))
  test[i] = test[i].apply(lambda x: int(round(x)))

In [36]:
train.dtypes

ID                 int64
Gender             int64
Ever_Married       int64
Age                int64
Graduated          int64
Profession         int64
Work_Experience    int64
Spending_Score     int64
Family_Size        int64
Var_1              int64
dtype: object

# Model

### Training Data

In [37]:
train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,462809,0,1,22,1,0,1,0,4,4
1,462643,1,0,38,0,1,5,1,3,4
2,466315,1,0,67,0,1,1,0,1,6
3,461735,0,0,67,0,2,0,2,2,6
4,462669,1,0,40,0,3,4,2,6,6


In [38]:
train = pd.concat([train, train_target], axis=1)
train.shape

(8068, 11)

In [39]:
# train.to_csv('train.csv',index=False)
# test.to_csv('test.csv', index=False)

# Analysis

In [40]:
train['Segmentation'].value_counts()

D    2268
A    1972
C    1970
B    1858
Name: Segmentation, dtype: int64

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [42]:
X = train
X.shape

(8068, 11)

In [43]:
y = X['Segmentation']
X = X.drop(['ID', 'Segmentation'], axis=1)
# X = X[['Age', 'Graduated', 'Profession', 'Spending_Score']]

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## XGB

In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBClassifier

In [46]:
grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
kf = KFold(n_splits=2)

gs = GridSearchCV(estimator = XGBClassifier(n_estimators=500), param_grid = grid, scoring='accuracy',n_jobs=4, cv=kf)

In [None]:
gs.fit(X_train, y_train)

y_pred = gs.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
gs.best_estimator_

In [None]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=2,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [None]:
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
X_train.columns

In [None]:
xgb.feature_importances_

## SVM

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf_svm = RandomForestClassifier(
    n_estimators=1000
)

clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(
    max_depth=2, 
    random_state=0,
    n_estimators=500,
    
)

clf_rf.fit(X_train, y_train)
y_pred = clf_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Deep Learning Model

In [None]:
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.callbacks import EarlyStopping

# callback = EarlyStopping(
#     monitor='val_accuracy',
#     patience=5
# )

In [None]:
# model = Sequential([
#                        Dense(64,input_shape=(None, 4), activation='relu'),
#                        Dense(128, activation='relu'),
#                        Dense(64, activation='relu'),
#                        Dense(4, activation='softmax'),
# ])
# model.compile(
#     loss='sparse_categorical_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )

In [None]:
# X_train.shape

In [None]:
# model.summary()

In [None]:
# model.fit(X_train,y_train,
#           validation_data=(X_test, y_test),
#           epochs=100,
#           callbacks=[callback],
#           verbose=1
# )

# Final

In [None]:
xgb.fit(X, y)

predictions = xgb.predict(test.drop(['ID'], axis=1))

In [None]:
submission = pd.DataFrame({
    'ID':test['ID'],
    'Segmentation':predictions
    })

In [None]:
submission.reset_index(inplace=True, drop=True)

In [None]:
submission.to_csv('eighth.csv',index=False)

In [None]:
df = pd.read_csv('eighth.csv')
df.head()

In [None]:
# submission.to_csv('/content/drive/My Drive/Data Hack/Customer Segmentation/sixth.csv', index=False)