# Classification Analysis

org: **D4G**  project: **BGCO**  task: **exploratory data analysis**

data: labeled (engaged vs. static) dataset highlighting changes in member engagement (visits per week) between the first and second year of engagement

## Prep the environment

In [None]:
import pandas as pd  # used to manipulate dataframes
import numpy as np # used to manipulate series

import seaborn as sns  # needed for visualizing
import matplotlib.pyplot as plt  # needed for visualizing

## Data Wrangling

### Load Data

* v001 - no postal code data
* v002 - distance to clubhouse, neighborhood included
* v003 - updated distance to clubhouse, no CAMP, no distance outlier, no year1

In [None]:
# load the labelled engagement extract (v003) into a dataframe;
# the CSV is decoded as Windows-1252 (cp1252)
df_001 = pd.read_csv(
    "D4G_BGCO_Engage_Labeled_ForEDA_v003.csv",
    encoding="cp1252",
)

## Version Control

In [None]:
# Work on a cleaned copy of the raw load. dropna() returns a new frame, so
# df_001 keeps every original row. (Binding df = df_001 and then calling
# dropna(inplace=True) would also strip the rows from df_001, because both
# names would refer to the same object.)
df = df_001.dropna()

In [None]:
# report the dimensions of the working dataframe (rows first, then columns)
print("The data set has {} rows and {} columns".format(*df.shape))

In [None]:
# preview the first five rows of the working dataframe
df.head(n=5)

In [None]:
# list the column labels available in the working dataframe
df.columns

### Organise Features

Postal code data is incomplete (~23% of the data is missing)

In [None]:
# remove the member identifier column from the working dataframe
df = df.drop(columns=['d4g_member_id'])

# column groups used by the plots below
target = ['label']

numerical_features = ['Y1_Age', 'checkin_total', 'club_km']

categorical_features = ['member_location.x', 'sex', 'season_most']

## Numerical Features

In [None]:
# summary statistics for the numerical feature columns
df.loc[:, numerical_features].describe()

In [None]:
# one 30-bin histogram per numerical feature
df.loc[:, numerical_features].hist(figsize=(10, 7), bins=30)

In [None]:
# overlay the numerical-feature histograms of the two label groups on a
# shared row of three axes: "static" in blue first, then "engaged" in red
fig, ax = plt.subplots(1, 3, figsize=(25, 5))
for label_value, colour in (("static", "blue"), ("engaged", "red")):
    subset = df.loc[df.label == label_value, numerical_features]
    subset.hist(bins=30, color=colour, alpha=0.5, ax=ax)

## Categorical Features

In [None]:
# Bar chart of value frequencies for every categorical feature, laid out
# row by row on a ROWS x COLS grid of subplots.
ROWS, COLS = 2, 2
fig, ax = plt.subplots(ROWS, COLS, figsize=(9, 9))
for i, categorical_feature in enumerate(categorical_features):
    # divmod turns the running index into (row, col) in row-major order
    row, col = divmod(i, COLS)
    counts = df[categorical_feature].value_counts()
    # `kind` must be given by keyword: recent pandas versions reject
    # positional arguments to Series.plot()
    counts.plot(kind='bar', ax=ax[row, col]).set_title(categorical_feature)

# hide grid cells that did not receive a chart
for unused_axis in ax.flat[len(categorical_features):]:
    unused_axis.set_visible(False)

In [None]:
# Side-by-side bar charts of the value counts of `feature`, one panel per
# label group ("static" on the left, "engaged" on the right).
feature = "member_location.x"
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for axis, label_value in zip(ax, ("static", "engaged")):
    counts = df.loc[df.label == label_value, feature].value_counts()
    # `kind` must be given by keyword: recent pandas versions reject
    # positional arguments to Series.plot()
    counts.plot(kind='bar', ax=axis).set_title(label_value)

In [None]:
# Side-by-side bar charts of the value counts of `feature`, one panel per
# label group ("static" on the left, "engaged" on the right).
feature = "sex"
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for axis, label_value in zip(ax, ("static", "engaged")):
    counts = df.loc[df.label == label_value, feature].value_counts()
    # `kind` must be given by keyword: recent pandas versions reject
    # positional arguments to Series.plot()
    counts.plot(kind='bar', ax=axis).set_title(label_value)

In [None]:
# Side-by-side bar charts of the value counts of `feature`, one panel per
# label group ("static" on the left, "engaged" on the right).
feature = "season_most"
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for axis, label_value in zip(ax, ("static", "engaged")):
    counts = df.loc[df.label == label_value, feature].value_counts()
    # `kind` must be given by keyword: recent pandas versions reject
    # positional arguments to Series.plot()
    counts.plot(kind='bar', ax=axis).set_title(label_value)

## Target Feature

In [None]:
# Bar chart of how many rows carry each value of the target column.
# `kind` must be given by keyword: recent pandas versions reject positional
# arguments to Series.plot().
# NOTE(review): the title reads 'engaged' although the chart shows the counts
# of every label value - confirm whether a more general title is intended.
df['label'].value_counts().plot(kind='bar').set_title('engaged')

## Encode Data

In [None]:
# import Label Encoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dummy_columns = []  # object columns that do not have exactly two distinct values

# NOTE(review): no 'customerID' column is referenced anywhere else in this
# notebook - the exclusion looks like a leftover; confirm before removing.
text_columns = [
    name for name in df.columns
    if df[name].dtype == object and name != 'customerID'
]

for name in text_columns:
    if df[name].nunique() != 2:
        # not binary: queue the column for one-hot encoding
        dummy_columns.append(name)
        continue
    # binary column: replace its two values with integer codes
    df[name] = le.fit_transform(df[name])

# expand the queued columns into indicator (dummy) columns
df = pd.get_dummies(data=df, columns=dummy_columns)

In [None]:
# preview the first five rows after encoding
df.head(n=5)

## Binary Classification Model with XGBoost

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# split the dataframe into the target column and the remaining predictors
y = df['label']
X = df.drop(columns=['label'])

# hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=65,
)

In [None]:
# sanity check: print the shape of each piece of the train/test split
for caption, part in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(caption, part.shape)

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform
# to the test split.
sc = StandardScaler()

# StandardScaler hands back bare NumPy arrays. Re-wrap them in DataFrames so
# the column names (and row index) survive; otherwise the model is trained on
# unnamed features and the importance plot can only label them f0, f1, ...
X_train = pd.DataFrame(
    sc.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    sc.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# configure the gradient-boosted classifier, then fit it on the training split
xgb_model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.08,
    objective='binary:logistic',
    n_jobs=-1,
)
xgb_model.fit(X_train, y_train)

# predict the held-out rows
y_pred = xgb_model.predict(X_test)

# print, in order: confusion matrix, per-class report, overall accuracy
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
):
    print(report)

In [None]:
from xgboost import plot_importance

# draw the fitted model's feature-importance chart on a single 10x8 axes
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
plot_importance(xgb_model, ax=ax)