# Classification Analysis: Random Forest

org: **D4G**  project: **BGCO**  task: **random forest**

data: labeled (engaged vs. static) dataset highlighting changes in member engagement (visits per week) between first and year of engagement

## Prep the environment

In [None]:
import pandas as pd  # used to manipulate dataframes
import numpy as np # used to manipulate series
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # plotting

## Data Wrangling

### Load Data

* v001 - no postal code data
* v002 - distance to clubhouse, neighborhood included
* v003 - updated distance to clubhouse, no CAMP, no distance outlier, no year1

In [None]:
# Load the v003 labeled-engagement CSV, decoding it as cp1252.
csv_path = "D4G_BGCO_Engage_Labeled_ForEDA_v003.csv"
df_001 = pd.read_csv(csv_path, encoding="cp1252")

## Version Control

In [None]:
# Work on a NaN-free copy so the raw load in df_001 stays untouched.
# (`df = df_001` followed by an in-place dropna would also strip rows from
# df_001, because both names would refer to the same DataFrame object.)
df = df_001.dropna()

In [None]:
# Report the dimensions of the working frame.
n_rows, n_cols = df.shape
print(f"The data set has {n_rows} rows and {n_cols} columns")

In [None]:
# Preview the first rows of df (displayed as the cell output).
df.head()

In [None]:
# Show the column names available in df.
df.columns

### Organise Features

Postal code data is incomplete (~23% of the data is missing)

In [None]:
# Remove the member identifier column before declaring the feature groups.
df = df.drop(columns=['d4g_member_id'])

# Column groups: numeric inputs, categorical inputs, and the target.
numerical_features = ['Y1_Age', 'checkin_total', 'club_km']

categorical_features = ['member_location.x', 'sex', 'season_most']

target = ['label']

## Numerical Features

In [None]:
# Summary statistics for the numerical features only.
df.loc[:, numerical_features].describe()

In [None]:
# One 30-bin histogram per numerical feature.
df.hist(column=numerical_features, bins=30, figsize=(10, 7))

In [None]:
# Overlay the numerical-feature histograms of the two label groups
# (static in blue, engaged in red) on a shared row of three axes.
is_static = df.label == "static"
is_engaged = df.label == "engaged"
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 5))
df.loc[is_static, numerical_features].hist(bins=30, color="blue", alpha=0.5, ax=ax)
df.loc[is_engaged, numerical_features].hist(bins=30, color="red", alpha=0.5, ax=ax)

In [None]:
# Plot histogram of Engaged vs Distance to clubhouse.
# sns.distplot is deprecated in seaborn; with kde=False it only draws a
# matplotlib histogram, so plt.hist is called directly. alpha=0.4 matches
# the translucency distplot applied to its bars by default.
engaged_km = df.loc[df.label == "engaged", "club_km"]
plt.hist(engaged_km, bins=55, color="green", alpha=0.4, label="Engaged")
plt.xlabel("Distance to Clubhouse (km)")
plt.ylabel("Number of Members")
# Annotation is positioned in data coordinates (x=5, y=175).
plt.text(5, 175, "Bars are roughly 1 km")
plt.legend()

plt.show()

## Categorical Features

In [None]:
# Bar chart of value counts for every categorical feature, laid out on a
# ROWS x COLS grid of axes.
ROWS, COLS = 2, 2
fig, ax = plt.subplots(ROWS, COLS, figsize=(9, 9))
for i, categorical_feature in enumerate(categorical_features):
    # Fill the grid left to right, top to bottom.
    row, col = divmod(i, COLS)
    counts = df[categorical_feature].value_counts()
    # `kind` must be a keyword: recent pandas versions raise TypeError when
    # Series.plot is called with positional arguments.
    counts.plot(kind='bar', ax=ax[row, col]).set_title(categorical_feature)

# Hide any grid cells that did not receive a feature.
for spare_axis in ax.ravel()[len(categorical_features):]:
    spare_axis.set_visible(False)

In [None]:
# Value counts of the club location, one panel per label value.
feature = "member_location.x"
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for panel, label_value in zip(ax, ("static", "engaged")):
    counts = df.loc[df.label == label_value, feature].value_counts()
    # `kind` is passed by keyword: recent pandas versions raise TypeError
    # when Series.plot receives positional arguments.
    counts.plot(kind='bar', ax=panel).set_title(label_value)

In [None]:
# Value counts of sex, one panel per label value.
feature = "sex"
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for panel, label_value in zip(ax, ("static", "engaged")):
    counts = df.loc[df.label == label_value, feature].value_counts()
    # `kind` is passed by keyword: recent pandas versions raise TypeError
    # when Series.plot receives positional arguments.
    counts.plot(kind='bar', ax=panel).set_title(label_value)

In [None]:
# Value counts of the season_most column, one panel per label value.
feature = "season_most"
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for panel, label_value in zip(ax, ("static", "engaged")):
    counts = df.loc[df.label == label_value, feature].value_counts()
    # `kind` is passed by keyword: recent pandas versions raise TypeError
    # when Series.plot receives positional arguments.
    counts.plot(kind='bar', ax=panel).set_title(label_value)

## Target Feature

In [None]:
# Bar chart of how many rows carry each label value.
# `kind` is passed by keyword: recent pandas versions raise TypeError when
# Series.plot receives positional arguments.
# NOTE(review): the title reads 'engaged' although the chart shows every
# label value - confirm the intended title.
df['label'].value_counts().plot(kind='bar').set_title('engaged')

## Prepare Data

In [None]:
# Preview df before the encoding step.
df.head()

In [None]:
# Encode text columns as numbers: two-valued columns are label-encoded in
# place, all other text columns are one-hot encoded with get_dummies.
from pandas.api.types import is_object_dtype, is_string_dtype
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
label_encoders = {}  # column name -> fitted encoder, so integer codes can be mapped back
dummy_columns = []  # text columns with a number of distinct values other than two

for column in df.columns:
    # Accept both the classic object dtype and pandas' dedicated string dtype.
    is_text = is_object_dtype(df[column]) or is_string_dtype(df[column])
    # NOTE(review): 'customerID' is not referenced anywhere else in this
    # notebook - confirm whether this exclusion is still needed.
    if not is_text or column == 'customerID':
        continue
    if df[column].nunique() == 2:
        # A fresh encoder per column keeps each column's class-to-code
        # mapping available; `le` still ends up as the last fitted encoder.
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
    else:
        dummy_columns.append(column)

# apply get dummies for selected columns
df = pd.get_dummies(data=df, columns=dummy_columns)

In [None]:
# Preview df after the encoding step.
df.head()

In [None]:
# Separate the target column from the predictor columns.
y = df['label']
X = df.drop(columns='label')

In [None]:
from sklearn.model_selection import train_test_split  # create train, test sets

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.25, random_state=65)
X_train, X_test, y_train, y_test = split

In [None]:
# Sanity-check the shapes of the four pieces produced by the split.
shape_report = (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both the training and the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier  # random forest model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Build a 1000-tree random forest with a fixed seed and fit it on the
# training split (fit returns the estimator itself).
forest_params = {"n_estimators": 1000, "random_state": 65}
rf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Predict labels for the held-out test data.
y_pred = rf.predict(X_test)

In [None]:
# Print the confusion matrix, the per-class report and the accuracy, in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

### Visualise Results

In [None]:
# Import matplotlib for plotting (it is also imported in the first code cell)
import matplotlib.pyplot as plt

# Jupyter magic command: render figures inline in the notebook
%matplotlib inline  

# Set the plotting style
plt.style.use('fivethirtyeight')

In [None]:
# Names of the model's input columns, in the order of X's columns.
# Defined here because this cell runs before the plotting cell that also
# builds the list; without it a top-to-bottom run raises NameError.
feature_list = list(X.columns)

# Get numerical feature importances
importances = list(rf.feature_importances_)

# Pair each variable with its importance (rounded to two decimals) and
# sort the pairs with the most important variable first.
feature_importances = sorted(
    (
        (feature, round(importance, 2))
        for feature, importance in zip(feature_list, importances)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
# Tick labels (the columns of X) and one x location per importance value
feature_list = list(X.columns)
x_values = [*range(len(importances))]

# Make a bar chart of the importances
plt.bar(x_values, importances, orientation='vertical')

# Tick labels for x axis, written vertically
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');