# Spending Prediction using Linear Regression and Decision Tree

In [None]:
# Importing libraries 
import numpy as np
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.exceptions import ConvergenceWarning,DataConversionWarning
import warnings
# Silence these warning categories for the rest of the notebook so that
# cell output stays readable.
for _ignored_category in (
    ConvergenceWarning,
    RuntimeWarning,
    FutureWarning,
    DataConversionWarning,
):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [None]:
# Read the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='Capital One.csv')

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Check the size of the dataset as a (rows, columns) tuple
df.shape

# EDA of the Variables of the Dataset

In [None]:
# Get the label column. Missing values are dropped for plotting only (df itself
# is untouched): NaNs break the histogram's range detection and would leave the
# boxplot empty, and this cell runs before the notebook's dropna step.
label = df['SPEND_M3_TOTAL'].dropna()


# Create a figure for 2 subplots (2 rows, 1 column)
fig, ax = plt.subplots(2, 1, figsize=(9, 12))

# Plot the histogram
ax[0].hist(label, bins=100)
ax[0].set_ylabel('Frequency')

# Add labelled reference lines for the mean and the median
ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2,
              label='Mean')
ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2,
              label='Median')
ax[0].legend()

# Plot the boxplot
ax[1].boxplot(label, vert=False)
ax[1].set_xlabel('Total Spend')

# Add a title to the Figure
fig.suptitle('Spend Distribution')

# Show the figure. plt.show() is used instead of fig.show() because the latter
# only warns (and renders nothing) on non-GUI backends such as the notebook one.
plt.show()

In [None]:
# Plotting the count of parents and non-parents
plt.figure(figsize=(10, 8))
# The column must be passed as keyword `x`: seaborn >= 0.12 makes every
# argument except `data` keyword-only, so a positional column name fails.
sns.countplot(x="PARENT", data=df)

In [None]:
# Plotting mobile app user and non mobile app user count
plt.figure(figsize=(10, 8))
# The column must be passed as keyword `x`: seaborn >= 0.12 makes every
# argument except `data` keyword-only, so a positional column name fails.
sns.countplot(x="MOBILE_APP_USER", data=df)

In [None]:
# Plotting correlation among the numeric features
plt.figure(figsize=(10, 8))
# Restrict to numeric/boolean columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on string columns.
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), cmap="viridis")

In [None]:
# Columns treated as categorical in the per-category boxplots
categorical_features = [
    'REGION',
    'PARENT',
    'OCCUPATION',
    'MOBILE_APP_USER',
    'CARD_COLOUR',
]

In [None]:
# Draw one boxplot of the label (SPEND_M3_TOTAL) per categorical feature,
# each in its own 9x6 figure
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    df.boxplot(column='SPEND_M3_TOTAL', by=col, ax=ax)
    ax.set_title(f'Label by {col}')
    ax.set_ylabel("Spend")
plt.show()

In [None]:
# Show every row that contains at least one missing value
df.loc[df.isna().any(axis=1)]

In [None]:
# Row count before the rows with missing values are dropped
before_rows = len(df)
print(before_rows)

In [None]:
# Drop every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [None]:
# Row count once the rows with missing values have been dropped
after_rows = len(df)
print(after_rows)

In [None]:
# Number of rows removed by dropna (row count before minus row count after)
before_rows - after_rows

In [None]:
# List the column headers of the cleaned dataframe
df.columns

## Declaring Regression Features

In [None]:
# Columns used as model inputs for the first linear-regression / decision-tree pass
features = [
    'AGE',
    'PARENT',
    'MOBILE_APP_USER',
    'CREDIT_LIMIT',
    'SPEND_M1_TRAVEL',
    'SPEND_M1_GROCERY',
    'SPEND_M1_OTHER',
    'SPEND_M2_TRAVEL',
    'SPEND_M2_GROCERY',
    'SPEND_M2_OTHER',
]

## Specifying Prediction Target

In [None]:
# Column to predict, kept as a one-element list so that selecting it yields a DataFrame
target = [
    'SPEND_M3_TOTAL',
]

## Extract Features and Target ('SPEND_M3_TOTAL') Values into Separate Dataframes


In [None]:
# Feature matrix: all rows, only the columns listed in `features`
X = df.loc[:, features]

In [None]:
# Target values: all rows, only the column(s) listed in `target`
y = df.loc[:, target]

In [None]:
# Look at one example row of X (the row at integer position 2)
X.iloc[2]

In [None]:
# Display the target values held in y
y

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=324,
)

## Fitting the Linear Regression Model to the Training Set

In [None]:
# Create a linear-regression model and fit it on the training split
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Predict the target for the test split with the fitted linear model,
# then display the predictions
y_prediction = regressor.predict(X_test)
y_prediction

In [None]:
# Root mean squared error of the linear-regression predictions on the test split
linear_mse = mean_squared_error(y_test, y_prediction)
RMSE = sqrt(linear_mse)

In [None]:
# Show the RMSE computed for the linear-regression model
print(RMSE)

In [None]:
# Coefficient of determination (R^2) of the linear-regression predictions
# on the test split
import sklearn
sklearn.metrics.r2_score(y_test, y_prediction)

In [None]:
# Flatten the fitted coefficient array to 1-D and use it as "feature importance".
# NOTE(review): these are raw coefficients on inputs that were not scaled in this
# section, so their magnitudes depend on each feature's units; compare with care.
importance = regressor.coef_.flatten()

In [None]:
# Display the flattened linear-regression coefficients
importance

In [None]:
# Column names of X, in the same order as the coefficients above
X.columns

In [None]:
# Bar chart of the linear-regression coefficients, one bar per feature column
feature_names = X.columns

coef_fig, coef_ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=importance, y=feature_names, ax=coef_ax)
coef_ax.set_title("Linear Regression Feature Importance")
coef_ax.set_xlabel("Feature Coefficient")
coef_ax.set_ylabel("feature_names")
plt.show()

## Using decision tree regressor to make predictions

In [None]:
# Fitting the decision-tree model to the training set.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at every split; without a seed, ties between equally good
# splits can be broken differently from run to run, changing the reported metrics.
tree = DecisionTreeRegressor(max_depth=20, random_state=324)
tree.fit(X_train, y_train)

In [None]:
# Predict the target for the test split with the fitted decision tree,
# then display the predictions
z_prediction = tree.predict(X_test)
z_prediction

In [None]:
# Feature importances of the fitted decision-tree regressor, as a flat array
tree.feature_importances_.flatten()

In [None]:
# Root mean squared error of the decision-tree predictions on the test split
# (rebinds RMSE, replacing the linear-regression value)
tree_mse = mean_squared_error(y_test, z_prediction)
RMSE = sqrt(tree_mse)

In [None]:
# Show the most recently computed RMSE
print(RMSE)

In [None]:
# R^2 of the decision-tree predictions on the test split. Uses the `r2_score`
# imported at the top of the notebook, so this cell does not depend on an
# `import sklearn` having been executed in some earlier cell.
r2_score(y_test, z_prediction)

# Using Pipeline and OneHotEncoder to Preprocess Categorical Data before Prediction

In [None]:
# Model inputs for the pipeline-based models: the numeric columns plus the
# categorical ones that get one-hot encoded
features = [
    'REGION',
    'AGE',
    'OCCUPATION',
    'CARD_COLOUR',
    'PARENT',
    'MOBILE_APP_USER',
    'CREDIT_LIMIT',
    'SPEND_M1_TRAVEL',
    'SPEND_M1_GROCERY',
    'SPEND_M1_OTHER',
    'SPEND_M2_TRAVEL',
    'SPEND_M2_GROCERY',
    'SPEND_M2_OTHER',
]
X = df.loc[:, features]

In [None]:
# Prediction target, selected with a one-element list so y is a DataFrame
target = [
    'SPEND_M3_TOTAL',
]
y = df.loc[:, target]

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=324,
)

In [None]:
# Columns routed to the one-hot encoder
categorical_features = [
    'REGION',
    'PARENT',
    'OCCUPATION',
    'MOBILE_APP_USER',
    'CARD_COLOUR',
]
# Columns routed to the scaler
numeric_features = [
    'AGE',
    'CREDIT_LIMIT',
    'SPEND_M1_TRAVEL',
    'SPEND_M1_GROCERY',
    'SPEND_M1_OTHER',
    'SPEND_M2_TRAVEL',
    'SPEND_M2_GROCERY',
    'SPEND_M2_OTHER',
]

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Preprocessing for numeric columns: standardise them
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Preprocessing for categorical columns: one-hot encode them; categories that
# were not seen during fit are ignored at predict time instead of raising
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps, routing each column group to its transformer.
# NOTE: the same preprocessor object is shared by all three pipelines below, so
# each fit refits it; this is harmless here because every pipeline is fitted on
# the same X_train.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# y_train is a single-column DataFrame. Flatten it to the 1-D target the
# regressors expect rather than relying on the DataConversionWarning filter.
y_train_1d = np.ravel(y_train)

# Preprocessing + gradient boosting (seeded for reproducible results)
pipeline1 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', GradientBoostingRegressor(random_state=324))])

# Preprocessing + decision tree (seeded for reproducible results)
pipeline2 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor(max_depth=20, random_state=324))])

# Preprocessing + linear regression
pipeline3 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])


# Fit each pipeline on the training set and print the fitted pipeline
model1 = pipeline1.fit(X_train, y_train_1d)
print(model1)

model2 = pipeline2.fit(X_train, y_train_1d)
print(model2)

model3 = pipeline3.fit(X_train, y_train_1d)
print(model3)

In [None]:
# Get predictions for GradientBoostingRegressor on the held-out test split
predictions1 = model1.predict(X_test)

# Header fixed from the misspelled 'Moedel' so it matches the other model reports
print('Model: GradientBoostingRegressor')
# Display evaluation metrics: mean squared error, its square root, and R^2
mse1 = mean_squared_error(y_test, predictions1)
print("MSE:", mse1)
rmse1 = np.sqrt(mse1)
print("RMSE:", rmse1)
Fr2 = r2_score(y_test, predictions1)
print("R2:", Fr2)

In [None]:
# Score the decision-tree pipeline on the held-out test split
predictions2 = model2.predict(X_test)

print('Model: DecisionTreeRegressor')
# Compute the metrics first, then report them in MSE / RMSE / R2 order
mse2 = mean_squared_error(y_test, predictions2)
rmse2 = np.sqrt(mse2)
Sr2 = r2_score(y_test, predictions2)
for metric_label, metric_value in (("MSE:", mse2), ("RMSE:", rmse2), ("R2:", Sr2)):
    print(metric_label, metric_value)

In [None]:
# Score the linear-regression pipeline on the held-out test split
predictions3 = model3.predict(X_test)

print('Model: Linear regression')
# Compute the metrics first, then report them in MSE / RMSE / R2 order
mse3 = mean_squared_error(y_test, predictions3)
rmse3 = np.sqrt(mse3)
Tr2 = r2_score(y_test, predictions3)
for metric_label, metric_value in (("MSE:", mse3), ("RMSE:", rmse3), ("R2:", Tr2)):
    print(metric_label, metric_value)

In [None]:
# Re-select the full input column set (numeric plus categorical) from df
features = [
    'REGION',
    'AGE',
    'OCCUPATION',
    'CARD_COLOUR',
    'PARENT',
    'MOBILE_APP_USER',
    'CREDIT_LIMIT',
    'SPEND_M1_TRAVEL',
    'SPEND_M1_GROCERY',
    'SPEND_M1_OTHER',
    'SPEND_M2_TRAVEL',
    'SPEND_M2_GROCERY',
    'SPEND_M2_OTHER',
]
X = df.loc[:, features]