In [1]:
import pandas as pd
import numpy as np

In [2]:
#loading the data into the pandas dataframe
df = pd.read_csv("ANZ.csv")
df.head() #displaying the first few rows of our data

# ACCESS

In [None]:
df.info() #extracting the concise summary of the data 

#to check for nullvalues; print(ANZ.isnull().sum())

In a real world data situation, some of the unselected features data here might be useful in consideration for the ananlysis:
1. like long_lat, to describe or pinpoint which exact place the transaction was made.
2. the merchant_suburb, merchant_long_lat and state to easily track merchants in the case of transactional discrepancies
3. extraction, to check trends in a time series analysis, check for example which time of the day customers bought the most
4. hmm i also think movement will be useful to know the rate at which accounts are debited and credited.

In [None]:
df.describe()

Notes
* Fix date data type change data data type from oject to date data type
* Encode Gender data to ones and zeros. [one hot encoding]

# CLEAN

#### DEFINE

* change date data type to datetime data type
* check if its possible to assign the customer id column to the first name column
* use fewer and more important columns [feature selection]

#### CODE

In [None]:
df_clean = pd.read_csv('ANZ.csv', usecols=[0,6,9,10,11,12,13,14,17,20])
#Feature selection, selecting important features

In [None]:
df_clean.head()

In [None]:
#checking for duplicated rows/observations
sum(df_clean.duplicated())

In [None]:
sum(df.duplicated()) #the original data has no duplicates too.

In [None]:
df_clean.info()

In [None]:
# fixing incorrect data type (date)
df_clean['date'] = pd.to_datetime(df_clean['date'])

In [None]:
df_clean.info() #checking the summary of the data to see if the date's datatype has changed

In [10]:
df_clean.gender= pd.get_dummies(df_clean.gender) 
#encoding the gender feature into ones and zeros
# males are zeros and females are ones.

In [None]:
df_clean.head()

In [None]:
df_clean.info()

In [None]:
#to view data plots with the hist() function in pandas

%matplotlib inline
df_clean.hist();

* here we see that there are over 6k transactions of the 12043 transactions are from males and about 5k+ are females
* The customers with the highest percentage is within the 19-40 age range.

#### Calculations
* Calculate and Analyze the annual Salary for each customer
* Explore the correlations between the annual salary and features such as age, gender, and other purchasing behaviour
* Visualize correlations in a scatterplot
* after finding correlations, build a simple regression model that uses these features above to predict annual salary
* Build a decision tree based model to predict salaries.
* How accurate are the models? and what would you do to accurately test their performance?

In [None]:
#groupby trial 1
df_clean.groupby(['first_name', 'txn_description'])['amount'].sum()

In [None]:
#trial 2 that produced the result well
df_clean[df_clean['txn_description']== 'PAY/SALARY'].groupby(['first_name'])['amount'].sum()

# anywhere the txn is pay/salary, print the first name and sum of amount.

In [None]:
pd.options.display.max_rows = 4000
#to remove the ellipses in the data, to view the full data of the 100 customers in  display

In [None]:
#after trial 2, generating useful data with more features
df_clean[df_clean['txn_description']=='PAY/SALARY'].groupby(['customer_id'])['amount'].sum()

In [None]:
x= df_clean[df_clean['txn_description']=='PAY/SALARY'].groupby('age')['amount'].sum()

In [None]:
x.sort_values(ascending = False)
#The 40 year olds are paid more than everyone else in their 
#customer list

* I first took the groupby method of the data frame itself without filtering by salary, and the ages with the top highest txns were 40, 21, 19, 38, etc
Now filtering by salary, the 19 year olds actually earn more than the 21 year olds but perform less transactions than them.

In [None]:
df_clean.groupby('first_name').count()

In [4]:
df_cleaned = pd.read_csv('quarterly.csv')

In [None]:
df_cleaned.head()

In [5]:
df_cleaned['annual'] = df_cleaned['quarterly'] *4

In [6]:
df_cleaned.head()

Unnamed: 0,customer_id,quarterly,annual
0,CUS-1005756958,12616.11,50464.44
1,CUS-1117979751,25050.55,100202.2
2,CUS-1140341822,11499.06,45996.24
3,CUS-1147642491,22248.07,88992.28
4,CUS-1196156254,27326.11,109304.44


* to setindex.

df_clean.set_index('customer_id',inplace=True)
df_clean

* correlations between annual salary and purchasing behaviour e.g. age, gender etc.

In [7]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn; seaborn.set()

In [6]:
df.drop(['status','date','merchant_suburb','balance'], axis=1, inplace=True)

In [11]:
#join the tables created with the pd.merge() function
data =pd.merge(df,df_cleaned, on = 'customer_id')

In [11]:
data.head()

Unnamed: 0,card_present_flag,bpay_biller_code,account,currency,long_lat,txn_description,merchant_id,merchant_code,first_name,gender,...,merchant_state,extraction,amount,transaction_id,country,customer_id,merchant_long_lat,movement,quarterly,annual
0,1.0,,ACC-1598451071,AUD,153.41 -27.95,POS,81c48296-73be-44a7-befa-d053f48ce7cd,,Diana,1,...,QLD,2018-08-01T01:01:15.000+0000,16.25,a623070bfead4541a6b0fff8a09e706c,Australia,CUS-2487424745,153.38 -27.99,debit,14191.38,56765.52
1,0.0,,ACC-1598451071,AUD,153.41 -27.95,SALES-POS,830a451c-316e-4a6a-bf25-e37caedca49e,,Diana,1,...,NSW,2018-08-01T01:13:45.000+0000,14.19,13270a2a902145da9db4c951e04b51b9,Australia,CUS-2487424745,151.21 -33.87,debit,14191.38,56765.52
2,1.0,,ACC-1598451071,AUD,153.41 -27.95,SALES-POS,b4e02c10-0852-4273-b8fd-7b3395e32eb0,,Diana,1,...,QLD,2018-08-01T01:51:15.000+0000,3.25,329adf79878c4cf0aeb4188b4691c266,Australia,CUS-2487424745,153.44 -28.06,debit,14191.38,56765.52
3,1.0,,ACC-1598451071,AUD,153.41 -27.95,POS,f2ef6270-cf91-409f-a6a2-fbd6735ea500,,Diana,1,...,NSW,2018-08-01T08:09:42.000+0000,14.1,1c12c9ad77894ef8b507fb091e41e928,Australia,CUS-2487424745,153.6 -28.63,debit,14191.38,56765.52
4,,0.0,ACC-1598451071,AUD,153.41 -27.95,PAY/SALARY,,0.0,Diana,1,...,,2018-08-01T14:00:00.000+0000,1013.67,a72c9dd8b4614a1982ff71f8b62677ad,Australia,CUS-2487424745,,credit,14191.38,56765.52


In [12]:
data.drop(['card_present_flag','bpay_biller_code','currency','merchant_id','merchant_code','merchant_state','extraction','merchant_long_lat','transaction_id','quarterly'], axis=1, inplace=True)

In [13]:
categories =['customer_id','txn_description','movement']
data= pd.get_dummies(data, columns=categories, drop_first=True)

In [13]:
data.head()

Unnamed: 0,account,long_lat,txn_description,first_name,gender,age,amount,country,customer_id,movement,annual
0,ACC-1598451071,153.41 -27.95,POS,Diana,1,26,16.25,Australia,CUS-2487424745,debit,56765.52
1,ACC-1598451071,153.41 -27.95,SALES-POS,Diana,1,26,14.19,Australia,CUS-2487424745,debit,56765.52
2,ACC-1598451071,153.41 -27.95,SALES-POS,Diana,1,26,3.25,Australia,CUS-2487424745,debit,56765.52
3,ACC-1598451071,153.41 -27.95,POS,Diana,1,26,14.1,Australia,CUS-2487424745,debit,56765.52
4,ACC-1598451071,153.41 -27.95,PAY/SALARY,Diana,1,26,1013.67,Australia,CUS-2487424745,credit,56765.52


In [None]:
data.head()

In [None]:
import seaborn as sns

In [14]:
data.corr()

Unnamed: 0,gender,age,amount,annual
gender,1.0,-0.011584,-0.031362,-0.054198
age,-0.011584,1.0,0.02998,0.026425
amount,-0.031362,0.02998,1.0,0.091213
annual,-0.054198,0.026425,0.091213,1.0


In [None]:
data.head()

fig, ax = plt.subplots(figsize=(12,12))
ax= seaborn.heatmap(data.corr(), annot= True)

In [None]:
plt.style.use('seaborn-whitegrid')
plt.plot(data['age'], data['annual'], 'ok');

* our output shows that the highest earners are males and are btw 30 - 45 years
* this plot also shows a non-linear relationship between the independent variable (age) and the dependent variable (annual)

In [None]:
plt.plot(data['gender'], data['annual'], 'o', color = 'black');
#here it shows that the men earn more than the women annually.

#I'm not sure if this plot shows a linear or non-linear realtionship between the gender and salary variables.

In [32]:
data.head()

Unnamed: 0,gender,age,amount,annual,movement_debit
0,1,26,16.25,56765.52,1
1,1,26,14.19,56765.52,1
2,1,26,3.25,56765.52,1
3,1,26,14.1,56765.52,1
4,1,26,1013.67,56765.52,0


In [18]:
data.iloc[:,10]

0         56765.52
1         56765.52
2         56765.52
3         56765.52
4         56765.52
           ...    
12038    123348.40
12039    123348.40
12040    123348.40
12041    123348.40
12042    123348.40
Name: annual, Length: 12043, dtype: float64

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = data.iloc[:, :3]
y = data.iloc[:, -2]

X_train, X_test, y_train, y_test= train_test_split(X, y, random_state = 42, test_size = 0.23)

In [25]:
data.drop(['account','txn_description','customer_id','long_lat'], axis=1, inplace=True)

categories =['country','movement']
data= pd.get_dummies(data, columns=categories, drop_first=True)

KeyError: "['account' 'txn_description' 'customer_id' 'long_lat'] not found in axis"

In [30]:
data.drop(['first_name'],1, inplace=True)

In [34]:
lrm = LinearRegression()
lrm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
def evaluate_model(model, X_test, y_test):
    score = model.score(X_test, y_test)
    
    return score

In [None]:
y_pred= lrm.predict(X_test)

In [36]:
# Let's check how well our model perform with the test set
score = evaluate_model(lrm, X_test, y_test)
score

0.012938179336601243

In [37]:
#accuracy
print('Accuracy on Testing set: %.1f' %(lrm.score(X_test,y_test)*100))

Accuracy on Testing set: 1.3
