In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.metrics import accuracy_score

In [None]:
# These first few steps will involve importing, checking and cleaning the data if necessary

In [None]:
# Load the training and test CSV files from the Kaggle input directory.
traindata = pd.read_csv('/kaggle/input/job-promotion-analysis-dataset/train_data.csv')
testdata = pd.read_csv('/kaggle/input/job-promotion-analysis-dataset/test_data.csv')

# Preview the first rows of the training data.
traindata.head()

In [None]:
# Summary statistics of the training data.
traindata.describe()

In [None]:
# Count the missing values in each column of the training data.
traindata.isnull().sum()

In [None]:
# List the column names of the training data.
traindata.columns

In [None]:
# (rows, columns) of the test data.
testdata.shape

# Data Description and Exploratory Visualisations

In [None]:
# Preview the first rows of the training data.
traindata.head()

In [None]:
# Count the missing values in each column of the test data.
testdata.isnull().sum()

In [None]:
# Summary statistics of the training data.
traindata.describe()

In [None]:
# Draw a histogram for each column that DataFrame.hist can plot, on a 20x20 inch figure.
traindata.hist(figsize=(20,20))
plt.show()

In [None]:
# Remove the 'Surname' column from the training data, as it is deemed redundant.
traindata = traindata.drop(columns=['Surname'])

# Encoding The Non-Numerics

In [None]:
from sklearn import preprocessing
# Create a LabelEncoder, which maps string categories to integer codes.
le = preprocessing.LabelEncoder()
# We import the sklearn library to gain certain processing functionality

In [None]:
# Column names of the training data after 'Surname' was dropped.
traindata.columns

In [None]:
# Preview the rows before the string columns are encoded.
traindata.head()

In [None]:
# Convert string labels to numbers.
# Each categorical column gets its own encoder, fitted on the training data only and
# then reused to transform the test data, so that a given category is mapped to the
# same integer in both frames. (Fitting a separate encoder on the test data, as was
# done before, yields a different mapping whenever the two frames do not contain
# exactly the same set of categories.)
# encoder.transform raises ValueError if the test data contains a category that was
# never seen in the training data, which surfaces the mismatch instead of hiding it.
for column in ['Geography', 'Gender']:
    encoder = preprocessing.LabelEncoder()
    traindata[column] = encoder.fit_transform(traindata[column])
    testdata[column] = encoder.transform(testdata[column])

In [None]:
# Preview the rows after encoding: Geography and Gender now hold integer codes.
traindata.head()

# Correlation

In [None]:
# Pairwise correlation matrix of the training-data columns.
traindata.corr()

In [None]:
# First attempt: annotated heatmap of the correlation matrix at the default figure size.
corr_matrix = traindata.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
# This is not very clear, so we shall use another map

In [None]:
# Larger heatmap with square cells and a diverging red/blue palette for readability.
colormap = plt.cm.RdBu
fig, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    traindata.corr(),
    ax=ax,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
)
plt.show()
# We have added another map

In [None]:
# From this we can infer that most relationships are weak.
# However, there is a strong correlation between the age of the customer and Exited.
# To be more precise, it would be better to calculate the correlations.

In [None]:
# Compute the pairwise correlation matrix of all columns.
corr = traindata.corr()
# Boolean mask that hides the upper triangle (diagonal included): the matrix is
# symmetric, so those cells only repeat what the lower triangle already shows.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Heatmap of the remaining lower triangle; vmax=.5 caps the colour scale at 0.5.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr,
    ax=ax,
    vmax=.5,
    mask=mask,
    annot=True, fmt='.2f',
    linewidths=.2, cmap="YlGnBu",
)
plt.show()

In [None]:
# From this we can infer that the Age-to-Exited correlation is still the strongest.
# However, it is also clear that the balance of a customer's account has a notable relation with Exited.
# Finally, the weakest relations are those of Gender. It can be inferred that gender does not really play a role in who exits.

In [None]:
# Correlation of every column with the target column 'Exited', sorted ascending.
# The target's correlation with itself (always 1.0) is dropped so that it is not
# reported as the most positively correlated "feature".
correlations = traindata.corr()['Exited'].drop('Exited').sort_values()
# Filter by sign before slicing so each heading only lists correlations of that sign;
# with fewer than 16 columns, plain tail(8)/head(8) would show the same features
# under both headings.
print('Most Positive Correlations: \n', correlations[correlations > 0].tail(8))
print('\nMost Negative Correlations: \n', correlations[correlations < 0].head(8))

In [None]:
# EstimatedSalary, Geography, Balance and Age have the strongest positive relations to Exited.
# The most negative correlations to Exited are those of IsActiveMember, Gender, NumOfProducts, Tenure and CreditScore.

# Analysis

In [None]:
# List the columns currently in the training data.
traindata.columns

# Additional Column (optional)

In [None]:
# Here I add an additional column called AmountSpent that I was able to infer from the given data. It may not be
# reflected in the final output, and the column may be dropped.

In [None]:
# Preview the training data before the derived column is added.
traindata.head()

## Trying to discover how much of the salary is spent, given the balance

In [None]:
# Derive 'AmountSpent' as EstimatedSalary minus Balance, after casting both columns
# to integers; the same rule is applied to the training and the test frame.
for frame in (traindata, testdata):
    frame['AmountSpent'] = frame['EstimatedSalary'].astype('int') - frame['Balance'].astype('int')

In [None]:
# Dropping CustomerId from the training data as it is irrelevant.
# (testdata keeps its CustomerId column; the export step reads it.)
traindata = traindata.drop(columns=['CustomerId'])
#traindata.drop('Gender', axis = 1 , inplace = True)

In [None]:
# Candidate predictor columns for the feature-importance check, plus the label column.
feature_columns = [
    'CreditScore', 'Geography', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]
features = traindata[feature_columns]
target = traindata['Exited']


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble to rank the candidate features by importance.
# random_state is fixed so that the ranking is reproducible between runs.
model = ExtraTreesClassifier(random_state=0)
model.fit(features, target)
print(model.feature_importances_)
# Label each importance with its column name and plot the largest ones (at most 10).
feat_importance = pd.Series(model.feature_importances_, index=features.columns)
ax = feat_importance.nlargest(10).plot(kind='barh')
ax.set(title='Feature importance (ExtraTreesClassifier)', xlabel='Importance')
plt.show()

In [None]:
# A single shared column list keeps the training matrix and the test matrix in the
# same column order. All three results are plain NumPy arrays.
model_columns = ['Age', 'Balance', 'EstimatedSalary', 'Geography', 'Tenure',
                 'NumOfProducts', 'CreditScore', 'IsActiveMember']
target = traindata['Exited'].values
features = traindata[model_columns].values
test = testdata[model_columns].values

In [None]:
# Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the labelled rows for evaluation; random_state fixes the shuffle so
# the split is reproducible. Every model cell below trains on features_train /
# target_train, so this split has to be executed for the notebook to run.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.1, random_state=0
)

In [None]:
#print(features_train.shape, target_train.shape, features_test.shape, target_test.shape )

In [None]:
# Display the training labels produced by the train/test split.
target_train

# Random Forest Method

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 200 trees; random_state is fixed so repeated runs build the same model.
clasifier = RandomForestClassifier(n_estimators=200, random_state=0)
model = clasifier.fit(features_train, target_train)

In [None]:
# Predict labels for the test-set feature matrix and display them.
target_pred = model.predict(test)
target_pred


# Gradient Boosting Algorithm

In [None]:
# Gradient Boosting granted me the greatest accuracy of all the techniques that were used

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters; random_state is fixed so repeated
# runs build the same model.
clasifier = GradientBoostingClassifier(random_state=0)
model = clasifier.fit(features_train, target_train)

In [None]:
# Predict labels for the test-set feature matrix; this overwrites the previous target_pred.
target_pred = model.predict(test)

# Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default hyper-parameters; random_state is fixed for reproducible results.
ad = AdaBoostClassifier(random_state=0)

In [None]:
# Fit AdaBoost on the training split, then predict the test-set feature matrix in one
# chained call; this overwrites the previous target_pred.
target_pred = ad.fit(features_train, target_train).predict(test)

In [None]:
# Score the fitted AdaBoost model on the hold-out split from train_test_split
# (features_test / target_test), i.e. labelled rows the model was not trained on.
accuracy_score(target_test, ad.predict(features_test))

# XG Boost

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor

# Fit an XGBoost classifier on the training split and predict the test-set feature
# matrix; this overwrites the previous target_pred.
xgb_params = dict(learning_rate=0.7, max_depth=6, alpha=10, n_estimators=100)
classifiers = XGBClassifier(**xgb_params)
target_pred = classifiers.fit(features_train, target_train).predict(test)

# Exporting

In [None]:
# Assemble the submission columns: the test-set customer ids and the predicted labels.
# NOTE(review): target_pred is reassigned by every model section above, so on a
# top-to-bottom run it holds the XGBoost predictions here - confirm that this is the
# model intended for the submission.
ext = dict(CustomerId=testdata['CustomerId'], Exited=target_pred)

In [None]:
# Build the submission table from the two columns and preview its first rows.
solution = pd.DataFrame(ext)
solution.head()

In [None]:
# Write the submission to the working directory; index=False omits the row-index column.
solution.to_csv('solution19.csv',index = False)