<h2>Data preprocessing and Machine Learning</h2>

<h3>Importing moduls for first look at the data</h3>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('marketing_campaign.csv', delimiter=';')
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


<h3>Dropping not usable data</h3>

In [4]:
df.drop(['ID', 'Z_CostContact', 'Z_Revenue'], axis=1, inplace=True)

ID column Doesn't give us any result as it's just a data counter 
Z_CostContact and Z_Revenue we delete because this data have only 1 unique value

In [5]:
df['Start_Year'] = pd.DatetimeIndex(df['Dt_Customer']).year - df['Year_Birth']
df.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Start_Year
0,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,...,4,7,0,0,0,0,0,0,1,55
1,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,...,2,5,0,0,0,0,0,0,0,60
2,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,...,10,4,0,0,0,0,0,0,0,48
3,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,...,4,6,0,0,0,0,0,0,0,30
4,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,...,6,5,0,0,0,0,0,0,0,33


In [6]:
df.drop(['Year_Birth', 'Dt_Customer'], axis=1, inplace=True)

After receiving the column for how many years the client registered, I can safely delete both columns that I indicated above. 

In [7]:
categorical = [
    'Education',
    'Marital_Status',
    'Kidhome',
    'Teenhome',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'Complain'
]

numerical = [
    'Start_Year',
    'Income',
    'Recency',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth'
]

In [8]:
df.dropna(subset=['Income'], inplace=True)

And now let's remove 24 missing data from the Income column, I think this is a rather small number for our dataset of 2240 in size. 

<h3>Encoding data</h3>

In [9]:
df = pd.get_dummies(df, columns=categorical)
df.head()

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,...,AcceptedCmp4_0,AcceptedCmp4_1,AcceptedCmp5_0,AcceptedCmp5_1,AcceptedCmp1_0,AcceptedCmp1_1,AcceptedCmp2_0,AcceptedCmp2_1,Complain_0,Complain_1
0,58138.0,58,635,88,546,172,88,88,3,8,...,1,0,1,0,1,0,1,0,1,0
1,46344.0,38,11,1,6,2,1,6,2,1,...,1,0,1,0,1,0,1,0,1,0
2,71613.0,26,426,49,127,111,21,42,1,8,...,1,0,1,0,1,0,1,0,1,0
3,26646.0,26,11,4,20,10,3,5,2,2,...,1,0,1,0,1,0,1,0,1,0
4,58293.0,94,173,43,118,46,27,15,5,5,...,1,0,1,0,1,0,1,0,1,0


In [10]:
from sklearn.preprocessing import StandardScaler
ss= StandardScaler()
ss.fit(df[numerical])
df[numerical] = ss.transform(df[numerical])

### Machine Learning for Classification

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [12]:
X=df.drop('Response',axis=1)
y=df['Response']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [15]:
from sklearn.model_selection import GridSearchCV

<h3>KNeighbors Classifier

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
knn = KNeighborsClassifier()

In [28]:
parameters={
"n_neighbors":[3, 5, 7, 9, 11, 13],
"weights" : ['uniform', 'distance'],
"algorithm":['ball_tree', 'kd_tree', 'brute']
}

In [36]:
knn = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='auto')
knn.fit(X_train, y_train)
y_preds = knn.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))
print('--------------------------------------')

              precision    recall  f1-score   support

         yes       0.88      0.96      0.92       565
          no       0.52      0.27      0.36       100

    accuracy                           0.85       665
   macro avg       0.70      0.61      0.64       665
weighted avg       0.83      0.85      0.83       665

--------------------------------------


In [33]:
tuning_model=GridSearchCV(knn, param_grid=parameters, scoring='accuracy', cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

Is tuned
{'algorithm': 'ball_tree', 'n_neighbors': 7, 'weights': 'distance'}


In [34]:
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', algorithm='ball_tree')
knn.fit(X_train, y_train)
y_preds = knn.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))
print('--------------------------------------')

              precision    recall  f1-score   support

         yes       0.87      0.98      0.92       565
          no       0.55      0.17      0.26       100

    accuracy                           0.85       665
   macro avg       0.71      0.57      0.59       665
weighted avg       0.82      0.85      0.82       665

--------------------------------------


### Support Vector Machine

In [37]:
from sklearn.svm import SVC

In [38]:
svc = SVC()

In [39]:
C_value = np.arange(1, 3, 0.2)
C_value = C_value.tolist()
print(C_value)

[1.0, 1.2, 1.4, 1.5999999999999999, 1.7999999999999998, 1.9999999999999998, 2.1999999999999997, 2.3999999999999995, 2.5999999999999996, 2.8]


In [40]:
parameters={
'kernel': ['poly', 'rbf', 'sigmoid'],
'C': C_value,
'gamma': [1, 0.1, 0.01, 0.001]
}

In [48]:
svc = SVC(kernel='rbf', C=1.9999999999999998)
svc.fit(X_train, y_train)
y_preds = svc.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))

              precision    recall  f1-score   support

         yes       0.90      0.97      0.93       565
          no       0.70      0.37      0.48       100

    accuracy                           0.88       665
   macro avg       0.80      0.67      0.71       665
weighted avg       0.87      0.88      0.87       665



In [49]:
tuning_model=GridSearchCV(svc, param_grid=parameters, scoring='accuracy', cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

Is tuned
{'C': 1.7999999999999998, 'gamma': 0.1, 'kernel': 'rbf'}


In [50]:
svc = SVC(kernel='rbf', C=1.7999999999999998, gamma=0.1)
svc.fit(X_train, y_train)
y_preds = svc.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))

              precision    recall  f1-score   support

         yes       0.89      0.97      0.93       565
          no       0.63      0.31      0.42       100

    accuracy                           0.87       665
   macro avg       0.76      0.64      0.67       665
weighted avg       0.85      0.87      0.85       665



### Decision Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [52]:
tree = DecisionTreeClassifier()

In [61]:
parameters={
'criterion': ['gini', 'entropy'],
'splitter': ['best', 'random'],
'max_depth': [6, 7, 8, 9, 10, 11, 12, 13],
'max_features': ['auto', 'sqrt', 'log2'],
}

In [67]:
tuning_model=GridSearchCV(DecisionTreeClassifier(), param_grid=parameters, scoring='accuracy', cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

Is tuned
{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'splitter': 'best'}


In [69]:
tree = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=12, max_features='auto')
tree.fit(X_train, y_train)
y_preds = tree.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))

              precision    recall  f1-score   support

         yes       0.89      0.93      0.91       565
          no       0.47      0.37      0.41       100

    accuracy                           0.84       665
   macro avg       0.68      0.65      0.66       665
weighted avg       0.83      0.84      0.83       665



In [68]:
tree = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=6, max_features='sqrt')
tree.fit(X_train, y_train)
y_preds = tree.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))

              precision    recall  f1-score   support

         yes       0.89      0.97      0.93       565
          no       0.66      0.29      0.40       100

    accuracy                           0.87       665
   macro avg       0.77      0.63      0.67       665
weighted avg       0.85      0.87      0.85       665



### Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression

In [71]:
lgr = LogisticRegression()

In [72]:
parameters={"penalty":["l1","l2", "elasticnet"],
            "dual" : [True, False],
            "fit_intercept":[True, False],
            "C": C_value,
            "solver":["newton-cg","lbfgs","liblinear", "sag", "saga"]
}

In [None]:
tuning_model=GridSearchCV(lgr, param_grid=parameters, scoring='accuracy', cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

Is tuned
{'C': 2.8, 'dual': True, 'fit_intercept': False, 'penalty': 'l2', 'solver': 'liblinear'}

In [74]:
lrg = LogisticRegression(C=2.8, dual=True, fit_intercept=False, penalty='l2', solver='liblinear')
lrg.fit(X_train, y_train)
y_preds = lrg.predict(X_test)
target_names = ['yes', 'no']
print(classification_report(y_test, y_preds, target_names=target_names))

              precision    recall  f1-score   support

         yes       0.91      0.98      0.94       565
          no       0.79      0.48      0.60       100

    accuracy                           0.90       665
   macro avg       0.85      0.73      0.77       665
weighted avg       0.89      0.90      0.89       665





### Machine Learning for Regression

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [77]:
X=df.drop('Income',axis=1)
y=df['Income']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

### KNeighbors Regressor

In [80]:
from sklearn.neighbors import KNeighborsRegressor

In [81]:
knn = KNeighborsRegressor()

In [82]:
parameters={
"n_neighbors":[3, 5, 7, 11, 13],
"weights" : ['uniform', 'distance'],
"algorithm":['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [None]:
tuning_model=GridSearchCV(knn, param_grid=parameters, scoring=mean_squared_error, cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

In [84]:
knn = KNeighborsRegressor(n_neighbors=3, algorithm='auto', weights='uniform')
knn.fit(X_train, y_train)
y_preds = knn.predict(X_test)
score=knn.score(X_test, y_test)
print(score)
mse = mean_squared_error(y_test, y_preds)
print("Mean Squared Error:",mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

0.3210832538781869
Mean Squared Error: 1.1190400065802022
Root Mean Squared Error: 1.0578468729358717


### Support vector Regressor

In [88]:
from sklearn.svm import SVR

In [89]:
svr = SVR()

In [90]:
parameters={
'kernel': ['poly', 'rbf', 'sigmoid'],
'C': C_value,
'gamma': [1, 0.1, 0.01, 0.001]
}

In [None]:
tuning_model=GridSearchCV(svr, param_grid=parameters, scoring=mean_squared_error, cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

Is tuned
{'C': 1.0, 'gamma': 1, 'kernel': 'poly'}

In [92]:
svr = SVR()
svr.fit(X_train, y_train)
y_preds = svr.predict(X_test)
score=svr.score(X_test, y_test)
print(score)
mse = mean_squared_error(y_test, y_preds)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

0.3224652798584875
Mean Squared Error: 1.116762050746996
Root Mean Squared Error: 1.0567696299321796


In [93]:
svr = SVR(C=1.0, gamma=1, kernel='poly')
svr.fit(X_train, y_train)
y_preds = svr.predict(X_test)
score=svr.score(X_test, y_test)
print(score)
mse = mean_squared_error(y_test, y_preds)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

-1.3276301884029085
Mean Squared Error: 3.836569529659714
Root Mean Squared Error: 1.958716296368546


### DecisionTree Regressor

In [94]:
from sklearn.tree import DecisionTreeRegressor

In [95]:
tree = DecisionTreeRegressor()

In [96]:
parameters = {
    'criterion': ['friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [13, 14, 15, 16, 17, 18, 19],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [None]:
tuning_model=GridSearchCV(tree, param_grid=parameters, scoring=mean_squared_error, cv=5)
tuning_model.fit(X_train, y_train)
print('Is tuned')
print(tuning_model.best_params_)

Is tuned
{'criterion': 'friedman_mse', 'max_depth': 13, 'max_features': 'auto', 'splitter': 'best'}

In [99]:
tree = DecisionTreeRegressor(criterion='friedman_mse', max_depth=13, max_features='auto', splitter='best')
tree.fit(X_train, y_train)
y_preds = tree.predict(X_test)
score=tree.score(X_test, y_test)
print(score)
mse = mean_squared_error(y_test, y_preds)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

0.25800096036424114
Mean Squared Error: 1.2230168351857444
Root Mean Squared Error: 1.1059009156275008


In [106]:
tree = DecisionTreeRegressor(criterion='friedman_mse', max_depth=18, max_features='auto')
tree.fit(X_train, y_train)
y_preds = tree.predict(X_test)
score=tree.score(X_test, y_test)
print(score)
mse = mean_squared_error(y_test, y_preds)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

0.2789917628875146
Mean Squared Error: 1.1884182663215246
Root Mean Squared Error: 1.0901459839496381
