## ${\textbf{Libraries}}$

In [180]:
import numpy as np
import pandas as pd
import scipy

#These are the visualization libraries. Matplotlib is standard and is what most people use.
#Seaborn works on top of matplotlib, as we mentioned in the course.
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

#For standardizing features. We'll use the StandardScaler class.
from sklearn.preprocessing import StandardScaler

#Hierarchical clustering with the SciPy library. We'll use the dendrogram and linkage functions.
from scipy.cluster.hierarchy import dendrogram, linkage
#scikit-learn is one of the most widely used libraries for machine learning. We'll use the KMeans and PCA classes.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# We need to save the models, which we'll use in the next section. We'll use pickle for that.
import pickle

import warnings
warnings.filterwarnings('ignore')

## ${\textbf{Import Data}}$

In [181]:
# Load the customer data from the segmentation data CSV file.
# index_col = 0 uses the first column of the file as the row index instead of as a feature.
df= pd.read_csv('segmentation data.csv', index_col = 0)

In [182]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 100000001 to 100002000
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Sex              2000 non-null   int64
 1   Marital status   2000 non-null   int64
 2   Age              2000 non-null   int64
 3   Education        2000 non-null   int64
 4   Income           2000 non-null   int64
 5   Occupation       2000 non-null   int64
 6   Settlement size  2000 non-null   int64
dtypes: int64(7)
memory usage: 125.0 KB


## ${\textbf{Correlation Estimate}}$

In [183]:
# Compute the pairwise correlation between the features in our data set.
# pandas' DataFrame.corr() uses the Pearson correlation coefficient by default.
df.corr()

Unnamed: 0,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
Sex,1.0,0.566511,-0.182885,0.244838,-0.195146,-0.202491,-0.300803
Marital status,0.566511,1.0,-0.213178,0.374017,-0.073528,-0.02949,-0.097041
Age,-0.182885,-0.213178,1.0,0.654605,0.34061,0.108388,0.119751
Education,0.244838,0.374017,0.654605,1.0,0.233459,0.064524,0.034732
Income,-0.195146,-0.073528,0.34061,0.233459,1.0,0.680357,0.490881
Occupation,-0.202491,-0.02949,0.108388,0.064524,0.680357,1.0,0.571795
Settlement size,-0.300803,-0.097041,0.119751,0.034732,0.490881,0.571795,1.0


## ${\textbf{Standardization}}$

In [184]:
# Standardize the features (zero mean, unit variance) so that each one carries equal
# weight in the model. Without this, a large-valued feature such as Income would
# dominate a small-valued one such as Education purely because of its scale.
# We have no evidence that Income matters more, so we avoid building that bias into the model.
scaler = StandardScaler()
df_std = scaler.fit_transform(df)

## ${\textbf{K-means Clustering}}$

In [185]:
# Fit K-means for k = 1..10 and record the within-cluster sum of squares (WCSS)
# of each fit, so the values can be compared across cluster counts.
# 'k-means++' only controls how the initial centroids are seeded; the number of
# independent restarts is controlled by n_init. It is pinned to 10 here because
# the library default differs between scikit-learn releases, which would
# otherwise make the fitted clusters depend on the installed version.
# random_state fixes the seeding so that re-runs give the same result.
wcss = []
for i in range(1,11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', n_init = 10, random_state = 42)
    kmeans.fit(df_std)
    wcss.append(kmeans.inertia_)

In [186]:
# Configure the final K-means model with a fixed number of clusters: 4.
# n_init is pinned to 10 restarts because the library default differs between
# scikit-learn releases; random_state keeps the result repeatable.
kmeans = KMeans(n_clusters = 4, init = 'k-means++', n_init = 10, random_state = 42)

In [187]:
# Fit the 4-cluster model on the standardized features. After fitting, the
# cluster assigned to each row is available as kmeans.labels_.
kmeans.fit(df_std)

KMeans(n_clusters=4, random_state=42)

### ${\textbf{Results}}$

In [188]:
# We create a new data frame with the original features and add a new column with the assigned clusters for each point.
# df_segm_kmeans= df_std.copy()
# df_segm_kmeans = pd.DataFrame(data = df_std,columns = df.columns)
# df_segm_kmeans['Segment K-means'] = kmeans.labels_

In [189]:
# df_segm_kmeans.head()

In [192]:
# Build the labelled data set: the original (unscaled) features with a fresh
# 0-based index, plus one column holding each row's K-means cluster label.
final = df.reset_index(drop = True).assign(**{'Segment K-means': kmeans.labels_})
final.head()

Unnamed: 0,Sex,Marital status,Age,Education,Income,Occupation,Settlement size,Segment K-means
0,0,0,67,2,124670,1,2,0
1,1,1,22,1,150773,1,2,2
2,0,0,49,1,89210,0,0,1
3,0,0,45,1,171565,1,1,3
4,0,0,53,1,149031,1,1,3


In [140]:
# Inputs for the classifier: every customer attribute except Education, taken
# from the original (unscaled) columns, in this exact order. Selecting by name
# rather than by position keeps the selection correct if the column order of
# the CSV ever changes, and fails loudly if a column is missing.
# NOTE(review): Education is left out, exactly as in the previous positional
# selection [0, 1, 2, 4, 5, 6] - confirm that this is intentional.
feature_columns = ['Sex', 'Marital status', 'Age', 'Income', 'Occupation', 'Settlement size']
x=final[feature_columns].values
# Target: the cluster label that K-means assigned to each row.
y=final['Segment K-means'].values

In [141]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(x,y, test_size=0.2, random_state=0)


In [142]:
# print(train_y)

In [143]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest to predict the K-means segment from the selected
# features; random_state fixes the forest's randomness for repeatable results.
model = RandomForestClassifier(random_state=13)
model.fit(train_x, train_y)

RandomForestClassifier(random_state=13)

In [144]:
# Predict segments for the held-out rows and report the mean accuracy on them.
predicted = model.predict(test_x)
model.score(test_x, test_y)

0.9875

In [134]:
from sklearn.metrics import roc_auc_score

# Class-membership probabilities for the held-out rows, one column per segment.
probabilities = model.predict_proba(test_x)
# The target has more than two classes, so roc_auc_score needs an explicit
# multi-class strategy; 'ovr' scores each segment against all the others.
roc_auc_score(test_y, probabilities, multi_class='ovr')

In [135]:
from sklearn.metrics import confusion_matrix
# Rows are the true segments, columns the predicted segments, for the held-out rows.
# NOTE(review): this cell's saved execution count (In [135]) is lower than that of the
# cell that fits the model (In [143]), so any stored output predates the current model.
# Restart the kernel and run all cells before relying on it.
confusion_matrix(test_y, predicted)

array([[ 36,   1,   2,   9],
       [  2,  41,  34,  18],
       [  3,  23, 108,   8],
       [  3,   8,  11,  93]], dtype=int64)

In [136]:
from sklearn.metrics import precision_score

# Micro-averaged precision on the TRAINING rows. This measures how well the
# forest reproduces data it has already seen, not how well it generalizes;
# use the held-out score, not this value, to judge model quality.
train_predictions = model.predict(train_x)
precision_score(train_y, train_predictions, average='micro')

1.0

In [137]:
# print(train_x[100])
# train_predictions = model.predict_proba([train_x[100]])
# print(train_predictions)

In [138]:
# Persist the trained classifier to disk. The context manager closes the file
# (flushing the written bytes) even if pickling raises part-way through.
# pickle is already imported in the notebook's import cell.
with open('model3.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [139]:
# Reload the saved classifier and predict the segment of one example customer.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code embedded in the file.
with open('model3.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

# Example customer. The classifier was trained on six columns, so all six
# values must be supplied, in the training order:
# Sex, Marital status, Age, Income, Occupation, Settlement size.
# (Education is not one of the model's inputs.)
Sex=0
Marital_status=0
Age=45
Income=171565
Occupation=1
Settlement_size=1
predict=pickled_model.predict([[Sex,Marital_status,Age,Income,Occupation,Settlement_size]])
print(predict)

[3]
