# Using Classification Models to Add a New Dating Profile

## Importing Necessary Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabaz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

### Loading the profiles

In [2]:
# Read the pickled, already-cleaned profiles DataFrame from disk.
# NOTE: pickle.load can execute arbitrary code -- only unpickle trusted files.
with open("profiles.pkl", 'rb') as profiles_file:
    raw_df = pickle.load(profiles_file)

# Preview the first few rows
raw_df.head()

Unnamed: 0,Bios,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel
0,Passionate analyst. Incurable pop culture practitioner. Creator. Total troublemaker. Subtly charming food lover. Bacon nerd. Writer.,7,10,5,9,7,6,11,10,8
1,Organizer. Hardcore web guru. Certified coffee ninja. Amateur explorer. Tv maven. Wannabe twitter practitioner. Food expert. Gamer.,12,4,2,1,12,1,4,1,6
2,Hipster-friendly social media scholar. Professional student. Tv junkie. Pop culture practitioner. Reader. Beer fanatic.,8,10,6,3,1,4,2,8,2
3,Writer. Introvert. Beer aficionado. Music specialist. Hipster-friendly tv fanatic.,4,1,9,4,10,7,1,4,12
4,Creator. Unable to type with boxing gloves on. General communicator. Troublemaker. Alcohol geek.,11,9,1,5,2,11,5,1,7


### Loading the Clustered profiles

In [3]:
# Read the pickled profiles DataFrame that already carries a
# 'Cluster Number' column.
# NOTE: pickle.load can execute arbitrary code -- only unpickle trusted files.
with open("clustered_profiles.pkl", 'rb') as clustered_file:
    cluster_df = pickle.load(clustered_file)

# Preview the last few rows
cluster_df.tail()

Unnamed: 0,Bios,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel,Cluster Number
14995,Coffee expert. Unapologetic twitter guru. Avid tv scholar. Internet trailblazer. Alcohol advocate.,8.0,0.0,4.0,8.0,9.0,12.0,2.0,12.0,4.0,0
14996,Freelance bacon enthusiast. Infuriatingly humble beer guru. Web lover. Reader. Amateur tv maven.,3.0,9.0,9.0,6.0,2.0,12.0,12.0,6.0,5.0,5
14997,Hardcore twitter practitioner. Extreme web expert. Pop culture evangelist.,7.0,6.0,6.0,11.0,5.0,2.0,11.0,0.0,11.0,1
14998,Hardcore tv guru. Troublemaker. Typical food lover. Beer expert. Entrepreneur. Analyst. Extreme creator.,8.0,7.0,7.0,1.0,11.0,8.0,1.0,8.0,3.0,7
14999,Beer evangelist. Entrepreneur. Certified social media guru. Coffee trailblazer. Music nerd. Pop culture ninja.,9.0,10.0,12.0,7.0,3.0,9.0,1.0,1.0,7.0,14


## Creating the New Profile Data

In [4]:
# Build a one-row DataFrame describing a brand-new profile, using the same
# columns as raw_df so it can be appended to / compared with existing profiles.
new_profile = pd.DataFrame(columns=raw_df.columns)

# Fill every column except the first with a single random integer.
# (columns[1:] skips the first column -- 'Bios' in the preview of raw_df.)
# np.random.randint(0, 10, 1) draws from 0-9 (upper bound exclusive); no seed
# is set in the visible cells, so the values differ on every run.
# NOTE(review): the displayed data contains values up to 12 -- confirm that a
# 0-9 range is intended for new profiles.
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Show the prompt header together with an example bio
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Blocks until the user types a bio; the text is stored in the 'Bios' column
new_profile['Bios'] = input("Enter a Bio for yourself: ")

# Give the new row the next label after the last index label of raw_df
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.
Enter a Bio for yourself:  food lover. social media fanatic. extraordinarily humble. life lover.


In [5]:
# Display the freshly created one-row profile
new_profile

Unnamed: 0,Bios,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel
15000,food lover. social media fanatic. extraordinarily humble. life lover.,5,6,6,5,2,5,5,8,9


## Classification of the New Profile

### Importing the Classification Models

In [6]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Vectorizing the Data

In [8]:
# Split the clustered data into features (X) and target (y).
# `columns=` is used instead of the positional axis argument (`drop([...], 1)`),
# which newer pandas releases no longer accept.
X = cluster_df.drop(columns=["Cluster Number"])
y = cluster_df['Cluster Number']

## Vectorizing
# Bag-of-words vectorizer; it is fitted here and reused later to transform
# the new profile's bio into the same columns.
vectorizer = CountVectorizer()

# Learn the vocabulary from the bios and build the document-term matrix
x = vectorizer.fit_transform(X['Bios'])

# One column per vocabulary word. The frame reuses X's index so the concat
# below aligns rows by label even if cluster_df is not indexed 0..n-1.
# NOTE(review): get_feature_names() is replaced by get_feature_names_out() in
# newer scikit-learn releases -- switch when the environment is upgraded.
df_wrds = pd.DataFrame(x.toarray(), index=X.index, columns=vectorizer.get_feature_names())

# Put the word counts next to the original numeric columns
X = pd.concat([X, df_wrds], axis=1)

# The raw text is no longer needed once it has been vectorized
X = X.drop(columns=['Bios'])

### Scaling the Data

In [9]:
# Fit a min-max scaler on the full feature matrix; the fitted scaler is kept
# so the new profile can be transformed with the same parameters later.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)

# Wrap the scaled array back into a DataFrame with the original labels
X = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

### Preparing the New Profile Data
For Vectorization purposes, the new profile will have to be able to fit into trained data (has to have the same columns).

Two Options:
1. __Vectorize the new profile data with the vectorizer already fitted to the dataset, so that no new vocabulary is introduced.__ _(Keeps the dimensionality the same.)_
2. Vectorize the new profile data with a new vectorizer fitted to it, in order to include new vocabulary. _(Increases the dimensionality with every new piece of data.)_

### Vectorizing and scaling the new Data

In [10]:
# Transform (not fit) the new bio with the vectorizer fitted on the dataset,
# so the new profile gets exactly the same word columns as the training data.
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# Word counts for the new profile, labelled with the new profile's index
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=vectorizer.get_feature_names(), index=new_profile.index)

# Join the numeric columns with the word counts and drop the raw text.
# Keyword arguments (`axis=1`, `columns=`) replace the positional axis
# arguments, which newer pandas releases no longer accept.
new_vect_prof = pd.concat([new_profile, new_vect_w], axis=1).drop(columns='Bios')

# Scale with the already-fitted scaler so the new row is on the same scale as X
new_vect_prof = pd.DataFrame(scaler.transform(new_vect_prof), columns=new_vect_prof.columns, index=new_vect_prof.index)

In [12]:
# Display the vectorized and scaled new profile
new_vect_prof

Unnamed: 0,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel,advocate,...,unable,unapologetic,wannabe,web,webaholic,winning,with,writer,zombie,zombieaholic
15000,0.416667,0.5,0.5,0.416667,0.166667,0.416667,0.416667,0.666667,0.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train-Test Splitting

In [13]:
# Train, test, split
# A fixed random_state makes the split -- and therefore the scores computed
# from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM

In [14]:
# Candidate models: a stratified dummy baseline, k-nearest neighbours and a
# support-vector classifier
dummy = DummyClassifier(strategy='stratified')
knn = KNeighborsClassifier()
svm = SVC()

# Display name -> estimator, in the order the models are evaluated
classifiers = {'Dummy': dummy, 'KNN': knn, 'SVM': svm}

# Parallel lists of names and estimators derived from the mapping
names = list(classifiers.keys())
models = list(classifiers.values())

Since we are dealing with an imbalanced dataset _(each cluster is not guaranteed to contain the same number of profiles)_, we will evaluate the performance of each model using the __macro-averaged F1 score__.

In [15]:
# Macro-averaged F1 score per model name, filled in by the loop below
models_f1 = {}

# Train each candidate on the training split and score it on the test split
for name in classifiers:
    model = classifiers[name]
    model.fit(X_train, y_train)

    print('\n{} (Macro Avg - F1 Score):'.format(name))

    # Per-class and averaged metrics as a nested dict
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)

    # Keep only the macro-averaged F1
    f1 = report['macro avg']['f1-score']
    models_f1[name] = f1

    print(f1)


Dummy (Macro Avg - F1 Score):
0.057383581494719255

KNN (Macro Avg - F1 Score):
1.0





SVM (Macro Avg - F1 Score):
0.9979457785959334


#### Since KNN has the highest score of the classification models, we use KNN to classify our new profile

### Using KNN to Classify the new Profile

In [16]:
# Refit KNN on the complete feature matrix, then predict which cluster the
# new profile belongs to (fit returns the estimator itself, so the calls chain)
designated_cluster = knn.fit(X, y).predict(new_vect_prof)

designated_cluster

array([2], dtype=int64)

### DataFrame with the new clustered Profile

In [17]:
# Boolean mask of the profiles assigned to the predicted cluster
in_designated_cluster = cluster_df['Cluster Number'] == designated_cluster[0]

# Keep only those profiles
des_cluster = cluster_df.loc[in_designated_cluster]

des_cluster

Unnamed: 0,Bios,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel,Cluster Number
1532,Award-winning coffee practitioner. Typical organizer. Professional internet geek. Passionate pop culture guru. Hardcore beer fanatic.,8.0,1.0,5.0,9.0,5.0,0.0,10.0,5.0,12.0,2
1539,Internet advocate. Organizer. Bacon fanatic. Gamer. Explorer. Award-winning zombie geek.,9.0,7.0,1.0,12.0,8.0,5.0,0.0,12.0,11.0,2
1547,Award-winning coffee practitioner. Typical organizer. Professional internet geek. Passionate pop culture guru. Hardcore beer fanatic.,9.0,1.0,4.0,3.0,0.0,0.0,5.0,1.0,1.0,2
1554,Internet advocate. Organizer. Bacon fanatic. Gamer. Explorer. Award-winning zombie geek.,8.0,3.0,1.0,2.0,4.0,0.0,1.0,1.0,3.0,2
1562,Award-winning coffee practitioner. Typical organizer. Professional internet geek. Passionate pop culture guru. Hardcore beer fanatic.,5.0,2.0,6.0,1.0,3.0,11.0,11.0,7.0,7.0,2
...,...,...,...,...,...,...,...,...,...,...,...
14932,Devoted twitter maven. Award-winning travel nerd. Writer. Amateur music fanatic. Tv expert. Extreme zombie evangelist.,6.0,0.0,11.0,2.0,3.0,2.0,8.0,6.0,2.0,2
14947,Devoted twitter maven. Award-winning travel nerd. Writer. Amateur music fanatic. Tv expert. Extreme zombie evangelist.,8.0,9.0,5.0,12.0,12.0,4.0,4.0,0.0,7.0,2
14962,Devoted twitter maven. Award-winning travel nerd. Writer. Amateur music fanatic. Tv expert. Extreme zombie evangelist.,9.0,1.0,12.0,6.0,7.0,8.0,11.0,1.0,1.0,2
14977,Devoted twitter maven. Award-winning travel nerd. Writer. Amateur music fanatic. Tv expert. Extreme zombie evangelist.,3.0,2.0,9.0,2.0,4.0,0.0,6.0,9.0,1.0,2


## Finding the Top 10 Matching Profiles of our New Profile

In [18]:
# Start from the members of the predicted cluster, selected from cluster_df.
# Rebuilding the selection here (instead of transforming des_cluster in place)
# keeps this cell re-runnable: running it twice no longer fails on the
# already-dropped 'Bios' column.
cluster_members = cluster_df[cluster_df['Cluster Number'] == designated_cluster[0]]

# Add the new profile as the last row. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
cluster_with_new = pd.concat([cluster_members, new_profile], sort=False)

# Use a separate vectorizer for this cluster-only vocabulary so the shared
# `vectorizer` (fitted on the whole dataset and used to build the classifier
# features) is not refitted as a side effect.
cluster_vectorizer = CountVectorizer()
cluster_x = cluster_vectorizer.fit_transform(cluster_with_new['Bios'])

# Word counts for every profile in the cluster, aligned on the profile index
cluster_v = pd.DataFrame(cluster_x.toarray(), index=cluster_with_new.index, columns=cluster_vectorizer.get_feature_names())

# Numeric answers + word counts; the raw text and the cluster label are dropped
des_cluster = cluster_with_new.join(cluster_v).drop(['Bios', 'Cluster Number'], axis=1)

des_cluster

Unnamed: 0,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel,advocate,...,troublemaker,tv,twitter,typical,unapologetic,wannabe,web,winning,writer,zombie
1532,8.0,1.0,5.0,9.0,5.0,0.0,10.0,5.0,12.0,0,...,0,0,0,1,0,0,0,1,0,0
1539,9.0,7.0,1.0,12.0,8.0,5.0,0.0,12.0,11.0,1,...,0,0,0,0,0,0,0,1,0,1
1547,9.0,1.0,4.0,3.0,0.0,0.0,5.0,1.0,1.0,0,...,0,0,0,1,0,0,0,1,0,0
1554,8.0,3.0,1.0,2.0,4.0,0.0,1.0,1.0,3.0,1,...,0,0,0,0,0,0,0,1,0,1
1562,5.0,2.0,6.0,1.0,3.0,11.0,11.0,7.0,7.0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14947,8.0,9.0,5.0,12.0,12.0,4.0,4.0,0.0,7.0,0,...,0,1,1,0,0,0,0,1,1,1
14962,9.0,1.0,12.0,6.0,7.0,8.0,11.0,1.0,1.0,0,...,0,1,1,0,0,0,0,1,1,1
14977,3.0,2.0,9.0,2.0,4.0,0.0,6.0,9.0,1.0,0,...,0,1,1,0,0,0,0,1,1,1
14992,4.0,7.0,6.0,8.0,1.0,4.0,9.0,10.0,4.0,0,...,0,1,1,0,0,0,0,1,1,1


#### Correlations to Similar Profiles

In [20]:
# Index label of the new user within the cluster frame
user_n = new_profile.index[0]

# Transposing the DF so that each column is a user, then correlating every
# user's feature vector with the new user's
corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

# Top 10 most similar profiles. The new user is removed by label rather than
# by slicing off the first sorted entry: if another profile ties at a
# correlation of 1.0 the new user is not guaranteed to sort first, and the
# slice would then keep the user and discard a real match.
top_10_sim = corr.drop(user_n).sort_values(ascending=False).head(10)

top_10_sim

6201     0.951051
9754     0.946710
5374     0.941760
8146     0.938373
6883     0.934954
10860    0.934073
8142     0.931952
11864    0.931194
6924     0.928821
5278     0.928735
dtype: float64

### Top 10 Similar Profiles

In [21]:
# Look up the original (unvectorized) rows of the most similar profiles by index label
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Drinks,Perfume,Vacation,Personal Style,Music,Hobby,Food,Scent,Feel
6201,Coffee lover. Introvert. Food fanatic. Analyst. Bacon advocate. Extreme twitter fanatic.,6,6,9,10,1,5,5,9,10
9754,Devoted writer. Lifelong explorer. Friendly gamer. Award-winning twitter specialist. Communicator.,6,5,7,3,2,7,7,11,12
5374,Analyst. Hardcore social media aficionado. Award-winning internet advocate. Student. Troublemaker.,11,12,11,6,3,4,6,12,11
8146,Coffeeaholic. Award-winning writer. Professional travel guru. Food fanatic. Communicator. Internet trailblazer.,9,10,5,6,2,6,7,10,8
6883,Troublemaker. Extreme organizer. Total explorer. Coffee advocate. Award-winning beer geek.,6,8,10,10,1,7,9,11,8
10860,Amateur analyst. Internet ninja. Extreme social media maven. Award-winning alcohol nerd.,8,6,6,3,3,3,6,12,10
8142,Zombie practitioner. Award-winning web nerd. Infuriatingly humble food fanatic. Social media lover.,12,9,9,6,5,10,12,10,9
11864,Award-winning coffee advocate. Bacon fanatic. Gamer. Introvert. Freelance analyst.,11,5,7,7,3,11,10,11,11
6924,Future teen idol. Alcohol practitioner. Food geek. Professional twitter advocate. Award-winning tv scholar. Proud organizer.,6,7,12,4,3,5,7,8,10
5278,Professional social media advocate. Twitter aficionado. Alcohol fanatic. Wannabe coffee lover. Beer nerd. Travel enthusiast.,8,12,5,4,2,10,11,10,11


## Saving the Classification Model for Future Use

In [22]:
# joblib's dump is imported here, at the point of use
from joblib import dump

# Persist the KNN classifier (refit on the full X, y in an earlier cell) to disk.
# NOTE(review): the model was trained on vectorized, min-max-scaled features,
# so the fitted vectorizer and scaler are presumably needed as well to classify
# a new profile from this file alone -- consider saving them too.
dump(knn, "clf_model.joblib")

['clf_model.joblib']