# Adding a New Dating Profile
Using Classification or Clustering for a New Dating Profile

### Importing Libraries and Data

In [75]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabaz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

#### Loading the Profiles

In [2]:
# Loading in the cleaned DF
with open("profiles.pkl",'rb') as fp:
    raw_df = pickle.load(fp)

# Viewing the DF    
raw_df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,5,3,4,1,3,6,7
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,7,9,5,1,9,4,0
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,1,2,6,5,6,5,4
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,5,2,7,8,2,6,6
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,6,6,6,4,3,6,3


#### Loading the Clustered Profiles

In [3]:
# Loading in the clustered DF
with open("clustered_profiles.pkl",'rb') as fp:
    cluster_df = pickle.load(fp)

# Viewing the DF    
cluster_df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,Cluster #
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,5.0,3.0,4.0,1.0,3.0,6.0,7.0,9
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,7.0,9.0,5.0,1.0,9.0,4.0,0.0,9
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,1.0,2.0,6.0,5.0,6.0,5.0,4.0,1
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,5.0,2.0,7.0,8.0,2.0,6.0,6.0,9
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,6.0,6.0,6.0,4.0,3.0,6.0,3.0,10


## Creating the New Profile Data

In [31]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=raw_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['Bios'] = input("Enter a Bio for yourself: ")

# Indexing that new profile data
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.
Enter a Bio for yourself: food lover. social media fanatic. extraordinarily humble. life lover.


### The New Data

In [32]:
new_profile

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6600,food lover. social media fanatic. extraordinarily humble. life lover.,4,9,3,6,6,0,3


# Two Approaches
1. Cluster all the profiles again with the new profile

2. Classify the new profile with a classification model trained on our previously clustered data

## Clustering the New Profile Data

In [35]:
# Appending the new data
new_cluster = raw_df.append(new_profile)

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6596,Avid web junkie. Lifelong alcohol guru. Hardcore reader. Award-winning twitter evangelist.,4,3,6,3,7,7,2
6597,Music ninja. Bacon fanatic. Reader. Total communicator. Unapologetic beer specialist.,1,4,0,4,9,2,5
6598,Communicator. Bacon lover. Award-winning introvert. Amateur internet ninja.,6,2,0,3,8,9,1
6599,Unapologetic tv aficionado. Devoted twitter enthusiast. Typical coffee guru. Falls down a lot.,2,1,8,7,0,5,5
6600,food lover. social media fanatic. extraordinarily humble. life lover.,4,9,3,6,6,0,3


### Scaling

In [37]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['Bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('Bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))

  return self.partial_fit(X, y)


Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,0.555556,0.333333,0.444444,0.111111,0.333333,0.666667,0.777778
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,0.777778,1.000000,0.555556,0.111111,1.000000,0.444444,0.000000
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,0.111111,0.222222,0.666667,0.555556,0.666667,0.555556,0.444444
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,0.555556,0.222222,0.777778,0.888889,0.222222,0.666667,0.666667
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,0.666667,0.666667,0.666667,0.444444,0.333333,0.666667,0.333333
5,Pop culture junkie. Tv buff. Reader. Friendly travel expert. Incurable twitteraholic. Social media ninja. Coffee enthusiast. Internet specialist.,0.000000,0.555556,0.777778,0.555556,1.000000,0.222222,0.000000
6,Typical thinker. Amateur explorer. Reader. Extreme student. Tv fanatic. Social media ninja.,0.555556,0.444444,0.777778,0.555556,0.000000,0.888889,0.444444
7,Zombie maven. Travel geek. Professional social media buff. Avid pop culture lover.,1.000000,0.888889,0.888889,0.222222,0.000000,0.000000,0.555556
8,Lifelong introvert. General travel maven. Hipster-friendly web trailblazer. Writer. Alcohol fan. Student. Communicator. Coffee guru.,0.888889,0.888889,0.222222,0.666667,0.111111,0.222222,1.000000
9,Travel ninja. Amateur pop culture evangelist. Web fanatic. Freelance communicator. Zombie geek.,1.000000,0.000000,0.222222,0.222222,0.555556,0.666667,0.888889


### Vectorizing

In [38]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('Bios', axis=1, inplace=True)

### PCA

In [39]:
from sklearn.decomposition import PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
pca.explained_variance_ratio_.cumsum()[-1]

0.9898433044921403

### Performing Hierarchical Agglomerative Clustering
- First finding the optimum number of clusters

In [42]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
ch_scores = []

s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists
    ch_scores.append(calinski_harabaz_score(df_pca, cluster_assignments))
    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) /




  score = (intra_dists[:, None] + intra_dists) / centroid_distances


### Helper Function to Evaluate the Clusters

In [43]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])

### Evaluation of Clusters

In [45]:
print("The Calinski-Harabasz Score (find max score):")
cluster_eval(ch_scores, cluster_cnt)

print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)

The Calinski-Harabasz Score (find highest score):
Max Value:
Cluster #    Cluster Score
2     120.008244

Min Value:
Cluster #     Cluster Score
19      57.657188

The Silhouette Coefficient Score (find highest score):
Max Value:
Cluster #     Cluster Score
12        0.03048

Min Value:
Cluster #    Cluster Score
2        0.01585

The Davies-Bouldin Score (find lowest score):
Max Value:
Cluster #     Cluster Score
19       5.449615

Min Value:
Cluster #     Cluster Score
12       4.006027


### Running HAC
Again but with the optimum cluster count

In [46]:
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['Bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('Bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


### Finding the Exact Cluster for our New Profile

In [66]:
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6,Typical thinker. Amateur explorer. Reader. Extreme student. Tv fanatic. Social media ninja.,5.0,4.0,7.0,5.0,0.0,8.0,4.0
14,Travel buff. Student. Alcohol trailblazer. Passionate social media ninja. Entrepreneur. Incurable writer.,0.0,4.0,3.0,9.0,6.0,4.0,5.0
17,Beer practitioner. Coffee expert. Tv guru. Social media junkie. Bacon enthusiast. General student. Food nerd.,1.0,4.0,8.0,1.0,3.0,5.0,3.0
46,Explorer. Social media buff. Evil bacon nerd. Friendly zombie evangelist. Typical travel advocate.,4.0,9.0,7.0,6.0,0.0,6.0,2.0
52,Problem solver. Internet evangelist. Devoted social media fanatic. Passionate twitter geek.,3.0,6.0,8.0,9.0,3.0,1.0,3.0
74,Hardcore beer practitioner. Typical bacon evangelist. Certified social media buff.,3.0,7.0,8.0,2.0,1.0,0.0,7.0
85,Freelance social media nerd. Internet ninja. Introvert. Beer evangelist. Alcohol enthusiast. Bacon geek. Troublemaker.,2.0,8.0,2.0,5.0,4.0,9.0,1.0
88,Typical bacon aficionado. Internet guru. Food lover. Music enthusiast. Twitter geek. Social media ninja.,8.0,0.0,8.0,1.0,4.0,3.0,5.0
89,Introvert. Freelance music expert. Devoted social media advocate. Evil reader. Zombie fanatic. Professional analyst.,4.0,6.0,6.0,3.0,4.0,0.0,6.0
97,Thinker. Lifelong travelaholic. Alcohol enthusiast. Incurable social media lover. Communicator.,5.0,8.0,2.0,8.0,0.0,9.0,8.0


### Vectorizing the Selected Cluster

In [67]:
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v).drop('Bios', axis=1)

### Finding Correlation for Top 10 Similar Profiles to the New Profile

In [73]:
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

### The Top 10 Profiles most likely to Match with the New Profile
(Sorted by descending similarity)

In [72]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
4703,Devoted reader. Bacon aficionado. Lifelong internet specialist. Food fan. Extreme twitter buff. Friendly coffee enthusiast. Social media lover.,6,9,1,9,9,1,7
4931,Coffee junkie. Social media ninja. Typical twitter specialist. Tvaholic. Student.,6,8,1,5,5,0,1
4362,Total bacon fanatic. Professional troublemaker. Proud pop culture lover. Hipster-friendly social media evangelist.,9,9,3,9,7,2,1
2718,General beer advocate. Hipster-friendly introvert. Social media nerd. Gamer. Alcohol geek. Professional writer.,2,9,1,6,8,1,2
367,Organizer. Professional alcoholaholic. Hipster-friendly social media fanatic. Total zombie evangelist. Gamer.,4,9,1,9,8,0,0
4553,Alcohol trailblazer. Passionate creator. Typical social media junkie. Avid zombie fanatic. Gamer.,3,8,5,5,9,2,2
3711,Internet lover. General social media advocate. Hardcore music maven. Web buff. Devoted tv fan. Zombie expert.,6,8,5,6,8,0,8
5480,Internet expert. Social media scholar. Hipster-friendly zombie maven. Amateur tv buff.,5,9,3,9,7,5,5
4814,Entrepreneur. Certified foodaholic. Unapologetic thinker. Incurable travel guru. Wannabe alcohol buff.,4,9,3,6,7,4,6
1823,Devoted reader. Alcoholaholic. Coffee nerd. Evil organizer. Analyst. Passionate troublemaker.,5,8,5,9,9,0,9


## Classification of the New Profile

### Importing the Different Classification Models

In [74]:
# Importing the 10 models
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## Preventing error from occuring: XGBoost causes kernel to die.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

### Splitting the Data

In [76]:
X = cluster_df.drop(["Cluster #"], 1)
y = cluster_df['Cluster #']

# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Creating Model Pipelines

In [None]:
# Adaboost
pipe_ada = Pipeline([('scale', MinMaxScaler()), ('clf', AdaBoostClassifier())])

# Gradient Boost
pipe_gb  = Pipeline([('scale', MinMaxScaler()), ('clf', GradientBoostingClassifier())])

# Random Forest
pipe_rf  = Pipeline([('scale', MinMaxScaler()), ('clf', RandomForestClassifier())])

# Decision Tree
pipe_dt  = Pipeline([('scale', MinMaxScaler()), ('clf', DecisionTreeClassifier())])

# Dummy (Baseline)
pipe_dum = Pipeline([('scale', MinMaxScaler()), ('clf', DummyClassifier())])

# K Nearest Neighbors
pipe_knn = Pipeline([('scale', MinMaxScaler()), ('clf', KNeighborsClassifier())])

# Logistic Regression
pipe_lr  = Pipeline([('scale', MinMaxScaler()), ('clf', LogisticRegression())])

# Naive Bayes
pipe_nb  = Pipeline([('scale', MinMaxScaler()), ('clf', GaussianNB())])

# Support Vector Machine
pipe_svm = Pipeline([('scale', MinMaxScaler()), ('clf', SVC())])

# XGBoost
pipe_xgb = Pipeline([('scale', MinMaxScaler()), ('clf', XGBClassifier())])

# Creating a list for pipelines and model names
pipelines = [pipe_ada, pipe_gb, pipe_rf, pipe_dt, pipe_dum, 
             pipe_knn, pipe_lr, pipe_nb, pipe_svm, pipe_xgb]

models = ['Adaboost', 
          'GradientBoost', 
          'RandomForest', 
          'DecisionTree', 
          'Dummy(Baseline)', 
          'KNN', 
          'LogisticRegression',
          'NaiveBayes',
          'SupportVectorMachine',
          'XGBoost']

# Zipping the the strings and pipelines together and creating a dictionary
model_pipelines = dict(zip(models, pipelines))