In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

In [2]:
# Google Sheets "edit" link for the survey responses. Swapping the `/edit?gid=`
# segment for `/export?format=csv&gid=` turns it into the CSV export link that
# `pd.read_csv` is pointed at.
sheet_edit_url = 'https://docs.google.com/spreadsheets/d/1EmeVCJzCiMISggPuS5clv-gz4mznq24NMyFV8aXkbK4/edit?gid=0#gid=0'
dataurl = sheet_edit_url.replace('/edit?gid=', '/export?format=csv&gid=')

# Load the responses and replace every missing value with an empty string.
dataset = pd.read_csv(dataurl).fillna('')
dataset

Unnamed: 0,My Gender,My age,My city of residence,"I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)","About my growing up years (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",My current relationship status,I have children,Ball park of my professional annual income,"How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",Name of the institute I graduated from last.,Educational degree,Anything else that you may like to mention to the community members (any specifics which may be important to you),I give my consent to share my details (barring my contact number) with a potential match.
0,Male,32,Mumbai,Having things that money cant buy.,"Defense upbringing, kept moving cities and sch...","Separated, filed for a divorce",No,Between 40-50 Lakhs,Finance Professional,Certified financial analyst CFA,Masters,Health concious due to health conditions in fa...,I give consent
1,Male,36,New Delhi,"Family,friends",Awesome,Never married,No,Between 30-35 lakhs,Education institute owner,Institute of chartered accountants of India,Masters,I like playing sports,I give consent
2,Male,32,Bangalore,Supportive/understandable/Friendly,"Awesome & challenging, All Good Now.",Never married,No,Between 12-15 Lakhs,Engineer (Interior Design),"RRCE, Bangalore",Masters,No,I give consent
3,Female,33,Gurugram,Being able to love people and be loved by them,We’re full of curiosity and learning I was an ...,Was married earlier,No,Between 18-20 Lakhs,Mental health professional,PGI Rohtak,Phd and equivalent,I take time to open up and quite curious,I give consent
4,Female,32,Mumbai,Being able to choose and make a life for myself,"Coming from a humble background, it has taught...",Never married,No,Between 18-20 Lakhs,Producer in Advertising,XIC,PG Diploma,I believe honest communication and mutual resp...,I give consent
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,Male,37,UDAIPUR,I am privileged to have supporting parent who ...,I am a kind of person who rather takes a decis...,Was married earlier,Yes,Between 24-30 Lakhs,SELF EMPLYED,University of Pune - MIT School of Management,Masters,I am very close to my parents.,I give consent
103,Male,32,Gurgaon,Being able to follow your dreams. Having frien...,I live with my mom. She was my coach growing u...,Never married,No,Between 50-70 Lakhs,Senior Manager at a big 4 Consulting firm,Delhi Technological University,Undergraduate,I respect old school values. I also value bein...,I give consent
104,Female,41,Bengaluru,"As having a supportive ecosystem of friends, m...","Born and raised in Kolkata, but lived in 10+ c...",Was married earlier,No,Between 24-30 Lakhs,Engineer and Management Consult turned Entrepr...,Centre for Creative Leadership (CCL),Masters,I like to lead an active healthy lifestyle. Lo...,I give consent
105,Male,39,Tampa USA,Privilege is not having to worry about money f...,"Very happy childhood with grandparents, immedi...",Never married,No,Over 90 Lakhs,IT pro,Goa University,Undergraduate,I've lived in the US for over a decade so my w...,I give consent


In [None]:
# Free-text survey answers (vectorized with TF-IDF further down).
# NOTE: the second key contains a double space before "(they" -- keep it as is.
text_columns = [
    "I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)",
    "About my growing up years  (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",
    "Anything else that you may like to mention to the community members (any specifics which may be important to you)",
]

# Columns treated as categories (label-encoded further down).
categorical_columns = [
    "My city of residence",
    "My current relationship status",
    "I have children",
    "How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",
    "Name of the institute I graduated from last.",
    "Educational degree",
    'Ball park of my professional annual income',
]

# Columns treated as numbers (standardized further down).
numerical_columns = ['My age']

In [None]:
# Peek at the free-text "anything else" answers.
dataset.loc[:, 'Anything else that you may like to mention to the community members (any specifics which may be important to you)']

In [None]:
# Fit one TF-IDF vectorizer per free-text column. The fitted vectorizer is kept
# (keyed by column name) alongside the matrix it produced for `dataset`.
tfidf_vectorizers = {}
tfidf_features = []
for column in text_columns:
    vectorizer = TfidfVectorizer()
    tfidf_vectorizers[column] = vectorizer
    tfidf_features.append(vectorizer.fit_transform(dataset[column]))



In [None]:
# Fit one LabelEncoder per categorical column and keep each encoded column as
# an (n_rows, 1) array so it can be stacked next to the TF-IDF matrices.
# NOTE(review): LabelEncoder assigns integer codes to the categories, and those
# integers are later fed to a cosine-distance KNN as if they were magnitudes.
# Consider one-hot encoding instead -- TODO confirm the intended behaviour.
label_encoders = {}
encoded_categorical_features = []

for column in categorical_columns:
    le = LabelEncoder()
    label_encoders[column] = le
    encoded_categorical_features.append(le.fit_transform(dataset[column]).reshape(-1, 1))

In [None]:
# Standardize the numerical columns; the fitted scaler is kept for reuse.
scaler = StandardScaler()
scaler.fit(dataset[numerical_columns])
scaled_numerical_features = scaler.transform(dataset[numerical_columns])

In [None]:
# Column order of the combined matrix: TF-IDF blocks, then one column per
# categorical variable, then the scaled numerical block.
feature_blocks = [*tfidf_features, *encoded_categorical_features, scaled_numerical_features]
features = hstack(feature_blocks)

In [None]:
# Show the repr of the combined feature matrix built by the hstack(...) call.
features

In [None]:
# Ask for up to 80 neighbours, but never more than the number of fitted rows:
# `dataset` is read from a live sheet, and `kneighbors` raises a ValueError
# when `n_neighbors` exceeds the number of samples the model was fitted on.
k = min(80, features.shape[0])  # Number of neighbors
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(features)

In [None]:
# Positional index of the respondent we want matches for.
current_row = 9

# Same position in the feature matrix and in the raw table.
selectedRowFeatures = features.getrow(current_row)
selectedRow = dataset.iloc[current_row]
print(selectedRow.values)

In [None]:
# Query the fitted model for the k nearest rows to the selected respondent.
# `selectedRowFeatures` is itself a row of the matrix `knn` was fitted on, so
# the selected respondent is expected to appear in its own neighbour list.
distances, indices = knn.kneighbors(selectedRowFeatures, n_neighbors=k)
print(indices)

In [None]:
# Print every neighbour whose gender differs from the selected respondent's;
# neighbours with the same gender are skipped.
for neighbour_position in indices[0]:
    candidate = dataset.iloc[neighbour_position]
    if candidate['My Gender'] != selectedRow["My Gender"]:
        print(candidate.values)
        print("")
        print("")


In [None]:
# Persist the fitted preprocessing objects next to the model: without them a
# freshly loaded KNN cannot be queried, because new rows have to be transformed
# with the very same vectorizers / encoders / scaler it was fitted with.
joblib.dump(
    {
        'tfidf_vectorizers': tfidf_vectorizers,
        'label_encoders': label_encoders,
        'scaler': scaler,
    },
    'v1_preprocessors.joblib',
)
# Kept last so the cell output (and the existing artifact name) is unchanged.
joblib.dump(knn, 'v1_knn.joblib')

In [None]:
# Display the respondent at position 9 (the same position used as `current_row`).
dataset.iloc[9]

In [None]:
# This cell intentionally does NOT re-create `tfidf_vectorizers`,
# `label_encoders` or `scaler`. New rows must be transformed with the objects
# that were already fitted on `dataset` in the cells above; brand-new
# TfidfVectorizer() / LabelEncoder() / StandardScaler() instances are unfitted
# and fail as soon as `.transform()` is called on them.

# Specify the columns. They must match, in name and order, the columns the
# fitted transformers (and therefore `knn`) were built from.
text_columns = [
    "I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)",
    "About my growing up years  (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",
    "Anything else that you may like to mention to the community members (any specifics which may be important to you)"
]
# "My Gender" is deliberately absent: it is not part of the fitted feature
# matrix, it is only used to filter the neighbours afterwards.
categorical_columns = [
    "My city of residence", "My current relationship status", "I have children",
    "How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",
    "Name of the institute I graduated from last.", "Educational degree", 'Ball park of my professional annual income'
]
numerical_columns = ['My age']
marital_status_column = 'My current relationship status'
income_column = 'Ball park of my professional annual income'
age_column = 'My age'


def get_features_array(new_data, text_weight=1.0, categorical_weight=1.0,
                       marital_status_weight=1.0, age_weight=1.0, income_weight=1.0,
                       vectorizers=None, encoders=None, numeric_scaler=None):
    """Turn raw survey rows into the feature layout the KNN model was fitted on.

    The output columns are, in order: one TF-IDF block per entry of
    ``text_columns``, one integer-coded column per entry of
    ``categorical_columns``, then the scaled ``numerical_columns``. With every
    weight left at 1.0 this mirrors how the training ``features`` matrix is
    assembled, so the result can be passed straight to ``knn.kneighbors``.

    Parameters
    ----------
    new_data : pandas.DataFrame or pandas.Series
        One or more respondents. A Series (e.g. ``dataset.iloc[9]``) is
        treated as a single row.
    text_weight : float
        Multiplier applied to every TF-IDF block.
    categorical_weight : float
        Multiplier applied to every encoded categorical column.
    marital_status_weight, income_weight : float
        Extra multipliers applied (on top of ``categorical_weight``) to the
        relationship-status and income columns respectively. No additional
        column is appended, so the feature width stays equal to the fitted one.
    age_weight : float
        Multiplier applied to the scaled age column.
    vectorizers, encoders, numeric_scaler : optional
        Fitted transformers to use. They default to the notebook-level
        ``tfidf_vectorizers``, ``label_encoders`` and ``scaler``.

    Returns
    -------
    scipy.sparse matrix in CSR format, shape (n_rows, n_features)

    Raises
    ------
    KeyError
        If ``new_data`` lacks one of the required columns.
    ValueError
        If ``new_data`` has no rows. The encoders may also raise it for
        category values they were not fitted on.

    Notes
    -----
    Weights other than 1.0 rescale the query only; the fitted model was built
    from unweighted features.
    """
    # Accept a single row given as a Series by promoting it to a 1-row frame.
    if isinstance(new_data, pd.Series):
        new_data = new_data.to_frame().T

    if len(new_data) == 0:
        raise ValueError("new_data must contain at least one row")

    # Fall back to the transformers fitted earlier in the notebook.
    if vectorizers is None:
        vectorizers = tfidf_vectorizers
    if encoders is None:
        encoders = label_encoders
    if numeric_scaler is None:
        numeric_scaler = scaler

    required_columns = text_columns + categorical_columns + numerical_columns
    missing_columns = [name for name in required_columns if name not in new_data.columns]
    if missing_columns:
        raise KeyError(f"new_data is missing required columns: {missing_columns}")

    # Text: transform each column in one call (all rows at once).
    text_blocks = [
        vectorizers[column].transform(new_data[column].fillna('').astype(str)) * text_weight
        for column in text_columns
    ]

    # Categorical: one (n_rows, 1) column per variable, with optional extra
    # emphasis on relationship status and income.
    extra_weights = {
        marital_status_column: marital_status_weight,
        income_column: income_weight,
    }
    categorical_blocks = []
    for column in categorical_columns:
        codes = np.asarray(encoders[column].transform(new_data[column]), dtype=float)
        column_weight = categorical_weight * extra_weights.get(column, 1.0)
        categorical_blocks.append(codes.reshape(-1, 1) * column_weight)

    # Numerical: scale with the fitted scaler, then weight the age column.
    # np.array(...) copies, so the in-place multiply cannot alias the input.
    numeric_block = np.array(
        numeric_scaler.transform(new_data[numerical_columns].astype(float)),
        dtype=float,
    )
    if age_column in numerical_columns:
        numeric_block[:, numerical_columns.index(age_column)] *= age_weight

    # One horizontal stack of whole-column blocks gives (n_rows, n_features).
    return hstack(text_blocks + categorical_blocks + [numeric_block], format='csr')


In [None]:
# Double brackets keep the selection a one-row DataFrame; `dataset.iloc[9]`
# would be a Series, which has no `.iterrows()`.
get_features_array(dataset.iloc[[9]])

In [None]:
# Display the survey DataFrame as it currently stands in the kernel.
dataset

In [None]:
def testingStuff(dataset):
    # label encoding
    le = LabelEncoder()
    dataset['My Gender'] = le.fit_transform(dataset['My Gender'])
    cityle = LabelEncoder()
    Dataset. list(cityle.fit_transform(dataset['My city of residence']))

    return dataset.head(5)



# Run the label-encoding experiment on the loaded survey data and display
# whatever it returns.
testingStuff(dataset)

In [9]:
data = dataset

# Free-text answers. NOTE: the second key contains a double space before "(they".
text_columns = [
    "I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)",
    "About my growing up years  (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",
    "Anything else that you may like to mention to the community members (any specifics which may be important to you)",
]
# Columns that get an integer code per distinct value.
categorical_columns = [
    "My city of residence",
    "My current relationship status",
    "I have children",
    "How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",
    "Name of the institute I graduated from last.",
    "Educational degree",
    'Ball park of my professional annual income',
]
# Columns that get standardized.
numerical_columns = ['My age']

# Vectorize text columns using TF-IDF: one fitted vectorizer per column,
# stored by column name, plus the matrix each one produced.
tfidf_vectorizers = {}
tfidf_features = []
for column in text_columns:
    vectorizer = TfidfVectorizer()
    tfidf_vectorizers[column] = vectorizer
    tfidf_features.append(vectorizer.fit_transform(data[column]))

# Encode categorical variables using LabelEncoder; each encoded column is
# reshaped to (n_rows, 1) so it can sit beside the TF-IDF matrices.
label_encoders = {}
encoded_categorical_features = []
for column in categorical_columns:
    le = LabelEncoder()
    label_encoders[column] = le
    encoded_categorical_features.append(le.fit_transform(data[column]).reshape(-1, 1))

# Standardize numerical features
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(data[numerical_columns])

# Combine all features into a single feature matrix, in the order:
# text blocks, categorical columns, numerical block.
feature_blocks = tfidf_features + encoded_categorical_features + [scaled_numerical_features]
features = hstack(feature_blocks)

# Print the stored entries of row 9 of the combined matrix.
print(features.getrow(9))

  (0, 2069)	-0.45825806501711636
  (0, 2068)	1.0
  (0, 2067)	1.0
  (0, 2066)	8.0
  (0, 2065)	6.0
  (0, 2062)	2.0
  (0, 2010)	0.4426781815690859
  (0, 1990)	0.4426781815690859
  (0, 1984)	0.1617526973565497
  (0, 1859)	0.4426781815690859
  (0, 1733)	0.2939013533962552
  (0, 1659)	0.23046931029479636
  (0, 1591)	0.3010587324304155
  (0, 1436)	0.36084579882563755
  (0, 1345)	0.1600124625374906
  (0, 1309)	0.15117940639272776
  (0, 1257)	0.11146839139600874
  (0, 1234)	0.11007030467348353
  (0, 1229)	0.2531086804305795
  (0, 1212)	0.1606692312077669
  (0, 1171)	0.2754990621115305
  (0, 1145)	0.2754990621115305
  (0, 1094)	0.2754990621115305
  (0, 1077)	0.2531086804305795
  (0, 1066)	0.2531086804305795
  :	:
  (0, 282)	0.12522232571867786
  (0, 273)	0.13629968451588656
  (0, 245)	0.13629968451588656
  (0, 238)	0.07948907468934761
  (0, 237)	0.13629968451588656
  (0, 228)	0.13629968451588656
  (0, 225)	0.17945149564413537
  (0, 214)	0.13629968451588656
  (0, 203)	0.13629968451588656
  (0, 16