In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

In [2]:
# Google Sheets "edit" link for the survey responses; swapping the path for the
# CSV export endpoint lets pandas read the sheet directly.
dataurl = (
    'https://docs.google.com/spreadsheets/d/1EmeVCJzCiMISggPuS5clv-gz4mznq24NMyFV8aXkbK4/edit?gid=0#gid=0'
).replace('/edit?gid=', '/export?format=csv&gid=')

# Missing answers are replaced with empty strings.
dataset = pd.read_csv(dataurl).fillna('')
dataset

Unnamed: 0,My Gender,My age,My city of residence,"I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)","About my growing up years (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",My current relationship status,I have children,Ball park of my professional annual income,"How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",Name of the institute I graduated from last.,Educational degree,Anything else that you may like to mention to the community members (any specifics which may be important to you),Barring my personal details (my name and contact number) shared details can be viewed by andwemet community members.,My religious inclination
0,Male,32,Mumbai,Having things that money cant buy.,"Defense upbringing, kept moving cities and sch...","Separated, filed for a divorce",No,Between 40-50 Lakhs,Finance Professional,Certified financial analyst CFA,Masters,Health concious due to health conditions in fa...,I give consent,
1,Male,36,New Delhi,"Family,friends",Awesome,Never married,No,Between 30-35 lakhs,Education institute owner,Institute of chartered accountants of India,Masters,I like playing sports,I give consent,
2,Male,32,Bangalore,Supportive/understandable/Friendly,"Awesome & challenging, All Good Now.",Never married,No,Between 12-15 Lakhs,Engineer (Interior Design),"RRCE, Bangalore",Masters,No,I give consent,
3,Female,33,Gurugram,Being able to love people and be loved by them,We’re full of curiosity and learning I was an ...,Was married earlier,No,Between 18-20 Lakhs,Mental health professional,PGI Rohtak,Phd and equivalent,I take time to open up and quite curious,I give consent,
4,Female,32,Mumbai,Being able to choose and make a life for myself,"Coming from a humble background, it has taught...",Never married,No,Between 18-20 Lakhs,Producer in Advertising,XIC,PG Diploma,I believe honest communication and mutual resp...,I give consent,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,Male,38,New Delhi,"having a supportive family, a dependable partn...","finding mysef, working hard and learnings for ...",Never married,No,Between 40-50 Lakhs,Public Policy Professional,Heriot Watt University,Masters,"simple living, high thinking; ambitious, hardw...",I give consent,Hindu
129,Male,32,Bangalore,Supportive family.,My life is bit challenging,Never married,No,Between 12-15 Lakhs,It professional,Jntu,Undergraduate,No,I give consent,Hindu
130,Male,34,Gurgaon,"Having food, roof and family and friends",I was very shy till school but changed drastic...,Never married,No,Between 50-70 Lakhs,Management Professional (Growth and strategy o...,IIM Kozhikode,Masters,I have had surgery in my hip joints bit lead a...,I give consent,Hindu
131,Female,34,"Chicago, IL",Having open communication with family and frie...,"I love my family, whatever they could do from ...",Never married,No,Between 35-40 Lakhs,Business Analyst,"Lewis university, Romeoville, IL",Masters,I came from a community where they have to che...,I give consent,Hindu


In [23]:
# Columns vectorized with TF-IDF (one vectorizer per column).
text_columns = [
    "I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)",
    "About my growing up years  (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",
    "Anything else that you may like to mention to the community members (any specifics which may be important to you)",
    "How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",
    "Name of the institute I graduated from last.",
    "Educational degree",
]

# Columns turned into integer codes with a LabelEncoder.
categorical_columns = [
    "My city of residence",
    "My current relationship status",
    "I have children",
    "Ball park of my professional annual income",
]

# Columns standardized with a StandardScaler.
numerical_columns = ["My age"]

In [None]:
# Inspect the raw free-text answers of the "anything else" question.
dataset.loc[:, 'Anything else that you may like to mention to the community members (any specifics which may be important to you)']

In [26]:
# Per-feature multipliers, applied after encoding / scaling.
city_weight = 1.5          # multiplies the label-encoded city column
relationship_status = 2.0  # multiplies the label-encoded relationship-status column
age_weight = 3.0           # multiplies the standardized age column



In [27]:
# One independent TF-IDF vectorizer per text column, keyed by column name.
tfidf_vectorizers = {column: TfidfVectorizer() for column in text_columns}

# Fit each vectorizer on its own column and keep the resulting sparse
# document-term matrices in the same order as `text_columns`.
tfidf_features = [
    tfidf_vectorizers[column].fit_transform(dataset[column])
    for column in text_columns
]



In [28]:
# Columns whose integer codes are rescaled; every other column keeps its raw codes.
category_weights = {
    'My city of residence': city_weight,
    'My current relationship status': relationship_status,
}

label_encoders = {}
encoded_categorical_features = []

# NOTE(review): LabelEncoder assigns integer codes by sorted class order, so the
# numeric distance between two codes carries no meaning — consider one-hot
# encoding for these nominal columns.
for column in categorical_columns:
    encoder = LabelEncoder()
    codes = encoder.fit_transform(dataset[column]).reshape(-1, 1)  # column vector
    label_encoders[column] = encoder
    if column in category_weights:
        codes = codes * category_weights[column]
    encoded_categorical_features.append(codes)


In [29]:
# Standardize the numerical columns, then scale column 0 (age, the only entry
# in `numerical_columns`) by its weight.
numeric_frame = dataset[numerical_columns]
scaler = StandardScaler()
scaled_numerical_features = scaler.fit(numeric_frame).transform(numeric_frame)
scaled_numerical_features[:, 0] *= age_weight

In [30]:
# Column-wise concatenation: TF-IDF blocks first, then the encoded categorical
# columns, then the scaled numerical block.
feature_blocks = [*tfidf_features, *encoded_categorical_features, scaled_numerical_features]
features = hstack(feature_blocks)

In [None]:
# Display the combined feature matrix built in the previous cell.
features

In [31]:
# Neighbour count used both when building the index and when querying it.
k = 80
# Cosine distance over the combined feature rows.
knn = NearestNeighbors(metric='cosine', n_neighbors=k)
knn.fit(features)

In [36]:
# Positional index of the profile to find matches for.
current_row = 73
selectedRowFeatures = features.getrow(current_row)  # its feature vector
selectedRow = dataset.iloc[current_row]             # its raw answers
print(selectedRow.to_numpy())

['Male' 39 'Mumbai' 'Freedom of thoughts'
 'Lot of introspection, retrospection & corrections in life. Resolve & Evolve.'
 'Never married' 'No' 'Between 30-35 lakhs'
 "I have a professional swimming training academy. We have presence all over Mumbai & growing. I'm also into photography."
 'Mumbai University' 'High School' 'Life is Beautiful' 'I give consent' '']


In [37]:
# Query the index with the selected profile's own feature row; the result holds
# one row of distances and one row of dataset positions.
distances, indices = knn.kneighbors(
    X=selectedRowFeatures,
    n_neighbors=k,
    return_distance=True,
)
print(indices)

[[ 73 128 119   1 105  77 114 101  45  83 121  14 122  44 132  56  28  91
  102   0  21   4  53  67  82  46  43  35 117  66 123  49 125  42  37 127
   92 116  61  38  95 120  62 108  86  55  13  88  63  80  27  79  74 112
  130  94  31  60 118 113   6 103  20   3  23 126  32 107  78  12  59 124
  111 106  72  50  10  81  65  52]]


In [38]:
# Print only the neighbours whose gender differs from the selected profile's,
# each followed by two blank lines.
own_gender = selectedRow["My Gender"]
for neighbor_idx in indices[0]:
    candidate = dataset.iloc[neighbor_idx]
    if candidate['My Gender'] != own_gender:
        print(candidate.values, end="\n\n\n")


['Female' 46 'Noida extension '
 'I have a very supportive family, amazing friends. I am passionate person and love my job . I like  to connect with people for a meaningful conversation. '
 'Growing up I had an amazing childhood loving parents and extended family. I had an amazing childhood friends in school and in the society. I loved playing badminton and table tennis. I used to participate in school cultural activities. Have a very fond memories from school days . '
 'Never married' 'No' 'Between 30-35 lakhs'
 'I did electronics engineering  from pune. I work as a program manager for an IT MNC . I was in USA close to 7yrs.i came to India during Covid in December 2020  '
 'Dr. DYPatil institute of engineering ' 'Undergraduate'
 "\n I am active, spiritual, positive, dreamer, easygoing yet motivated, intellectually curious and passionate about what I do. I enjoy both indoor and outdoor activities be it listening to music, reading or hanging out with friends, eating out or traveling. I 

In [39]:
# Persist the fitted neighbour index.
# NOTE(review): only `knn` is saved in this cell; the fitted vectorizers,
# encoders and scaler would also be needed to featurize new profiles — confirm
# they are persisted elsewhere.
joblib.dump(value=knn, filename='v1_knn.joblib')

['v1_knn.joblib']

In [None]:
# Inspect the profile stored at positional index 9.
dataset.iloc[9]

In [None]:
# Specify the columns
text_columns = [
    "I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)",
    "About my growing up years  (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",
    # Must match the dataset header exactly (it includes "to the community members").
    "Anything else that you may like to mention to the community members (any specifics which may be important to you)"
]
categorical_columns = [
    "My Gender", "My city of residence", "My current relationship status", "I have children",
    "How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",
    "Name of the institute I graduated from last.", "Educational degree", 'Ball park of my professional annual income'
]
numerical_columns = ['My age']
marital_status_column = 'My current relationship status'

# Fit one transformer per configured column on `dataset`, so the objects below
# really are trained and every column listed above has a matching entry.
# NOTE(review): these column sets differ from the ones used to build the
# `features` matrix that `knn` was fitted on; vectors produced with these
# transformers are only comparable with a model fitted on the same columns.
tfidf_vectorizers = {
    column: TfidfVectorizer().fit(dataset[column]) for column in text_columns
}  # Dictionary of trained TF-IDF vectorizers
label_encoders = {
    column: LabelEncoder().fit(dataset[column]) for column in categorical_columns
}  # Dictionary of trained LabelEncoders
scaler = StandardScaler().fit(dataset[numerical_columns])  # Trained StandardScaler

def get_features_array(new_data):
    """Build the feature matrix for one or more profiles.

    Applies the module-level fitted transformers (``tfidf_vectorizers``,
    ``label_encoders``, ``scaler``) to ``new_data`` and concatenates the
    resulting blocks column-wise in this order: TF-IDF text blocks,
    label-encoded categorical columns, scaled numerical columns. The weights
    mirror the ones applied when the training matrix is built: ``city_weight``
    for the city codes, ``relationship_status`` for the relationship-status
    codes and ``age_weight`` for the first numerical column.

    Parameters
    ----------
    new_data : pandas.DataFrame or pandas.Series
        Profiles to featurize. A Series (for example ``dataset.iloc[9]``) is
        treated as a single profile.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per input profile, one column per feature.

    Raises
    ------
    KeyError
        If a configured column is missing from ``new_data`` or has no
        transformer registered.
    ValueError
        If a categorical value was not seen when its LabelEncoder was fitted.
    """
    # A single profile arrives as a Series; promote it to a one-row frame so
    # every transformer receives column-shaped input.
    if isinstance(new_data, pd.Series):
        new_data = new_data.to_frame().T

    # Categorical columns whose codes are rescaled; the rest keep raw codes.
    column_weights = {
        'My city of residence': city_weight,
        marital_status_column: relationship_status,
    }

    # Preprocess text columns
    tfidf_blocks = [
        tfidf_vectorizers[column].transform(new_data[column])
        for column in text_columns
    ]

    # Encode categorical variables (relationship status is handled here once,
    # through its entry in `column_weights`).
    categorical_blocks = []
    for column in categorical_columns:
        codes = label_encoders[column].transform(new_data[column]).reshape(-1, 1)
        weight = column_weights.get(column)
        categorical_blocks.append(codes if weight is None else codes * weight)

    # Standardize numerical features; the cast undoes the object dtype that a
    # promoted Series carries.
    numerical_block = scaler.transform(new_data[numerical_columns].astype(float))
    numerical_block[:, 0] *= age_weight  # Scale age

    # Combine all blocks side by side; rows stay aligned with the input rows.
    return hstack(tfidf_blocks + categorical_blocks + [numerical_block], format='csr')


In [None]:
# Select row 9 as a one-row DataFrame (iloc[9] would yield a Series, which
# lacks DataFrame.iterrows).
get_features_array(dataset.iloc[[9]])

In [None]:
# Display the dataset as it currently stands.
dataset

In [None]:
def testingStuff(dataset):
    """Label-encode the gender and city columns and preview the result.

    The encoding is applied to a copy, so the frame passed in keeps its
    original values and later cells still see the raw answers.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'My Gender' and 'My city of residence'.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the encoded copy.
    """
    encoded = dataset.copy()

    # label encoding — each column gets its own encoder and code space
    gender_encoder = LabelEncoder()
    encoded['My Gender'] = gender_encoder.fit_transform(encoded['My Gender'])

    # The original statement here (`Dataset. list(...)`) raised NameError;
    # assumed intent is to store the encoded city codes — confirm.
    city_encoder = LabelEncoder()
    encoded['My city of residence'] = city_encoder.fit_transform(encoded['My city of residence'])

    return encoded.head(5)



# Run the encoding experiment on the loaded dataset and display what it returns.
testingStuff(dataset)

In [40]:
# `data` is an alias of `dataset` (same object, no copy).
data = dataset

# Columns vectorized with TF-IDF.
text_columns = [
    "I define privilege to be (having a supportive family, dependable friends, a job you love, anything else)",
    "About my growing up years  (they were awesome, they were challenging but all is good now, about the family, about friends, memories etc etc)",
    "Anything else that you may like to mention to the community members (any specifics which may be important to you)",
]

# Columns turned into integer codes with a LabelEncoder.
categorical_columns = [
    "My city of residence",
    "My current relationship status",
    "I have children",
    "How do I introduce the professional me (banker, IT professional, entrepreneur, environmentalist)",
    "Name of the institute I graduated from last.",
    "Educational degree",
    "Ball park of my professional annual income",
]

# Columns standardized with a StandardScaler.
numerical_columns = ["My age"]

# Per-feature multipliers, applied after encoding / scaling.
city_weight = 1.5
relationship_status = 2.0
age_weight = 3.0


# TF-IDF: one independent vectorizer per text column, keyed by column name.
tfidf_vectorizers = {column: TfidfVectorizer() for column in text_columns}

# Fit each vectorizer on its own column; matrices follow `text_columns` order.
tfidf_features = [
    tfidf_vectorizers[column].fit_transform(data[column])
    for column in text_columns
]

# Columns whose integer codes are rescaled; every other column keeps its raw codes.
category_weights = {
    'My city of residence': city_weight,
    'My current relationship status': relationship_status,
}

label_encoders = {}
encoded_categorical_features = []

for column in categorical_columns:
    encoder = LabelEncoder()
    codes = encoder.fit_transform(dataset[column]).reshape(-1, 1)  # column vector
    label_encoders[column] = encoder
    if column in category_weights:
        codes = codes * category_weights[column]
    encoded_categorical_features.append(codes)


# Standardize the numerical columns, then scale column 0 (age) by its weight.
numeric_values = dataset[numerical_columns]
scaler = StandardScaler()
scaled_numerical_features = scaler.fit(numeric_values).transform(numeric_values)
scaled_numerical_features[:, 0] *= age_weight

# Combine all features into a single feature matrix: text blocks, then the
# categorical columns, then the numerical block.
all_blocks = [*tfidf_features, *encoded_categorical_features, scaled_numerical_features]
features = hstack(all_blocks)

# Sanity check: show the non-zero entries of the row for profile 9.
print(features.getrow(9))

  (0, 2293)	-1.430730450405193
  (0, 2292)	1.0
  (0, 2291)	1.0
  (0, 2290)	8.0
  (0, 2289)	6.0
  (0, 2286)	3.0
  (0, 2231)	0.4423661466593878
  (0, 2207)	0.4423661466593878
  (0, 2201)	0.15885828560503912
  (0, 2056)	0.4423661466593878
  (0, 1920)	0.29401709549704774
  (0, 1842)	0.23325149356090902
  (0, 1767)	0.3003764753420119
  (0, 1597)	0.3642669340900622
  (0, 1495)	0.15469933438972563
  (0, 1457)	0.1482486547146791
  (0, 1397)	0.11104905969236596
  (0, 1371)	0.10989487174047914
  (0, 1366)	0.24212413749371647
  (0, 1346)	0.16140380621127012
  (0, 1303)	0.2793237325160296
  (0, 1273)	0.2793237325160296
  (0, 1214)	0.2793237325160296
  (0, 1196)	0.257563364385963
  (0, 1185)	0.257563364385963
  :	:
  (0, 305)	0.1261454380679655
  (0, 296)	0.13680289774524343
  (0, 261)	0.13680289774524343
  (0, 254)	0.07904988307982866
  (0, 253)	0.13680289774524343
  (0, 243)	0.13680289774524343
  (0, 240)	0.16768423890946424
  (0, 227)	0.13680289774524343
  (0, 215)	0.13680289774524343
  (0, 179)