/
model_retrain.py
119 lines (101 loc) · 5.18 KB
/
model_retrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import numpy as np
import user_sim
import ast
import re
class model_retrain(object):
def __init__(self,final_dict,all_user_features):
model, res = self.model_retrain_test(final_dict)
model.to_csv('model_weights.csv')
res.to_csv('model_accuracy.csv')
self.recom = user_sim.recom_user()
self.user_sim_table = self.update_user_recom_table(all_user_features, model)
self.user_sim_table.to_csv('user_sim_table.csv')
def model_retrain_test(self,data):
"""Computes Logistic Regression to initially calculate the total similarity of each pair of users.
Independent variables of LR model are similarity scores of each feature, dependent variable is
match feature in dictionary calculated in user_sim.py.
Args: (dictionary) pair_wise dictionary calcualted in user_sim.py
Returns: (pandas.dataframe) model parameters, weight of each feature to calculate the total similarity
score in next step
(pandas.dataframe) model results
"""
for elem in data:
score = elem['sim_score']
key_list = score.keys()
for key in key_list:
elem[key] = score[key]
elem.pop('sim_score')
pair_df = pd.DataFrame(data)
col_name = list(pair_df.columns)[3:]
# Clean the pairwise data
process_pair_df = pair_df.dropna(thresh = 6)
process_pair_df = process_pair_df.fillna(0)
process_pair_df['match'] = pair_df['match'].astype(int)
x = process_pair_df.iloc[:, 3:].to_numpy()
y = np.array(process_pair_df['match'])
# Utilize LR model to get the weight of each similarity features
# Dependent variables are users' match
# Independent variables are users' similarity scores
model = LogisticRegression(solver='liblinear', multi_class='ovr')
kfold = model_selection.KFold(n_splits=10)
cv = model_selection.cross_validate(model, x, y, cv=kfold, scoring='accuracy', return_estimator=True)
cv_results = cv['test_score']
best_model = cv['estimator'][np.argmax(cv_results)]
# Generate results df
results_table = pd.DataFrame()
results_table['accuracy'] = cv_results
results_table['timestamp'] = pd.Timestamp.today()
best_model.fit(x,y)
model_para = np.absolute(best_model.coef_[0])
model_para = model_para / model_para.sum()
model_para = pd.DataFrame(zip(col_name, model_para), columns = ['features','weights'])
model_para['timestamp'] = pd.Timestamp.today()
return model_para, results_table
# Periodically update the recommendation score table
# Calculate total similarity score of each user by using the weights in LR model
def update_user_recom_table(self,data, model):
"""Computes algorithm to update the recommendation table.
Args: (pandas.dataframe) users features dataset
(pandas.dataframe) weights of each feature to calculate the total score
Returns: (pandas.dataframe) the recommendation table for all users
"""
results = []
count = 0
user_list = data.index.to_list()
weight = np.array(model['weights'])
for user1 in user_list:
sim_user = []
for user2 in user_list:
if user1 != user2:
count += 1
if count%100000 == 0:
print(count)
compare = self.recom.compare(data.loc[user1,:], data.loc[user2,:])
compare = {key:0 if str(value) in ['nan','None','NaN'] else value for key,value in compare.items()}
# Calculate total similarity score of each user by using the weights in LR model
score = np.dot(np.array(list(compare.values())),weight)
sim_user.append({'user': user1, 'sim_user': user2, 'sim_score':score})
# Users with top 10 total similarity score will be recommended
sim_user = sorted(sim_user, key = lambda x: x['sim_score'], reverse = True)[:10] # Could be 50
results.extend(sim_user)
results = pd.DataFrame(results)
return results
def clean_features(self,all_user_features):
# Clean the text data
all_user_features[['project_list']] = all_user_features[['project_list']].applymap(self.literal_return)
clean_list = ['company','job_title','education','major','university']
for l in clean_list:
all_user_features[l] = all_user_features[l].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)).lower())
return all_user_features
def literal_return(self,val):
# Turn the string into list syntax
try:
return ast.literal_eval(val)
except (ValueError, SyntaxError) as e:
return None
final_dict = pd.read_pickle('pairwise_compare_dict.pickle')
all_user_features = pd.read_csv('fake_all_user_features.csv',index_col=0)
retrain = model_retrain(final_dict, all_user_features)