In [13]:
import random
from collections import Counter

from tqdm.notebook import tqdm

In [14]:
def get_product_data(filename='data_pos_neg_neu.txt',
                     product_map_file='product_id_map.txt'):
    '''
    Load the product feedback records and the product-id mapping.

    Every line of `filename` is evaluated as a Python expression and the
    results are collected in file order. Reading stops at the first line
    that cannot be evaluated: the line number and text are printed and the
    records read so far are still returned.

    input:
        filename: text file with one feedback record per line
        product_map_file: text file whose first line holds the mapping
    output:
        data: list of evaluated records (presumably dicts with
              'pos'/'neg'/'neu' lists -- confirm against the data file)
        product_id_map: object evaluated from the first line of
              product_map_file (mapping between product name and id)
    '''
    data = []
    # Read as UTF-8 explicitly, like the other loaders in this notebook,
    # so the result does not depend on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                # NOTE(review): eval() executes arbitrary code found in the
                # file. Only use on trusted files; consider
                # ast.literal_eval if the records are plain literals.
                d = eval(line)
            except Exception:
                # Exception (not a bare except) so that Ctrl-C / SystemExit
                # are not swallowed while reading a large file.
                print("*****Error*****")
                print(i,line)
                print("*********END of ERROR *******")
                break
            else:
                data.append(d)
    with open(product_map_file, 'r', encoding='utf-8') as map_file:
        # NOTE(review): same eval() caveat as above.
        product_id_map = eval(map_file.readline())
    return data, product_id_map
def get_user_data(user_data_file = 'data_userInfo.txt',
              user_data_map_file = 'user_data_map.txt'):
    '''
    Load 1. the user info records
         2. the mapping used to transform them into features

    input:
        user_data_file: text file with one record per line; each line is
            evaluated as a Python expression
        user_data_map_file: text file with one "<key>#<dict>" entry per
            line (the layout written by save_user_map)
    output:
        data_userInfo: list of evaluated records, in file order
        user_data_map: dict mapping each key to its evaluated dict
    '''
    data_userInfo =[]
    with open(user_data_file, 'r', encoding='utf-8') as input_file:
        for line in tqdm(input_file):
            # NOTE(review): eval() executes arbitrary code found in the
            # file. Only use on trusted files.
            data_userInfo.append(eval(line))
    user_data_map ={}
    with open(user_data_map_file, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            if not line.strip():
                # Tolerate blank (e.g. trailing) lines instead of failing
                # on the unpacking below.
                continue
            # Split on the FIRST '#' only: the serialized dict may itself
            # contain '#' (for example an answer such as 'C#').
            key,dic = line.split('#', 1)
            # NOTE(review): same eval() caveat as above.
            user_data_map[key] = eval(dic)
    return data_userInfo,user_data_map

In [15]:
def save_user_map(user_map,file_name = 'user_data_map.txt'):
    '''
    Write the user-data mapping to a text file, one "<key>#<json>" line
    per entry.

    input:
        user_map: dict whose values are JSON-serializable
        file_name: output path (overwritten)

    Stops at the first entry that cannot be written and prints the
    offending key; entries written before it are kept in the file.
    '''
    import json
    with open(file_name, 'w', encoding='utf-8') as output_file:
        for key in tqdm(user_map):
            try:
                # Serialize from the `user_map` argument (not a notebook
                # global) and build the whole line first, so a failure
                # never leaves a half-written line behind.
                payload = json.dumps(user_map[key])
                output_file.write(f'{key}#{payload}\n')
            except Exception:
                print("ERROR")
                print(f"key: {key}, values: {user_map[key]}")
                break
        print('done!')

In [16]:
 def transform_interaction_data(data, product_map=None):
    '''
    Flatten the positive / negative / neutral feedback into a list of
    [user_id, product_id, rating] interactions.

    input:
        data: list of samples; each sample provides 'pos', 'neg' and 'neu'
              iterables of product names
        product_map: mapping product name -> product id; when omitted the
              notebook-level `product_id_map` is used
    output:
        list of [user_id, product_id, rating] where user_id is the 1-based
        position of the sample in `data` and rating is
        5 (positive), -5 (negative) or 1 (neutral)
    '''
    if product_map is None:
        # Backward compatible default: the mapping loaded by the notebook.
        product_map = product_id_map
    # Order matters only for the order of the emitted rows: pos, neg, neu.
    rating_by_bucket = (('pos', 5), ('neg', -5), ('neu', 1))
    transformed = []
    for user_id, sample in enumerate(data, start=1):
        for bucket, rating in rating_by_bucket:
            for product in sample[bucket]:
                transformed.append([user_id, product_map[product], rating])
    return transformed

In [17]:
def transform_userInfo(sample,tran_map,exclude_list= [],range_map = False):
    '''
    Encode one user-info record as a flat feature list.

    input:
        sample: record indexed by category name
        tran_map: category -> transformation info; {'min', 'max'} bounds
                  for the compensation / years categories, answer -> index
                  for every other category
        exclude_list: categories of tran_map to leave out (each must be a
                  key of tran_map)
        range_map: when True also return the feature span of each category
    output:
        feature, or (feature, {category: [start, end]}) when range_map
    '''
    categories = list(tran_map.keys())
    for excluded in exclude_list:
        categories.remove(excluded)

    compensation_fields = ('ConvertedCompYearly', 'CompTotal')
    years_fields = ('YearsCode', 'YearsCodePro')

    feature = []
    spans = {}
    span_start = 0
    for cat in categories:
        if cat in compensation_fields:
            # Numeric value clipped into [min, max]; -1 marks "missing"
            # and is passed through untouched.
            amount = sample[cat]
            assert isinstance(amount, (float, int))
            if amount != -1:
                if amount < tran_map[cat]['min']:
                    amount = tran_map[cat]['min']
                elif amount > tran_map[cat]['max']:
                    amount = tran_map[cat]['max']
            else:
                amount = -1
            feature.append(amount)
        elif cat in years_fields:
            # Value may arrive wrapped in a list; the two textual answers
            # map to the configured bounds.
            raw = sample[cat]
            years = raw[0] if isinstance(raw, list) else raw
            if years != -1:
                if years == 'Less than 1 year':
                    years = tran_map[cat]['min']
                elif years == 'More than 50 years' or int(years) > tran_map[cat]['max']:
                    years = tran_map[cat]['max']
            feature.append(int(years))
        else:
            # Multi-hot encoding over the known answers; all zeros when
            # the answer is missing (-1).
            one_hot = [0] * len(tran_map[cat])
            answers = sample[cat]
            if answers != -1:
                for ans in answers:
                    one_hot[tran_map[cat][ans]] = 1
            feature.extend(one_hot)
        if range_map:
            spans[cat] = [span_start, len(feature)]
            span_start = len(feature)
    return (feature, spans) if range_map else feature

In [18]:
#sample_fea,range_map =transform_userInfo(user_sample,user_map,range_map = True)
def check_user_feature(sample_fea,user_sample,range_map):
    '''
    Print, for every category in range_map, the slice of the encoded
    feature vector next to the raw value of the user record.

    input:
        sample_fea: encoded feature list
        user_sample: raw user record indexed by category
        range_map: category -> [start, end] span inside sample_fea
    '''
    for key, (start, end) in range_map.items():
        encoded = sample_fea[start:end]
        print(f"key: {key},feature:{encoded},true_val: {user_sample[key]}")

In [19]:
def transform_product_modOneHot(sample, product_map=None, n_products=None):
    '''
    Encode one feedback sample as a vector indexed by product id.

    input:
        sample: provides 'pos', 'neg' and 'neu' iterables of product names
        product_map: mapping product name -> index in the output vector;
              when omitted the notebook-level `product_id_map` is used
        n_products: length of the output vector; defaults to
              len(product_map) (131 for the notebook's product_id_map,
              per the original hard-coded size)
    output:
        list of length n_products holding 5 (positive), -5 (negative),
        1 (neutral) or 0 (no feedback). When a product appears in several
        lists the later one wins, in the order pos, neg, neu.
    '''
    if product_map is None:
        # Backward compatible default: the mapping loaded by the notebook.
        product_map = product_id_map
    if n_products is None:
        # Assumes ids are 0 .. len(product_map) - 1 -- TODO confirm.
        n_products = len(product_map)
    product_feedback = [0] * n_products
    for bucket, rating in (('pos', 5), ('neg', -5), ('neu', 1)):
        for product in sample[bucket]:
            product_feedback[product_map[product]] = rating
    return product_feedback

def transform_user_product_data(user_data,product_data,tran_map,exclude_list=[]):
    '''
    Build the model inputs from the raw user and product-feedback records.

    input:
        user_data: user-info records, encoded with transform_userInfo
        product_data: feedback records, encoded with
              transform_product_modOneHot
        tran_map: transformation map forwarded to transform_userInfo
        exclude_list: categories forwarded to transform_userInfo to be
              left out of the features (only read, never modified here)
    output:
        feature: one encoded feature list per user record
        label: one encoded feedback vector per product record
        range_map: category -> [start, end] span inside a feature list,
              computed from the first user record; {} when user_data is
              empty
    '''
    feature = []
    range_map = {}
    for index, sample in enumerate(user_data):
        if index == 0:
            # The spans are only requested once, for the first record.
            sample_fea, range_map = transform_userInfo(
                sample, tran_map, range_map=True, exclude_list=exclude_list)
        else:
            sample_fea = transform_userInfo(
                sample, tran_map, range_map=False, exclude_list=exclude_list)
        feature.append(sample_fea)
    label = [transform_product_modOneHot(sample) for sample in product_data]
    return feature,label,range_map

In [20]:
# test transform_user_product_data
# implement KNN

In [21]:
# Load the raw records and their mappings, then encode them into
# per-user feature vectors and per-user product feedback labels.
user_data, user_map = get_user_data()
product_data, product_id_map = get_product_data()
transformed_feature, label, range_map = transform_user_product_data(
    user_data,
    product_data,
    user_map,
    exclude_list=['CompTotal', 'ConvertedCompYearly', 'CompFreq', 'Currency'],
)

0it [00:00, ?it/s]

In [22]:
# Disabled spot-check, kept as a bare string so it does not execute:
# pick a random sample and print its encoded features and label next to
# the raw records.
'''ind  = random.randrange(len(transformed_feature))
print(ind)
check_user_feature(transformed_feature[ind],user_data[ind],range_map)
print(f'label: {label[ind]},product_data: {product_data[ind]}')'''

"ind  = random.randrange(len(transformed_feature))\nprint(ind)\ncheck_user_feature(transformed_feature[ind],user_data[ind],range_map)\nprint(f'label: {label[ind]},product_data: {product_data[ind]}')"

In [23]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

In [24]:
# Hold out 20% of the samples for testing; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_feature, label, test_size=0.2, random_state=10
)

In [150]:
 def distance(clf, X, y):
    '''
    Scoring callable: negated sum of absolute differences between y and
    clf.predict(X), divided by the number of samples in X.
    '''
    predicted = clf.predict(X)
    total_abs_error = np.sum(np.absolute(y - predicted))
    return -total_abs_error / len(X)
# Grid-search the number of neighbours for KNN, scored with `distance`.
parameters = {'n_neighbors': [1, 3, 5, 7, 11]}
knn = KNeighborsClassifier()
clf = GridSearchCV(estimator=knn, param_grid=parameters, scoring=distance)
clf.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 11]},
             scoring=<function distance at 0x7f2cecd08e50>)

In [156]:
# Show the parameter combination selected by the grid search.
clf.best_params_

{'n_neighbors': 11}

In [29]:
# `clf` is the GridSearchCV object, so each outer fold re-runs the whole
# grid search (nested cross-validation). Report mean / std per metric.
scores = cross_validate(clf, X_train, y_train, scoring=distance, n_jobs=2)
for key, values in scores.items():
    print(key, ' mean ', values.mean())
    print(key, ' std ', values.std())

fit_time  mean  3.26820387840271
fit_time  std  0.06709007741107274
score_time  mean  48.05591616630554
score_time  std  2.410340245167876
test_score  mean  -122.41976311724045
test_score  std  0.3404165230225632


In [153]:
# Predict the feedback vectors of the held-out split with the fitted
# grid-search estimator.
y_pred = clf.predict(X_test)

In [29]:
def average_metricsk(y_pred,y_true,product_id_map,k =5):
    '''
    Average precision@k, recall@k and F1@k over all scorable samples.

    Each sample is a vector indexed by product id. Its non-zero entries
    are ranked by (value, product name) in descending order, so equal
    values are ordered by product name. The first k predicted products
    are compared with the first k true products.

    A sample is skipped when it has no non-zero prediction or fewer than
    k non-zero true entries.

    input:
        y_pred, y_true: sequences of per-sample vectors (same order)
        product_id_map: mapping product name -> index in the vectors
        k: cut-off of the ranking
    output:
        (mean precision, mean recall, mean F1, correct_item) where
        correct_item lists the product names found in both top-k sets,
        concatenated over the scored samples. When no sample can be
        scored the result is (0.0, 0.0, 0.0, []).
    '''
    id_product = {idx: name for name, idx in product_id_map.items()}

    def get_sample_topk_rec(sample):
        # Returns the FULL descending ranking of the non-zero entries;
        # the cut to k happens in metricsk.
        value_product = [(sample[ind], id_product[ind])
                         for ind in range(len(sample))
                         if sample[ind] != 0]
        return sorted(value_product, reverse=True)

    def metricsk(rec1, true_rec, k=5):
        # None signals "sample cannot be scored".
        if k > len(true_rec) or len(rec1) == 0:
            return None
        rec_set = {name for _, name in rec1[:k]}
        true_set = {name for _, name in true_rec[:k]}
        inter = rec_set & true_set
        precision_k = len(inter) / k
        # NOTE(review): recall is divided by ALL non-zero true entries
        # (negative ratings included), not by k -- confirm this is the
        # intended definition.
        recall_k = len(inter) / len(true_rec)
        if precision_k + recall_k == 0:
            F1 = 0
        else:
            F1 = (2 * precision_k * recall_k) / (precision_k + recall_k)
        return precision_k, recall_k, F1, inter

    precison_sum = 0
    recall_sum = 0
    F1_sum = 0
    count = 0
    correct_item = list()
    for s1, s2 in zip(y_pred, y_true):
        scored = metricsk(get_sample_topk_rec(s1), get_sample_topk_rec(s2), k)
        if scored is None:
            continue
        precision, recall, F1, sample_rec_set = scored
        correct_item.extend(sample_rec_set)
        precison_sum += precision
        recall_sum += recall
        F1_sum += F1
        count += 1
    if count == 0:
        # Nothing could be scored (empty input or every sample skipped):
        # avoid dividing by zero.
        return 0.0, 0.0, 0.0, correct_item
    return precison_sum / count, recall_sum / count, F1_sum / count, correct_item

In [154]:
# Top-k ranking metrics of the KNN predictions on the test split.
aveg_precision, avg_recall, avg_F1, correct = average_metricsk(
    y_pred, y_test, product_id_map
)
aveg_precision, avg_recall, avg_F1

(0.22101191207170284, 0.0642974223907517, 0.09498123139742587)

In [155]:
# How often each product was a correct top-k recommendation.
Counter(correct)

Counter({'Visual Studio Code': 8295,
         'React.js': 598,
         'Windows': 558,
         'Git': 1885,
         'TypeScript': 958,
         'Python': 1874,
         'JavaScript': 789,
         'SQL': 1025,
         'Sublime Text': 13,
         'MacOS': 67,
         'SQLite': 49,
         'Rust': 24,
         'Java': 68,
         'Node.js': 159,
         'HTML/CSS': 190,
         'PostgreSQL': 386,
         'C++': 29,
         'Visual Studio': 220,
         'Xcode': 7,
         'IntelliJ': 62,
         'Yarn': 14,
         'NumPy': 58,
         'Docker': 115,
         'Redis': 63,
         'Kubernetes': 27,
         'Pandas': 39,
         'MySQL': 154,
         'jQuery': 13,
         'Vim': 63,
         'MongoDB': 26,
         'TensorFlow': 6,
         'Microsoft SQL Server': 17,
         'Notepad++': 34,
         'PyCharm': 9,
         'Swift': 2,
         'Microsoft Azure': 1,
         'Kotlin': 5,
         'Android Studio': 4,
         'Spring': 5,
         'Vue.js': 19,
     

In [157]:
from sklearn.tree import DecisionTreeClassifier

In [158]:
# Seed the tree (same seed as the train/test split) so the fitted model
# and the metrics below are reproducible across runs.
dtclf = DecisionTreeClassifier(random_state=10)
dtclf.fit(X_train, y_train)

DecisionTreeClassifier()

In [160]:
# Top-k ranking metrics of the decision-tree predictions on the test split.
y_pred = dtclf.predict(X_test)
aveg_precision, avg_recall, avg_F1, correct = average_metricsk(
    y_pred, y_test, product_id_map
)
aveg_precision, avg_recall, avg_F1

In [162]:
# How often each product was a correct top-k recommendation.
Counter(correct)

Counter({'Vim': 642,
         'Visual Studio Code': 6163,
         'PostgreSQL': 280,
         'C++': 34,
         'Visual Studio': 945,
         'Yarn': 341,
         'TypeScript': 1398,
         'SQL': 1330,
         'Python': 1413,
         'JavaScript': 190,
         'Xcode': 236,
         'Redis': 207,
         'Node.js': 139,
         'Sublime Text': 161,
         'MongoDB': 44,
         'Vue.js': 284,
         'Webstorm': 77,
         'React.js': 590,
         'jQuery': 297,
         'Windows': 283,
         'PHP': 42,
         'MySQL': 93,
         'SQLite': 367,
         'Torch/PyTorch': 114,
         'Linux-based': 24,
         'Terraform': 150,
         'Julia': 8,
         'PyCharm': 110,
         'TensorFlow': 135,
         'Kubernetes': 47,
         'Notepad++': 150,
         'NumPy': 74,
         'Git': 459,
         'Unity 3D': 87,
         'RStudio': 27,
         'R': 30,
         'Rust': 112,
         'Xamarin': 13,
         'Pandas': 91,
         'Ruby': 28,
        

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Seed the forest (same seed as the train/test split) so the fitted model
# and the metrics below are reproducible across runs.
rfclf = RandomForestClassifier(max_depth=20, random_state=10)
rfclf.fit(X_train, y_train)

RandomForestClassifier(max_depth=20)

In [30]:
# Top-k ranking metrics of the random-forest predictions on the test split.
y_pred = rfclf.predict(X_test)
aveg_precision, avg_recall, avg_F1, correct = average_metricsk(
    y_pred, y_test, product_id_map
)
aveg_precision, avg_recall, avg_F1

(0.22949274918923276, 0.06759622174829218, 0.09954696894555772)