In [2]:
import pandas as pd
import json
import re
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix


def convert_train_data(data):
    """Build per-author training samples from the raw training records.

    A record that lists several authors contributes one sample per author:
    the feature dict is rebuilt for each author and paired with that author
    as the label.

    Args:
        data: mapping of instance id -> record dict. Every record must carry
            an ``"author"`` sequence.

    Returns:
        A ``(x_array, y_array)`` tuple: the list of feature dicts produced by
        ``get_instance_dict`` and the parallel list of author labels.
    """
    feature_dicts = []
    author_labels = []
    for record in data.values():
        for author in record["author"]:
            feature_dicts.append(get_instance_dict(record, author))
            author_labels.append(author)
    return feature_dicts, author_labels




def get_instance_dict(instance, author):
    """Extract the feature dict for one record (helper for convert_train_data).

    Only the ``"year"``, ``"venue"`` and ``"keywords"`` entries of the record
    are copied; every other key is ignored. The coauthor feature is not used,
    so ``author`` is accepted but never read.

    Args:
        instance: record dict for a single paper.
        author: the author this sample is generated for (currently unused).

    Returns:
        Dict with the subset of ``year`` / ``venue`` / ``keywords`` present in
        ``instance``. An empty-string year or venue becomes ``-1``; keywords
        are converted to a list of strings.
    """
    features = {}
    for key, raw in instance.items():
        if key in ("year", "venue"):
            # Placeholder for a missing value.
            # TODO: a placeholder that is too large may cause normalization problems.
            features[key] = -1 if raw == "" else raw
        elif key == "keywords":
            features[key] = [str(keyword) for keyword in raw]
    return features
                

def convert_test_data(data):
    """Split the raw test records into features, coauthor lists and targets.

    Args:
        data: mapping of instance id -> record dict.

    Returns:
        A ``(x_array, coauthors, targets)`` tuple:
        * ``x_array`` - one feature dict per record, holding whichever of
          ``year`` / ``venue`` / ``keywords`` the record has (an empty-string
          year or venue becomes ``-1``; keywords become strings).
        * ``coauthors`` - the ``"coauthor"`` value of each record that has one.
        * ``targets`` - the ``"target"`` value of each record that has one.

        ``coauthors`` and ``targets`` only grow when the key is present, so
        they line up with ``x_array`` only if every record carries both keys.
    """
    feature_rows, coauthor_lists, target_labels = [], [], []
    for record in data.values():
        row = {}
        for key, raw in record.items():
            if key in ("year", "venue"):
                # -1 stands in for a missing value
                row[key] = -1 if raw == "" else raw
            elif key == "keywords":
                row[key] = [str(keyword) for keyword in raw]
            elif key == "coauthor":
                coauthor_lists.append(raw)
            elif key == "target":
                target_labels.append(raw)
        feature_rows.append(row)
    return feature_rows, coauthor_lists, target_labels


In [4]:
# Load the train/test JSON files, turn the feature dicts into numeric matrices
# with a shared DictVectorizer, and scale every feature to unit variance.
vec = DictVectorizer()

# train data
train_file_path = "train.json"
# Use a context manager so the file handle is closed right after parsing.
with open(train_file_path, "r") as train_file:
    train_data = json.load(train_file)  # dict: instance id -> record

x_train, y_train = convert_train_data(train_data)
# Original author's note: result is a [500-keyword, venue, year] vector
# (dimension = 502) -- not verifiable from this notebook alone.
# The vectorizer is fitted on the training dicts only.
x_train = vec.fit_transform(x_train).toarray()


# test data
test_file_path = "test.json"
with open(test_file_path, "r") as test_file:
    test_data = json.load(test_file)  # dict: instance id -> record

x_test, coauthors, targets = convert_test_data(test_data)
# Reuse the fitted vectorizer so test columns match the training columns.
x_test = vec.transform(x_test).toarray()

# Since SVMs incorporate a penalty term for the weights (proportional to ||w||^2),
# standardise features so that each feature has unit variance.
# with_mean=False skips centering, so features are scaled but NOT zero-mean.
# NOTE(review): centering is only unsupported for sparse input; the matrices
# here are already dense (.toarray() above), so with_mean=True would also work.
# Kept as-is to preserve the existing feature values.
scaler = StandardScaler(with_mean=False)
x_train = scaler.fit_transform(x_train)  # scaling statistics come from train only
x_test = scaler.transform(x_test)

In [None]:
# Split the training data into train and dev parts, grid-search the SVC
# hyper-parameters with 2-fold cross-validation on the train part, then report
# how the winning configuration does on the held-out dev part.

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

x_tr, x_dev, y_tr, y_dev = train_test_split(x_train, y_train, test_size=0.2, random_state=52)

# `gamma` is a kernel coefficient that the linear kernel ignores, so crossing it
# with kernel='linear' would only fit identical models twice. Use one sub-grid
# per kernel so every candidate is a distinct model.
c_values = [0.5, 1.0, 1.5]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'gamma': ['scale', 'auto'], 'C': c_values},
]

svc = SVC()
gs = GridSearchCV(estimator=svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  n_jobs=2,
                  verbose=3,
                  cv=2)
gs.fit(x_tr, y_tr)
best_params = gs.best_params_
print("Best parameters for grid search are ", best_params)

# The dev split was previously created but never used; score the refitted best
# estimator on it (uses the same 'accuracy' scorer as the search).
print("Dev accuracy of the best estimator is ", gs.score(x_dev, y_dev))

In [4]:
# Fit a linear-kernel SVC on the full training set and write, for every test
# instance, the predicted probability of its target author to a CSV file.
svc = SVC(kernel="linear", C=0.6, class_weight='balanced', probability=True)  # linear kernel
model = svc.fit(x_train, y_train)
prob_distribution = model.predict_proba(x_test)  # predicted prob distribution over each author

import csv
prediction_file = "test_submission.csv"

# predict_proba columns are ordered like model.classes_, not by raw label
# value, so translate each target label into its column position first.
class_to_column = {label: column for column, label in enumerate(model.classes_)}

prob_list = []
for i, row_probs in enumerate(prob_distribution):
    column = class_to_column.get(targets[i])
    # A target author never seen in training has no column: report probability 0.
    prob_list.append([i, row_probs[column] if column is not None else 0.0])

# newline='' is what the csv module expects (avoids blank rows on Windows);
# the context manager closes the file even if writing fails.
with open(prediction_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Predicted"])
    writer.writerows(prob_list)

In [9]:
# CalibratedClassifierCV with LinearSVC: LinearSVC has no predict_proba, so it
# is wrapped in a calibrator to obtain per-author probabilities, which are then
# written to a CSV file (one row per test instance).

import csv
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

# Prefer dual=False when n_samples > n_features.
svm = LinearSVC(penalty="l1", dual=False)  # penalty in {'l1', 'l2'}, default 'l2'
clf = CalibratedClassifierCV(svm)

clf.fit(x_train, y_train)
prob_distribution = clf.predict_proba(x_test)  # predicted prob distribution over each author

prediction_file = "test_submission7.csv"

# predict_proba columns are ordered like clf.classes_, not by raw label value,
# so translate each target label into its column position first.
class_to_column = {label: column for column, label in enumerate(clf.classes_)}

prob_list = []
for i, row_probs in enumerate(prob_distribution):
    column = class_to_column.get(targets[i])
    # A target author never seen in training has no column: report probability 0.
    prob_list.append([i, row_probs[column] if column is not None else 0.0])

# newline='' is what the csv module expects (avoids blank rows on Windows);
# the context manager closes the file even if writing fails.
with open(prediction_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Predicted"])
    writer.writerows(prob_list)