In [1]:
import pandas as pd
import seaborn as sb
from random import shuffle
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline 
import os

In [2]:
# For writing objects to files
def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    return

def load_object(filename):
    with open(filename, 'rb') as input:
        obj=pickle.load(input)
    return obj

In [3]:
#reading the csv files
df_prb=pd.read_csv("train/problems.csv")
df_sub=pd.read_csv("train/submissions.csv")
df_usr=pd.read_csv("train/users.csv")

In [4]:
#group by user_id and problem_id to see submission status
cols=['user_id','problem_id']
df_grp=df_sub.groupby(cols)

In [5]:
grp_ids=df_grp.groups

In [6]:
user_problem_solved={}

In [7]:
#target labels PAC=-1 and AC=+1
df_sub['result'].unique()

array(['PAC', 'AC'], dtype=object)

In [8]:
#sample user_id-problem_id pair
df_grp.get_group( (1178898, 926073))

Unnamed: 0,user_id,problem_id,solved_status,result,language_used,execution_time
159163,1178898,926073,AT,PAC,C++,0.6692
159164,1178898,926073,SO,PAC,C++,0.6883
159165,1178898,926073,SO,PAC,C++,0.7167


In [9]:
#Computing the (user_id,problem_id,solved) dictionary where (user_id,problem_id) is the key solved(+1) or not solved(-1) is the value
#Uncomment to recompute user_problem_label object
"""user_problem_label={}
#test_grp=[(967552,909306),(1178898,926073),(1178898,926073),(1037442,916711),(1130935,913129)]
itr=0
for key in grp_ids:
#for key in test_grp:
    print "\ruser "+str(itr)+":"+str(key[0]),
    itr+=1
    tmp=df_grp.get_group(key)
    #check whether user has got the problem accepted
    if len(tmp[tmp['result']=='AC'])==0: # partially accepted
        user_problem_label[key]= -1
    else: 
         user_problem_label[key]= +1 #accepted"""



#creating a new csv file storing (user_id,problem_id,label) where label says solved or not solved
#uncomment to rewrite the csv file
"""f = open('train/user_problem_labels.csv','w')
f.write("user_id,problem_id,label\n")
for key in user_problem_label:
    to_write=str(key[0])+','+str(key[1])+','+str(user_problem_label[key])  
    f.write(to_write+'\n')    
f.close()"""

'user_problem_label={}\n#test_grp=[(967552,909306),(1178898,926073),(1178898,926073),(1037442,916711),(1130935,913129)]\nitr=0\nfor key in grp_ids:\n#for key in test_grp:\n    print "\ruser "+str(itr)+":"+str(key[0]),\n    itr+=1\n    tmp=df_grp.get_group(key)\n    #check whether user has got the problem accepted\n    if len(tmp[tmp[\'result\']==\'AC\'])==0: # partially accepted\n        user_problem_label[key]= -1\n    else: \n         user_problem_label[key]= +1 #accepted'

In [12]:
#reading the custom made csv file
df_usr_prb=pd.read_csv("train/user_problem_labels.csv")

In [13]:
df_usr_prb.head()

Unnamed: 0,user_id,problem_id,label
0,1037442,916711,1
1,967552,909306,1
2,1178898,926073,-1
3,1327831,925526,1
4,923365,907591,-1


In [14]:
#joining the train dataframes
df = pd.merge(df_prb, df_usr_prb, how='inner', on=['problem_id'])
df_train = pd.merge(df_usr, df, how='inner', on=['user_id'])

In [15]:
#reading the test data
df_tst_prb=pd.read_csv("test/problems.csv")
df_tst_usr=pd.read_csv("test/users.csv")
df_pred=pd.read_csv("test/test.csv")

In [16]:
#joining the test dataframes
df_t = pd.merge(df_tst_prb, df_pred, how='inner', on=['problem_id'])
df_tst = pd.merge(df_tst_usr, df_t, how='inner', on=['user_id'])

In [17]:
df_tst.head(2)

Unnamed: 0,user_id,skills,solved_count_x,attempts,user_type,problem_id,level,accuracy,solved_count_y,error_count,rating,tag1,tag2,tag3,tag4,tag5,Id
0,1444303,Python,0,5,W,940002,E,0.42,63,371,3.7,Ad-Hoc,Basic Programming,Implementation,,,14425
1,1444303,Python,0,5,W,940003,E,0.32,45,396,3.9,Prime Factorization,Math,,,,22576


In [18]:
df_train.head(2)

Unnamed: 0,user_id,skills,solved_count_x,attempts,user_type,problem_id,level,accuracy,solved_count_y,error_count,rating,tag1,tag2,tag3,tag4,tag5,label
0,1427919,C++,0,11,W,913736,M,0.21,524,7868,4.0,Ad-Hoc,Dynamic Programming,Algorithms,,,-1
1,1034704,C,3,11,,906741,E,0.87,503,443,3.8,Ad-Hoc,Data Structures,Math,,,-1


In [19]:
df_train.columns

Index([u'user_id', u'skills', u'solved_count_x', u'attempts', u'user_type',
       u'problem_id', u'level', u'accuracy', u'solved_count_y', u'error_count',
       u'rating', u'tag1', u'tag2', u'tag3', u'tag4', u'tag5', u'label'],
      dtype='object')

In [20]:
df_tst.columns

Index([u'user_id', u'skills', u'solved_count_x', u'attempts', u'user_type',
       u'problem_id', u'level', u'accuracy', u'solved_count_y', u'error_count',
       u'rating', u'tag1', u'tag2', u'tag3', u'tag4', u'tag5', u'Id'],
      dtype='object')

In [21]:
#features I have decided to consider for now
features=['solved_count_x','attempts','user_type','level','accuracy','solved_count_y','error_count','rating','tag1','tag2']

In [22]:
#filling empty cell in both test and train using forward fill approach.
# could have tried mode too..!!
for col in features:
    df_train[col]=df_train[col].fillna(method='ffill')
    df_tst[col]=df_tst[col].fillna(method='ffill')

In [23]:
#df_train.head(10)
#Identiying categorical inputs
categorical=['user_type','level','tag1','tag2']

In [24]:
df_train.describe()

Unnamed: 0,user_id,solved_count_x,attempts,problem_id,accuracy,solved_count_y,error_count,rating,label
count,421975.0,421975.0,421975.0,421975.0,421975.0,421975.0,421975.0,421975.0,421975.0
mean,1075648.0,56.144518,64.765586,915284.705651,0.639341,1084.1139,1557.249856,3.145151,0.072528
std,136022.2,83.183899,110.757002,8908.973205,0.232088,1744.103088,2106.635717,1.400578,0.997368
min,903633.0,0.0,0.0,903637.0,0.03,0.0,0.0,0.0,-1.0
25%,962742.0,8.0,11.0,908159.0,0.46,139.0,298.0,3.0,-1.0
50%,1037837.0,26.0,31.0,913737.0,0.7,388.0,767.0,3.6,1.0
75%,1158876.0,68.0,75.0,920127.0,0.84,1216.0,1850.0,4.0,1.0
max,1444693.0,789.0,7211.0,937962.0,1.0,8922.0,13346.0,5.0,1.0


In [25]:
#Label encoding categorical inputs so that classifier can use them
from sklearn.preprocessing import LabelEncoder

In [26]:
for col in categorical:
    enc = LabelEncoder()
    enc.fit(df_train[col].append(df_tst[col]))
    modified_train_col=enc.transform(df_train[col])
    modified_tst_col=enc.transform(df_tst[col])
    df_train[col+' enc']=modified_train_col
    df_tst[col+' enc']=modified_tst_col

In [27]:
#processed feature columns
features=['solved_count_x','attempts','user_type enc','level enc','accuracy','solved_count_y','error_count','rating','tag1 enc','tag2 enc']

In [53]:
#df_train[features].head()

In [29]:
######## CLASSIFICATION #### 


#from sklearn.ensemble import ExtraTreesClassifier
#from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import DecisionTreeClassifier


In [30]:
model=RandomForestClassifier(n_estimators=150)
#model=BaggingClassifier(n_estimators=150)
#model=DecisionTreeClassifier()
#model=ExtraTreesClassifier(n_estimators=150)


In [31]:
print(":::learning the model::: ")

model.fit(df_train[features],df_train['label'])

:::learning the model::: 


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
print(":::predicting for test data::: ")
predictions=model.predict(df_tst[features])
df_tst['prediction']=predictions

#This loops to re-order the predictions w.r.t to test.csv
d={}
for ii,row in df_tst.iterrows():
    d[(row['user_id'],row['problem_id'])]=row['prediction']
t_usr=[]
t_prb=[]
t_pred=[]
for ii,row in df_pred.iterrows():
    t_usr.append(row['user_id'])
    t_prb.append(row['problem_id'])
    t_pred.append(d[(row['user_id'],row['problem_id'])])
    
result = pd.DataFrame({        
        "prediction":t_pred,        
        "problem_id": t_prb,
        "user_id":t_usr,
    })
result.to_csv("test/predictions.csv", index=False,columns=['user_id','problem_id','prediction'])

In [35]:
#Computing training error
train_predictions=model.predict(df_train[features])
df_train['prediction']=train_predictions

In [37]:
df_train[['label','prediction']].head(10)

Unnamed: 0,label,prediction
0,-1,-1
1,-1,-1
2,-1,-1
3,-1,-1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [44]:
#Finding accuracy
df_train['correct_prediction']=(df_train['label']==df_train['prediction'])

In [45]:
df_train[['label','prediction','correct_prediction']].head(10)

Unnamed: 0,label,prediction,correct_prediction
0,-1,-1,True
1,-1,-1,True
2,-1,-1,True
3,-1,-1,True
4,1,1,True
5,1,1,True
6,1,1,True
7,1,1,True
8,1,1,True
9,1,1,True


In [46]:
accuracy=sum(df_train['correct_prediction'])*100/float(len(df_train))

In [50]:
print("Training accuracy (%)",accuracy)

('Training accuracy (%)', 97.493690384501448)


In [56]:
result = pd.DataFrame({        
        "prediction":t_pred,        
    })
result.to_csv("test/predictions_labels.csv", index=False,columns=['user_id','problem_id','prediction'])