In [1]:
import numpy as np, pandas as pd
import ast 
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load the training table. List-valued columns (e.g. "sentences",
# "euclidean_dis", "cosine_sim") are stored as stringified Python literals and
# are parsed in later cells with ast.literal_eval.
data = pd.read_csv("train_detect_sent.csv")

In [9]:
# (rows, columns) of the table currently bound to `data`.
# NOTE(review): this cell's execution count (9) is later than the filtering
# cell's (5), so the shape recorded below may be the post-filter shape --
# confirm with Restart Kernel -> Run All.
data.shape

(85119, 12)

In [3]:
# Preview the first three rows to see the available columns.
data.head(3)

Unnamed: 0,answer_start,context,question,text,sentences,quest_emb,target,sent_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,"['Architecturally, the school has a Catholic c...",[[ 0.11010079 0.11422941 0.11560896 ... 0.0...,5,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[0.424736299052452, 0.36405004106069117, 0.347...","[14.563858, 15.262212, 17.398178, 14.272491, 1...",5,5
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,"['Architecturally, the school has a Catholic c...",[[ 0.10951651 0.11030623 0.05210007 ... -0.0...,2,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[0.45407456884452513, 0.32262004808444933, 0.3...","[12.889506, 12.285219, 16.843704, 8.361172, 11...",3,3
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,"['Architecturally, the school has a Catholic c...",[[ 0.01195647 0.14930707 0.02660049 ... 0.0...,3,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[0.39585783692319865, 0.29170832145169434, 0.3...","[11.857297, 11.392319, 15.061656, 7.1847134, 8...",3,3


In [4]:
# "sentences" holds a stringified Python list; parse the first row to inspect
# the individual sentences of its context paragraph.
ast.literal_eval(data["sentences"][0])

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

In [5]:
# Keep only rows whose parsed "sentences" list has fewer than 11 entries
# (i.e. at most 10), then renumber the index 0..n-1 so later positional-style
# lookups line up with the rows.
data = (
    data.loc[lambda frame: frame["sentences"].map(ast.literal_eval).map(len) < 11]
    .reset_index(drop=True)
)

In [6]:
def create_features(data):
    """Expand the stringified distance lists into one numeric column per sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"euclidean_dis"`` and ``"cosine_sim"``
        (each cell a string holding a Python list literal of numbers) and
        ``"target"``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index as ``data``) with columns
        ``column_euc_0 .. column_euc_{m-1}``, then
        ``column_cos_0 .. column_cos_{k-1}``, then ``target``. Rows whose list
        is shorter than the longest one are padded with NaN.

    Notes
    -----
    Prints ``"Finished"`` once the euclidean columns have been built.
    """

    def _parse(raw):
        # ast.literal_eval cannot parse a bare ``nan`` token, so swap it for
        # the large sentinel 10000 before parsing. Applied to both distance
        # columns so a ``nan`` in either one no longer aborts the whole run.
        return ast.literal_eval(raw.replace("nan", "10000"))

    def _expand(column, prefix):
        # Parse every row first and build the frame in a single constructor
        # call; pandas pads ragged rows with NaN. This replaces cell-by-cell
        # ``.loc`` enlargement, which re-allocates the frame on every write.
        parsed_rows = [_parse(raw) for raw in data[column]]
        # Reuse the caller's index so the ``target`` assignment below aligns
        # row-for-row even when the index is not 0..n-1.
        return pd.DataFrame(parsed_rows, index=data.index).add_prefix(prefix)

    euclidean = _expand("euclidean_dis", "column_euc_")

    print("Finished")

    cosine = _expand("cosine_sim", "column_cos_")

    train = pd.concat([euclidean, cosine], axis=1)
    train["target"] = data["target"]
    return train

In [7]:
# Build the per-sentence distance feature table (plus the "target" column)
# from the filtered rows; prints "Finished" part-way through.
train = create_features(data)

Finished


In [10]:
# Drop the name bound to the source table; the remaining cells shown here only
# use `train`. Re-running any earlier cell that reads `data` requires
# reloading the CSV first.
del data

In [8]:
# Preview the feature table; trailing columns are NaN for rows whose paragraph
# has fewer sentences than the widest row.
train.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,14.563858,15.262212,17.398178,14.272491,13.339654,9.336262,15.720997,,,,...,0.36405,0.347755,0.394242,0.371025,0.18569,0.351921,,,,5
1,12.889506,12.285219,16.843704,8.361172,11.918098,17.601221,14.929258,,,,...,0.32262,0.355004,0.271561,0.392342,0.384383,0.362597,,,,2
2,11.857297,11.392319,15.061656,7.184713,8.465475,13.927309,12.249868,,,,...,0.291708,0.309919,0.223061,0.265975,0.293025,0.288711,,,,3


In [11]:
# Replace the NaN padding (sentence slots a row does not have) with the large
# sentinel 10000, the same value create_features substitutes for "nan".
train = train.fillna(10000)

In [12]:
# Same three rows, transposed so every feature column is visible; the padded
# slots now show the 10000 fill value.
train.head(3).transpose()

Unnamed: 0,0,1,2
column_euc_0,14.563858,12.889506,11.857297
column_euc_1,15.262212,12.285219,11.392319
column_euc_2,17.398178,16.843704,15.061656
column_euc_3,14.272491,8.361172,7.184713
column_euc_4,13.339654,11.918098,8.465475
column_euc_5,9.336262,17.601221,13.927309
column_euc_6,15.720997,14.929258,12.249868
column_euc_7,10000.0,10000.0,10000.0
column_euc_8,10000.0,10000.0,10000.0
column_euc_9,10000.0,10000.0,10000.0


### Fitting Multinomial Logistic Regression

In [13]:
# Every column except the last is a feature; the last column is the label.
feature_frame = train.iloc[:, :-1]
label_series = train.iloc[:, -1]

# 90% of the rows go to training, the rest to testing; the seed is fixed so
# the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    feature_frame,
    label_series,
    train_size=0.9,
    random_state=5,
)

In [14]:
# Fit a multinomial (softmax) logistic regression on the training split.
mul_lr = linear_model.LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    C=100,
)
mul_lr.fit(train_x, train_y)

# Score the fitted model on both splits.
train_predictions = mul_lr.predict(train_x)
train_accuracy = metrics.accuracy_score(train_y, train_predictions)
print("Multinomial Logistic regression Train Accuracy : ", train_accuracy)

test_predictions = mul_lr.predict(test_x)
test_accuracy = metrics.accuracy_score(test_y, test_predictions)
print("Multinomial Logistic regression Test Accuracy : ", test_accuracy)


Multinomial Logistic regression Train Accuracy :  0.44610805801036457
Multinomial Logistic regression Test Accuracy :  0.4538298872180451
