In [1]:
import json
import pandas as pd
from tqdm import tqdm
from numpy import array, NaN
from tensorflow.keras.models import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Crawl test data

In [None]:
directory=pd.read_csv("final_evaluation_data.csv")

df_data={
    'pair_id':[],
    'title1':[],
    'text1':[],
    'title2':[],
    'text2':[],
    'Overall':[],
    'Language1':[],
    'Language2':[]
}

In [None]:
for idx, i in tqdm(enumerate(directory['pair_id'])):
    id1, id2 = i.split('_')
    try:
        file1=json.load(open(f"eval_data\eval_data\{str(id1)[-2:]}\{id1}.json"))
        file2=json.load(open(f"eval_data\eval_data\{str(id2)[-2:]}\{id2}.json"))
        df_data['pair_id'].append(i)
        
        if file1['title'].strip() == '':
            df_data['title1'].append(NaN)
        else:
            df_data['title1'].append(file1['title'])

        if file1['text'].strip() == '':
            df_data['text1'].append(NaN)
        else:
            df_data['text1'].append(file1['text'])

        if str(file2['title'].strip()) == '':
            df_data['title2'].append(NaN)
        else:
            df_data['title2'].append(file2['title'])

        if file2['text'].strip() == '':
            df_data['text2'].append(NaN)
        else:
            df_data['text2'].append(file2['text'])
        
        df_data['Overall'].append(directory['Overall'][idx])
        df_data['Language1'].append(directory['url1_lang'][idx])
        df_data['Language2'].append(directory['url2_lang'][idx])
        
    except Exception:
        pass

4902it [06:09, 13.28it/s]


In [None]:
df=pd.DataFrame(df_data)
df.dropna(inplace=True)
df=df.reset_index()
df=df.drop(['index'], axis=1)
df.to_csv('test_data.csv')

## Preprocess test data

In [None]:
df=pd.read_csv('test_data.csv')

In [None]:
stop_words_eng = set(stopwords.words('english'))
stop_words_fr = set(stopwords.words('french'))
stop_words_es = set(stopwords.words('spanish'))
stop_words_tr = set(stopwords.words('turkish'))
stop_words_de = set(stopwords.words('german'))
stop_words_ar = set(stopwords.words('arabic'))
stop_words_it = set(stopwords.words('italian'))
stop_words_ru = set(stopwords.words('russian'))

stopwords={
    'en':stop_words_eng,
    'fr':stop_words_fr,
    'es':stop_words_es,
    'tr':stop_words_tr,
    'ar':stop_words_ar,
    'de':stop_words_de,
    'it':stop_words_it,
    'ru':stop_words_ru
}

In [None]:
for i in tqdm(range(len(df))):
    if df['Language1'][i]=='pl' or df['Language2'][i]=='pl' or df['Language1'][i]=='zh' or df['Language2'][i]=='zh':
        continue
    word_tokens_title1 = df['title1'][i]
    word_tokens_text1 = df['text1'][i]
    word_tokens_title2 = df['title2'][i]
    word_tokens_text2 = df['text2'][i]

    df['text1'][i] = ' '.join([w.lower() for w in word_tokens_title1.split() if not w.lower() in stopwords[df['Language1'][i]]])
    df['text1'][i] +=' '+' '.join([w.lower() for w in word_tokens_text1.split() if not w.lower() in stopwords[df['Language1'][i]]])
    df['text2'][i] = ' '.join([w.lower() for w in word_tokens_title2.split() if not w.lower() in stopwords[df['Language2'][i]]])
    df['text2'][i] +=' '+' '.join([w.lower() for w in word_tokens_text2.split() if not w.lower() in stopwords[df['Language2'][i]]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text1'][i] = ' '.join([w.lower() for w in word_tokens_title1.split() if not w.lower() in stopwords[df['Language1'][i]]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text1'][i] +=' '+' '.join([w.lower() for w in word_tokens_text1.split() if not w.lower() in stopwords[df['Language1'][i]]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text2'][i] = ' '.join([w.lower() for w in word_tokens_title2.split() if not w.lower() in stopwords[df['Language2'][i]]]

In [None]:
df=df.drop(['Unnamed: 0', 'title1', 'title2'], axis=1)
df=df.rename(columns={
    'text1': 'article1',
    'text2': 'article2'
})
df.to_csv('preprocessed_test.csv')

## Loading test data

In [None]:
train_data = pd.read_csv('preprocessed.csv')
train_data = train_data.dropna()
train_data = train_data.reset_index(drop=True)

In [None]:
test_data = pd.read_csv('preprocessed_test.csv')
test_data = test_data.dropna()
test_data = test_data.reset_index(drop=True)

In [None]:
X_train, X_test, y_test, y_train = [], [], [], [] # Preparing test data

In [None]:
for idx in range(len(train_data)):
  X_train.append(train_data['article1'][idx])
  X_train.append(train_data['article2'][idx])
  y_train.append(train_data['Overall'][idx])

In [None]:
for idx in range(len(test_data)):
  X_test.append(test_data['article1'][idx])
  X_test.append(test_data['article2'][idx])
  y_test.append(test_data['Overall'][idx])

## Tokenization

In [None]:
tf_vec = TfidfVectorizer(min_df =4) # Term frequency - inverse document frequency
x_train = tf_vec.fit(X_train)
x_test = tf_vec.transform(X_test)

In [None]:
X_test_art1 = []
X_test_art2 = []
X_test1 = []
X_test2 = []

for j in range(len(test_data)):
  art1 = 2*idx
  art2 = 2*idx+1
  X_test_art1.append(x_test[art1])
  X_test_art2.append(x_test[art2])

In [None]:
for i in range(len(test_data)):
  X_test1.append(array(csr_matrix.todense(X_test_art1[i])))
  X_test2.append(array(csr_matrix.todense(X_test_art2[i])))

In [None]:
y_test = array(y_test)


In [None]:
X_test1 = array(X_test1)
X_test2 = array(X_test2)

In [None]:
X_test1 = X_test1.reshape(len(test_data),X_test1.shape[2])
X_test2 = X_test2.reshape(len(test_data),X_test2.shape[2])

In [None]:
model=load_model("Saved_model")

In [None]:
predictions = model.predict([X_test1, X_test2])

In [None]:
mean_absolute_error(predictions, y_test)

1.0620849041219411