# Importing the libraries

In [15]:
import pandas as pd
import numpy as np
import warnings
# Suppress every Python warning for the remainder of the session.
# NOTE(review): the blanket 'ignore' filter also hides warnings that may matter
# (deprecations, pandas copy warnings) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Loading the data

In [16]:
# Read the tab-separated training file and reverse the column order
# (the last column in the file becomes the first column of the frame).
traindf = (
    pd.read_csv("../input/nlp-hackathon/TrainData.txt", sep='\t')
    .iloc[:, ::-1]
)
traindf.head()

In [17]:
from sklearn.model_selection import train_test_split
# Split the labelled rows into a training frame and a held-out frame with a fixed seed.
# NOTE(review): test_size=0.7 puts 70% of the rows in X_test and trains on only 30% —
# confirm this is intentional (e.g. to shorten training) rather than a swapped 0.3.
X_train, X_test = train_test_split(
    traindf,
    random_state=42,
    test_size=0.7,
)

In [18]:
# Preview the first rows of the training split.
X_train.head()

In [19]:
# Print (rows, columns) for the training split first, then for the held-out split.
for split_frame in (X_train, X_test):
    print(split_frame.shape)

In [20]:
# Read the competition test file (comma-separated, default read_csv settings) and preview it.
# Later cells read its 'Question' column to generate the submission.
testdf = pd.read_csv("../input/nlp-hackathon/TestData.csv")
testdf.head()

In [21]:
# Show (rows, columns) of the test frame.
testdf.shape

# Building the model

In [22]:
# Rename the raw columns to the "input_text"/"target_text" names used by the model cells.
# errors='raise' makes a missing 'Question' or 'Answer' column fail loudly; as a consequence
# this cell raises if it is re-run on the already-renamed frame.
seq2seq_column_names = {'Question': "input_text", 'Answer': "target_text"}
X_train.rename(columns=seq2seq_column_names, errors='raise', inplace=True)

In [23]:
# Check the renamed columns on the first two training rows.
X_train.head(2)

In [24]:
# Apply the same Question/Answer -> input_text/target_text renaming to the held-out split,
# in place, then preview two rows. errors='raise' means a re-run on the renamed frame raises.
held_out_column_names = {'Question': "input_text", 'Answer': "target_text"}
X_test.rename(columns=held_out_column_names, errors='raise', inplace=True)
X_test.head(2)

In [25]:
!pip install simpletransformers

In [26]:
import logging

import pandas as pd
from simpletransformers.seq2seq import (
    Seq2SeqModel,
    Seq2SeqArgs,
)


# Log at INFO level overall, but restrict the "transformers" logger to warnings and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Restrict both splits to the two columns produced by the earlier rename cells.
seq2seq_columns = ["input_text", "target_text"]
train_df = pd.DataFrame(X_train, columns=seq2seq_columns)
eval_df = pd.DataFrame(X_test, columns=seq2seq_columns)

# Training options, applied in this order onto a default Seq2SeqArgs instance.
training_options = {
    "num_train_epochs": 25,
    "no_save": True,
    "evaluate_generated_text": True,
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "overwrite_output_dir": True,
}
model_args = Seq2SeqArgs()
for option_name, option_value in training_options.items():
    setattr(model_args, option_name, option_value)

# Initialize model (use_cuda=True requests GPU execution)
model = Seq2SeqModel(
    args=model_args,
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    use_cuda=True,
)

# Train the model, passing the held-out frame as evaluation data
model.train_model(train_df, eval_data=eval_df)

# Evaluate the model
results = model.eval_model(eval_df)

# Use the model for prediction on a single sample input
sample_questions = ["i'll give you a speech like that, too."]
print(model.predict(sample_questions))


In [27]:
# Collect every test question into a plain Python list and print the whole list.
test = testdf.loc[:, 'Question'].tolist()
print(test)

In [28]:
# Generate a prediction for every test question and print the full result.
# NOTE(review): y_pred is not referenced again in the visible cells; the submission cell
# calls model.predict(test) a second time instead of reusing it.
y_pred = model.predict(test)
print(y_pred)

# Metrics

In [29]:
# Start the evaluation table with the reference answers from the held-out split,
# then generate predictions for the matching held-out questions.
y_true = X_test['target_text'].tolist()
evaldata = pd.DataFrame({'true': y_true})
held_out_questions = X_test['input_text'].tolist()
y_predeval = model.predict(held_out_questions)

In [30]:
# Add the held-out predictions next to the reference answers and preview the table.
evaldata['pred'] = y_predeval
evaldata.head()

In [31]:
from fuzzywuzzy import fuzz
# Score every evaluation row with fuzzywuzzy's token_set_ratio (reference vs. prediction).
# The scores are collected in ONE list built across all rows; re-creating the list inside
# the loop would keep only the last row's score and make the "average" meaningless.
tokensetratio = [
    fuzz.token_set_ratio(true_answer, predicted_answer)
    for true_answer, predicted_answer in zip(evaldata['true'], evaldata['pred'])
]

# Guard the division so an empty evaluation frame reports cleanly instead of raising.
if tokensetratio:
    print('Average token set ratio', sum(tokensetratio)/len(tokensetratio))
else:
    print('Average token set ratio', 'n/a (no evaluation rows)')
print()
print(tokensetratio)

In [32]:
# Pull the questions out, replace the 'Question' column with the generated 'Answer'
# column, and preview the result. Because 'Question' is dropped from testdf here,
# running this cell a second time raises a KeyError.
test = testdf['Question'].tolist()
testdf = testdf.drop(columns=['Question'])
predicted_answers = model.predict(test)
testdf['Answer'] = predicted_answers
testdf.head()

In [34]:
# Write the frame to 'Eric_1.csv' in the working directory, omitting the index column.
testdf.to_csv('Eric_1.csv', index = False)