In [33]:
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
 
# NOTE(review): no category/module filter is given, so this suppresses *every*
# warning raised after this point (pandas and gensim warnings included).
warnings.filterwarnings(action = 'ignore')
 
import gensim
from gensim.models import Word2Vec
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
 

In [7]:
# Load the raw training set.
df = pd.read_csv("Training-dataset.csv")

# One free-text field per row: title followed by the plot synopsis.
# fillna('') keeps a missing title or synopsis from turning the whole
# concatenation into NaN (which would later break string preprocessing).
df['text'] = df['title'].fillna('') + ' ' + df['plot_synopsis'].fillna('')

# Keep only the text and the 0/1 tag columns. The explicit .copy() makes
# training_data an independent frame, so columns can be added to it later
# without triggering pandas' SettingWithCopyWarning.
training_data = df[['text', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].copy()
training_data.head()

Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0


In [8]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Tokens dropped during preprocessing: NLTK's English stop-word list plus two
# extra entries ('reuter' and the control character '\x03').
stop_words = {*stopwords.words('english'), 'reuter', '\x03'}

# WordNet lemmatizer used by the preprocessing step; the Porter stemmer
# alternative below is intentionally left disabled.
lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

# Built once instead of on every call: deletes every character listed in
# string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocessor(text: str) -> str:
    """Normalise a document into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete punctuation characters (they are removed, not replaced by a
         space, so "crime-ridden" becomes "crimeridden");
      3. replace every run of digits with the literal token ``num``;
      4. split on whitespace and drop tokens found in ``stop_words``;
      5. lemmatize each remaining token with ``lemmatizer``.

    Args:
        text: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces. Non-string input
        (e.g. ``None`` or a NaN coming from a missing DataFrame cell)
        yields an empty string instead of raising.
    """
    # Missing values reach this function as float NaN / None via .apply().
    if not isinstance(text, str):
        return ""

    text = text.lower().translate(_PUNCT_TABLE)

    text = re.sub(r'\d+', 'num', text)

    tokens = [word for word in text.split() if word not in stop_words]

    return " ".join(lemmatizer.lemmatize(word) for word in tokens)


In [11]:
# Add the cleaned text as a new column. .assign returns a new frame rather than
# writing into the existing one in place, so this cell is safe to re-run and
# does not depend on whether training_data is a slice of another frame.
training_data = training_data.assign(
    preprocessed_text=training_data['text'].apply(preprocessor)
)
training_data.head()

Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence,preprocessed_text
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1,si wang ta recent amount challenge billy lo br...
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1,shattered vengeance crimeridden city tremont r...
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0,lesorciccio lankester merrin veteran catholic ...
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0,serendipity season serendipity season heartwar...
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0,liability young naive numyearold slacker adam ...


In [39]:
# Train Word2Vec on the preprocessed documents (one whitespace-tokenised
# document per "sentence"); all other hyperparameters stay at gensim defaults.
# seed=1 is gensim's default, spelled out here. workers=1 removes the
# thread-scheduling jitter that otherwise makes vectors differ between runs.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches - confirm whether that is needed here.
model = gensim.models.Word2Vec(
    sentences=[doc.split() for doc in training_data['preprocessed_text'].to_list()],
    seed=1,
    workers=1,
)

In [44]:
def sim(term1, term2):
    """Return the similarity of two terms according to the trained ``model``.

    Args:
        term1: First term to look up in ``model.wv``.
        term2: Second term to look up in ``model.wv``.

    Returns:
        The value of ``model.wv.similarity(term1, term2)``, or ``0.0`` when the
        lookup raises ``KeyError`` (presumably because a term is missing from
        the model's vocabulary).
    """
    try:
        return model.wv.similarity(term1, term2)
    except KeyError:
        # Best-effort fallback: a float (not int 0) so every return is numeric
        # of the same kind as the success path.
        return 0.0

In [45]:
# The validation CSV is read without a header row; column names are supplied
# here explicitly.
validation_file = pd.read_csv(
    "Task-1-validation-dataset.csv",
    header=None,
    names=['index', 'term1', 'term2', 'score'],
)
validation_file.head()

Unnamed: 0,index,term1,term2,score
0,1,absorb,learn,5.48
1,2,absorb,withdraw,2.97
2,3,achieve,accomplish,8.57
3,4,achieve,try,4.42
4,6,acquire,get,8.82


In [46]:
# Score every (term1, term2) pair with sim(). Zipping the two columns avoids
# building a Series per row the way iterrows() does.
all_similarity_vals = [
    sim(term1, term2)
    for term1, term2 in zip(validation_file['term1'], validation_file['term2'])
]

validation_file['prediction_score'] = all_similarity_vals
validation_file.head()

Unnamed: 0,index,term1,term2,score,prediction_score
0,1,absorb,learn,5.48,0.227604
1,2,absorb,withdraw,2.97,0.567298
2,3,achieve,accomplish,8.57,0.857312
3,4,achieve,try,4.42,-0.061494
4,6,acquire,get,8.82,0.140506


In [47]:
# Keep only the pair identifier and the model's score for the output file.
prediction_df = validation_file.loc[:, ['index', 'prediction_score']]
prediction_df.head()

Unnamed: 0,index,prediction_score
0,1,0.227604
1,2,0.567298
2,3,0.857312
3,4,-0.061494
4,6,0.140506


In [49]:
# Write the predictions as bare "index,score" rows: no header line and no
# DataFrame row index.
prediction_df.to_csv(
    'prediction_file_Task1_b.csv',
    index=False,
    header=False,
)