In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from collections import Counter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Build the full path of every file in this directory and emit them one per
    # line; directories that contain no files print nothing.
    full_paths = [os.path.join(dirname, name) for name in filenames]
    if full_paths:
        print("\n".join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/link-prediction-outputs-new/sim_score_output (1).csv
/kaggle/input/dsaa-2023-competition/sample_submission.csv
/kaggle/input/dsaa-2023-competition/train.csv
/kaggle/input/dsaa-2023-competition/test.csv
/kaggle/input/dsaa-2023-competition/nodes/nodes.tsv
/kaggle/input/link-prediction-outputs-new1/test_sim_score_output (2).csv


In [7]:
# Reading inputs
# The three competition tables are plain CSV files and share the default parser settings.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/dsaa-2023-competition/train.csv",
        "/kaggle/input/dsaa-2023-competition/test.csv",
        "/kaggle/input/dsaa-2023-competition/sample_submission.csv",
    )
)
# The node table is tab-separated, so it needs an explicit separator.
nodes = pd.read_csv('/kaggle/input/dsaa-2023-competition/nodes/nodes.tsv', sep='\t')

#### We use two models: the first scores a pair of nodes from the graph structure (connections through adjacent nodes), and the second scores it from the text similarity between the two nodes' descriptions. The two predictions are then combined to give the final label.

### Model 1 - Prediction for adjacent nodes (Graph structure)

In [8]:
# Graph score

# Number of distinct node ids (nunique skips missing values by default)
total_nodes = nodes['id'].nunique()

# Available edges: the training pairs labelled 1, reduced to their two endpoint columns
is_edge = train['label'] == 1
df_edges = train.loc[is_edge, ['id1', 'id2']]



# Graph class with functions for defining graph structure,add edges and a function if some path exist
class Graph:
    """Undirected graph stored as adjacency lists.

    The neighbours of node id ``k`` are kept in ``self.adj[k-1]``, so every id
    used with this class must map to a valid index of ``self.adj``.
    """

    def __init__(self, V):
        """Create a graph with ``V`` empty adjacency lists."""
        self.V = V
        self.adj = [[] for _ in range(V)]

    def addEdge(self, v, w):
        """Add an undirected edge by recording each endpoint as a neighbour of the other."""
        self.adj[v-1].append(w)
        self.adj[w-1].append(v)

    def path_exist(self,u,v,depth=5):
        """Return 1 if ``v`` is reachable from ``u`` within ``depth + 1`` edges, else 0.

        The direct neighbours of ``u`` are always checked; ``depth`` is the number
        of additional levels explored beyond them.  Identical ids return 1, and an
        empty-string id on either side returns 0.

        The search proceeds level by level and remembers the nodes it has already
        reached, so each node is expanded at most once.  This gives the same
        answer as re-expanding every neighbour list at every level (a walk of at
        most k edges exists exactly when the shortest path has at most k edges)
        without the work list growing exponentially with ``depth``.
        """
        if u==v:
            return 1

        if u == "" or v == "":
            return 0

        # Level 1: direct neighbours of u.
        frontier = set(self.adj[u-1])
        if v in frontier:
            return 1

        seen = set(frontier)
        seen.add(u)

        levels_left = depth
        # Stop early once no new node can be reached.
        while levels_left > 0 and frontier:
            next_frontier = set()
            for node in frontier:
                for neighbour in self.adj[node-1]:
                    if neighbour not in seen:
                        seen.add(neighbour)
                        next_frontier.add(neighbour)

            if v in next_frontier:
                return 1

            frontier = next_frontier
            levels_left -= 1
        return 0




# Creating the graph for the training data and adding its edges.
# Graph keeps node id k at index k-1, so it needs at least as many slots as the
# largest id that is added or looked up; the count of distinct ids alone is too
# small whenever the ids are not exactly 1..total_nodes.
pair_columns = ['id1', 'id2']
largest_pair_id = max(
    (int(frame[pair_columns].max().max()) for frame in (df_edges, test) if len(frame) > 0),
    default=0,
)
g_submission = Graph(max(total_nodes, largest_pair_id)) # Graph
for edge_id1, edge_id2 in zip(df_edges['id1'], df_edges['id2']): # Adding edges
    g_submission.addEdge(edge_id1, edge_id2)

# Flag test pairs that are connected in the training graph. path_exist checks the
# direct neighbours first and then expands `depth` further levels, so depth=3
# marks pairs that are at most 4 edges apart.
test['pred_graph'] = [
    g_submission.path_exist(pair_id1, pair_id2, depth=3)
    for pair_id1, pair_id2 in zip(test['id1'], test['id2'])
]

### Model 2 - Prediction from text similarity scores

In [9]:
## All the functions needed for text similarity
# Similarity Score function

def sim_score(text1,text2):
    """Return the Jaccard similarity of the word sets of two texts.

    Each text is split on whitespace, so repeated or surrounding spaces do not
    produce an empty "word".  The score is the number of shared distinct words
    divided by the number of distinct words overall, in the range 0.0 to 1.0.
    When neither text contains a word the score is 0.0: two empty texts give no
    evidence of similarity and must not count as a perfect match.
    """
    words1 = set(text1.split())
    words2 = set(text2.split())

    all_words = words1 | words2
    if not all_words:
        return 0.0

    return len(words1 & words2) / len(all_words)

# Text cleaning function
import re
import nltk

def clean_text(raw_text):
    """Normalise a description to lower-case words separated by single spaces.

    Every run of whitespace (tabs and newlines included) is first turned into a
    single space so that it keeps separating words; all remaining characters
    other than ASCII letters and spaces are then removed, and the result is
    lower-cased.
    """
    # Tabs/newlines must become spaces before the filter below, otherwise they
    # would be deleted and the words around them would be glued together.
    spaced_text = re.sub(r'\s+', " ", raw_text)

    #Keep only alphabets and spaces and converting everything to lower case
    letters_only = re.sub('[^a-zA-Z ]+', "", spaced_text)
    cleaned_text = " ".join(letters_only.split()).lower()

    #Lemmatization - convert them to their root words
    # Leaving it for now due to computational efficiency

    return cleaned_text


# Identify the most frequent words across all descriptions (they are removed from
# the descriptions like stopwords).
# Raw node descriptions and their cleaning: keep one row per node id. Only `id`
# and `text` are used from here on, so only those two columns are checked for
# missing values.
nodes_text = (
    nodes
    .dropna(subset=['id', 'text'])
    .drop_duplicates(subset=['id'])
    .copy()
)
nodes_text['cleaned_text'] = nodes_text['text'].apply(clean_text)

# Getting the frequency of each word over all cleaned descriptions (empty tokens are skipped)
word_counts = Counter(
    word
    for description in nodes_text['cleaned_text']
    for word in description.split(' ')
    if word != ''
)
# Passing the column names here keeps the frame well-formed even when no word was counted.
word_freq = pd.DataFrame(list(word_counts.items()), columns=['word', 'frequency'])

# Selecting most frequent stopwords: words counted more often than this are
# treated as corpus-specific stopwords.
STOPWORD_MIN_FREQUENCY = 40000
is_frequent = word_freq['frequency'] > STOPWORD_MIN_FREQUENCY
selected_stopwords = word_freq.loc[is_frequent, 'word'].tolist()


# Adding these selected stopwords to nltk corpus stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words = stop_words + selected_stopwords

#removing stopwords
def remove_stopwords(text,stopwords):
    """Return ``text`` with every word found in ``stopwords`` removed.

    ``text`` is split on single spaces and the surviving words are joined back
    with single spaces, in their original order.  ``stopwords`` may be any
    container supporting ``in``; a list or tuple is converted to a set once so
    that each word is checked with a hash lookup instead of a linear scan.
    """
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return " ".join(word for word in text.split(' ') if word not in stopwords)

In [10]:
# text similarity score

# Get node descriptions for both ends of every test pair
node_descriptions = nodes_text[["id", "text", "cleaned_text"]]
test_modified = pd.merge(
    test,
    node_descriptions.rename(columns={"id": "id1", "text": "id1_text", "cleaned_text": "id1_desc"}),
    on=['id1'], how='left',
)
test_modified = pd.merge(
    test_modified,
    node_descriptions.rename(columns={"id": "id2", "text": "id2_text", "cleaned_text": "id2_desc"}),
    on=['id2'], how='left',
)

# Remove stopwords. A node without a description gets an empty string here rather
# than the literal text "nan" that str() would make out of a missing value.
stop_word_set = set(stop_words)
for desc_column in ('id1_desc', 'id2_desc'):
    test_modified[desc_column] = (
        test_modified[desc_column]
        .fillna('')
        .apply(lambda x: remove_stopwords(str(x), stop_word_set))
    )

# Get similarity score
test_modified['sim_score'] = [
    sim_score(desc1, desc2)
    for desc1, desc2 in zip(test_modified['id1_desc'], test_modified['id2_desc'])
]
# A pair in which either description is empty carries no text evidence: force its
# score to 0 so that two empty descriptions are never treated as a match.
has_both_descriptions = test_modified['id1_desc'].ne('') & test_modified['id2_desc'].ne('')
test_modified.loc[~has_both_descriptions, 'sim_score'] = 0.0

test_modified = test_modified.drop(columns=['id1_text', 'id1_desc', 'id2_text', 'id2_desc'])

# NOTE(review): this rename only has an effect if the test data carries a 'pred'
# column; 'pred_text' is overwritten just below either way - confirm it is still needed.
test_modified = test_modified.rename(columns={'pred': 'pred_text'})

# Pairs whose similarity exceeds this threshold are predicted as linked by the text model.
SIM_SCORE_THRESHOLD = 0.008
test_modified['pred_text'] = (test_modified['sim_score'] > SIM_SCORE_THRESHOLD).astype(int)

### Merging both predictions

In [11]:
# Attach the text-model columns to the test pairs (which already carry pred_graph)
text_columns = test_modified[['id', 'id1', 'id2', 'sim_score', 'pred_text']]
final_submission_pred = test.merge(text_columns, how='inner', on=['id', 'id1', 'id2'])

# A graph hit always wins; every other pair falls back to the text prediction
graph_hit = final_submission_pred['pred_graph'] == 1
final_submission_pred['final_pred'] = final_submission_pred['pred_text'].mask(graph_hit, 1)
final_submission_pred['final_pred'] = final_submission_pred['final_pred'].astype('int')

# Submission file: one row per test id with the combined prediction as `label`
submission = final_submission_pred.loc[:, ['id', 'final_pred']].rename(columns={'final_pred': 'label'})
submission.to_csv('submission3.csv', index=False)