In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cohenintextorg2/Chohen intext org.xlsx
/kaggle/input/d1intextorg1/Arshad D1.xlsx


In [5]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

In [6]:
# Read the citation-context spreadsheet into a DataFrame and preview its first rows
data = pd.read_excel(
    '/kaggle/input/cohenintextorg2/Chohen intext org.xlsx'
)
data.head()

Unnamed: 0,CitingPaperId,CitedPaperId,String,Label
0,1872080baa7d30ec8fb87be9a65358cd3a7fb649,894be9b4ea46a5c422e81ef3c241072d4c73fdc0,"However, how frataxin interacts with the Fe-S ...",0
1,ce1d09a4a3a8d7fd3405b9328f65f00c952cf64b,b6642e19efb8db5623b3cc4eef1c5822a6151107,"In the study by Hickey et al. (2012), spikes w...",0
2,d9f3207db0c79a3b154f3875c9760cc6b056904b,2cc6ff899bf17666ad35893524a4d61624555ed7,By clustering with lowly aggressive close kin ...,0
3,88b86556857f4374842d2af2e359576806239175,a5bb0ff1a026944d2a47a155462959af2b8505a8,Ophthalmic symptoms are rare manifestations of...,0
4,df2f5d253798a83b31b1df8d4a343bdcdfeb492b,d91f4ce0487619e1ff3f30facd959e2530bde365,Recent studies identified Wee1 as a potential ...,0


In [7]:
# Only the sentence text and its label are used further on, so discard both paper-id columns
data = data.drop(columns=['CitingPaperId', 'CitedPaperId'])

In [8]:
# Preview the frame again to confirm which columns remain after the drop
data.head()

Unnamed: 0,String,Label
0,"However, how frataxin interacts with the Fe-S ...",0
1,"In the study by Hickey et al. (2012), spikes w...",0
2,By clustering with lowly aggressive close kin ...,0
3,Ophthalmic symptoms are rare manifestations of...,0
4,Recent studies identified Wee1 as a potential ...,0


# Splitting the data

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['String'],
    data['Label'],
    test_size=0.2,
    random_state=42,
)

#  Preprocess the text data

In [10]:
# Preprocess the text data
stop_words = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character in one pass
_punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise one raw text value into a space-joined string of content tokens.

    The text is lower-cased, stripped of ASCII punctuation, tokenised with
    ``word_tokenize`` and filtered against the English stop-word set.

    Parameters
    ----------
    text : str or missing value
        Raw sentence. A missing value (``None``/NaN, e.g. from an empty
        spreadsheet cell) is treated as an empty string; any other
        non-string value is converted with ``str``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # Non-string cells have no .lower(); map missing values to '' instead of crashing
        text = '' if pd.isna(text) else str(text)
    text = text.lower().translate(_punctuation_table)
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(tokens)


X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)


In [11]:
# Tokenize the text data
# Learn the word -> integer-id vocabulary from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# One more than the number of distinct words seen during fitting
vocab_size = 1 + len(tokenizer.word_index)

# Replace every text (train and test) with its list of word ids
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [103]:
# Pad the sequences to a fixed length
# Every sequence is brought to exactly this many ids; padding is appended at the end
max_length = 300
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (X_train, X_test)
)

# Training the Word2Vec model and setting the window size

In [101]:
from gensim.models import Word2Vec

# By this point X_train contains integer word ids (and possibly zero padding),
# not text. Calling str() on a row and splitting it would train Word2Vec on
# number-like strings, so no real word would later be found in w2v_model.wv
# when the embedding matrix is filled. Map the ids back to their words first;
# ids without a vocabulary entry (such as the padding id 0) are left out.
sentences = tokenizer.sequences_to_texts(X_train)

# Split each recovered sentence into its list of word tokens
split_sentences = [sentence.split() for sentence in sentences]

# Train the Word2Vec model on the word-level sentences
w2v_model = Word2Vec(split_sentences, vector_size=300, window=20, min_count=5, workers=4)


In [102]:
# Create a weight matrix for the embedding layer
# Row r receives the Word2Vec vector of the word whose tokenizer id is r;
# words that Word2Vec does not know keep their all-zero row
embedding_matrix = np.zeros((vocab_size, 300))
known_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in known_vectors:
        continue
    embedding_matrix[row] = known_vectors[token]

# Balancing the dataset

In [104]:
from imblearn.over_sampling import SMOTE

# Oversample the training split so that both labels end up with the same
# number of rows; the fixed random_state makes the resampling repeatable.
# NOTE(review): X_train is a matrix of padded word ids here. SMOTE presumably
# synthesises rows by interpolating between existing ones, which would give
# ids unrelated to the original words — confirm this is intended, or consider
# oversampling by duplication / class weights instead.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


# Check the balance of the dataset after applying SMOTE

In [105]:
# Check the balance of the dataset after applying SMOTE
# Count how many resampled training rows carry each label
unique, counts = np.unique(y_train_smote, return_counts=True)
class_counts = {label: count for label, count in zip(unique, counts)}
print("Class Counts after SMOTE:", class_counts)

Class Counts after SMOTE: {0: 3613, 1: 3613}


# Define the deep learning models

In [106]:
# Define the CNN model
cnn_model = Sequential([
    # Frozen embedding initialised from the Word2Vec weight matrix
    Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False),
    # Two convolution + max-pooling stages over the token dimension
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Flatten(),
    Dense(128, activation='relu'),
    # Single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])

cnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


# Define the GRU model

# Import GRU from tensorflow.keras, the same package this notebook takes
# Sequential, Embedding and Dense from, so every layer of the model comes
# from one Keras implementation.
from tensorflow.keras.layers import GRU

# Frozen Word2Vec embedding -> single GRU layer -> sigmoid output for the binary label
gru_model = Sequential()
gru_model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
gru_model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
gru_model.add(Dense(1, activation='sigmoid'))

gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Define LSTM Model

# Import LSTM from tensorflow.keras, the same package this notebook takes
# Sequential, Embedding and Dense from, so every layer of the model comes
# from one Keras implementation.
from tensorflow.keras.layers import LSTM

# Define the LSTM model:
# frozen Word2Vec embedding -> single LSTM layer -> sigmoid output for the binary label
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Running the models

In [107]:
# Shared training settings: 10 epochs, mini-batches of 32, and the held-out
# test split passed as validation data
fit_kwargs = dict(epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Train the CNN model
cnn_model.fit(X_train_smote, y_train_smote, **fit_kwargs)
# Train the GRU model
gru_model.fit(X_train_smote, y_train_smote, **fit_kwargs)
# Train the LSTM model
lstm_model.fit(X_train_smote, y_train_smote, **fit_kwargs)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6102fe20b0>

Checking performance on different window sizes

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, GRU, LSTM
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec

# Load the data
data = pd.read_excel(
    '/kaggle/input/cohenintextorg2/Chohen intext org.xlsx'
)
# 80/20 train/test split of the sentences and their labels, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    data['String'],
    data['Label'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Preprocess the text data
stop_words = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character in one pass
_punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise one raw text value into a space-joined string of content tokens.

    The text is lower-cased, stripped of ASCII punctuation, tokenised with
    ``word_tokenize`` and filtered against the English stop-word set.

    Parameters
    ----------
    text : str or missing value
        Raw sentence. A missing value (``None``/NaN, e.g. from an empty
        spreadsheet cell) is treated as an empty string; any other
        non-string value is converted with ``str``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # Non-string cells have no .lower(); map missing values to '' instead of crashing
        text = '' if pd.isna(text) else str(text)
    text = text.lower().translate(_punctuation_table)
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(tokens)


X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

# Tokenize and pad the sequences
# Fixed length every id sequence is padded (at the end) or cut to
max_length = 300

# Fit the vocabulary on the training texts only, then encode both splits as id sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

# One more than the number of distinct words seen during fitting
vocab_size = len(tokenizer.word_index) + 1

# Apply SMOTE to balance the dataset.
# The resampler is seeded (same seed as the train/test split) so that the
# window-size comparison below runs on identical training data every time
# the notebook is re-executed.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)




In [7]:
# Evaluate different window sizes
window_sizes = [2, 3, 5, 8, 10, 12]
results = {'Window': [], 'Model': [], 'Accuracy': [], 'Precision': []}

# X_train holds padded integer word ids at this point, so str(row).split()
# would feed number-like strings to Word2Vec and none of the tokenizer's real
# words would later be found in w2v_model.wv. Map the ids back to words
# instead; this does not depend on the window size, so it is done once here.
sentences = tokenizer.sequences_to_texts(X_train)
split_sentences = [sentence.split() for sentence in sentences]

for window_size in window_sizes:
    print(f"Training Word2Vec with window size: {window_size}")

    # Train the Word2Vec model with the current context window
    w2v_model = Word2Vec(split_sentences, vector_size=300, window=window_size, min_count=5, workers=4)

    # Create embedding matrix: row i is the vector of the word with tokenizer id i,
    # words unknown to Word2Vec keep an all-zero row.
    # Note: w2v_model and embedding_matrix are overwritten on every iteration,
    # so only those of the last window size remain once the loop has finished.
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

Training Word2Vec with window size: 2
Training Word2Vec with window size: 3
Training Word2Vec with window size: 5
Training Word2Vec with window size: 8
Training Word2Vec with window size: 10
Training Word2Vec with window size: 12


In [15]:
# Define models
def define_cnn():
    """Build and compile the CNN classifier.

    Reads the module-level ``vocab_size``, ``embedding_matrix`` and
    ``max_length`` at call time.

    Returns
    -------
    A compiled Sequential model: frozen embedding, two Conv1D/MaxPooling1D
    stages, a dense layer and a single sigmoid output.
    """
    network = Sequential([
        Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

def define_gru():
    """Build and compile the GRU classifier.

    Reads the module-level ``vocab_size``, ``embedding_matrix`` and
    ``max_length`` at call time.

    Returns
    -------
    A compiled Sequential model: frozen embedding, one GRU layer with
    dropout and a single sigmoid output.
    """
    network = Sequential([
        Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False),
        GRU(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

def define_lstm():
    """Build and compile the LSTM classifier.

    Reads the module-level ``vocab_size``, ``embedding_matrix`` and
    ``max_length`` at call time.

    Returns
    -------
    A compiled Sequential model: frozen embedding, one LSTM layer with
    dropout and a single sigmoid output.
    """
    network = Sequential([
        Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [22]:
# Train and evaluate the CNN, GRU and LSTM once for every Word2Vec window size.
#
# The define_* builders read the module-level `embedding_matrix` when they are
# called, so the matrix must be rebuilt for the current window size *before*
# the models are built. Running the training outside of a window-size loop
# would only ever evaluate the last window and leave one row per model in
# `results`, which is not enough to compare window sizes.

# Start from an empty table so that re-running this cell does not duplicate rows
results = {'Window': [], 'Model': [], 'Accuracy': [], 'Precision': []}
model_builders = {'CNN': define_cnn, 'GRU': define_gru, 'LSTM': define_lstm}
trained_models = {}

# X_train holds integer word ids; map them back to word tokens so that
# Word2Vec learns vectors for the same words the tokenizer knows about
word_sentences = [text.split() for text in tokenizer.sequences_to_texts(X_train)]

for window_size in window_sizes:
    # Word vectors for this context window
    w2v_model = Word2Vec(word_sentences, vector_size=300, window=window_size, min_count=5, workers=4)

    # Row i is the vector of the word with tokenizer id i; unknown words stay zero
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

    for model_name, build_model in model_builders.items():
        # A fresh model per (window, architecture) pair, trained on the balanced data
        model = build_model()
        model.fit(X_train_smote, y_train_smote, epochs=5, batch_size=32, verbose=0)

        # Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
        y_pred = (model.predict(X_test) > 0.5).astype("int32")

        results['Window'].append(window_size)
        results['Model'].append(model_name)
        results['Accuracy'].append(accuracy_score(y_test, y_pred))
        results['Precision'].append(precision_score(y_test, y_pred))
        trained_models[model_name] = model

# Keep the previous variable names available; they refer to the models
# trained for the last window size
cnn_model = trained_models.get('CNN')
gru_model = trained_models.get('GRU')
lstm_model = trained_models.get('LSTM')



In [None]:
# Convert results to DataFrame for easy charting
results_df = pd.DataFrame(results)

# Plotting results: one solid accuracy line and one dashed precision line per model
fig, ax = plt.subplots(figsize=(12, 6))
for model in ['CNN', 'GRU', 'LSTM']:
    # Sort by window so the line is drawn left to right regardless of row order
    model_data = results_df[results_df['Model'] == model].sort_values('Window')
    # Markers keep a series visible even when it contains a single window size,
    # where a bare line would draw nothing
    ax.plot(model_data['Window'], model_data['Accuracy'], marker='o', label=f'{model} Accuracy')
    ax.plot(model_data['Window'], model_data['Precision'], marker='o', linestyle='--', label=f'{model} Precision')

ax.set_xlabel('Window Size')
ax.set_ylabel('Score')
ax.set_title('Accuracy and Precision by Window Size for Different Models')
ax.legend()
ax.grid(True)
plt.show()