In [21]:
#! pip install nltk
#!pip install textstat
#!pip install cupy cuml



## Read in the data

In [22]:
import pandas as pd


# Train: Medium + Hard
df_train = pd.read_csv("FINAL-DATA/train_balanced.csv")
df_train = df_train.loc[df_train['difficulty'].isin(['medium', 'hard'])]
print(len(df_train))

# Validation: Medium + Hard
df_val = pd.read_csv("FINAL-DATA/validation_balanced.csv")
df_val = df_val.loc[df_val['difficulty'].isin(['medium', 'hard'])]
print(len(df_val))

df_test = pd.read_csv("FINAL-DATA/test.csv")
print(len(df_test))

df_test_easy = df_test[df_test["difficulty"] == "easy"]
df_test_medium = df_test[df_test["difficulty"] == "medium"]
df_test_hard = df_test[df_test["difficulty"] == "hard"]

36654
5122
5595


In [31]:
df_train.reset_index(inplace= True)
df_test_medium.reset_index(inplace= True)
df_test_hard.reset_index(inplace= True)

## Functions for extracting features and calculating similarity

In [24]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from nltk.metrics.distance import jaccard_distance
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


#Preprocessing, required for some features
def preprocess_text(text):
    # Tokenize text into words
    words = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return words


#Feature 1 = word frequency, dict
def extract_word_freq_features(paragraph):
    # Preprocess the paragraph
    words = preprocess_text(paragraph)
    
    # Count word frequencies
    word_freq = Counter(words)
    
    return word_freq


#Feature 2 = tfidf, float
def extract_tfidf_features(paragraph1, paragraph2):
    # Create TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
    
    # Fit and transform paragraphs
    tfidf_matrix = tfidf_vectorizer.fit_transform([paragraph1, paragraph2])
    
    # Calculate cosine similarity
    similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    
    return similarity_score



# Feature 3 = avg_sent_len, int
# Feature 4 = punctuation counts, dict
def extract_sentence_structure_features(paragraph):
    # Tokenize the paragraph into sentences
    sentences = sent_tokenize(paragraph)
    avg_sentence_length = sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences)
    punctuation_counts = Counter(token for sent in sentences for token in word_tokenize(sent) if token in (',', '.', '!', '?'))
    
    return avg_sentence_length, punctuation_counts

# Feature 5 = pos_tags
def extract_pos_tag_features(paragraph):
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph)
    
    # Get POS tags
    pos_tags = pos_tag(words)
    
    # Count POS tag frequencies
    pos_tag_freq = Counter(tag for word, tag in pos_tags)
    
    return pos_tag_freq

# Feature 6 = Reading Ease
def extract_reading_ease(paragraph):
    flesch_reading_ease = textstat.flesch_reading_ease(paragraph)
    return flesch_reading_ease

# Feature 7 = whole tfidf
def create_tfidf_matrix(series):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(series)
    feature_names = vectorizer.get_feature_names_out()
    df_sparse = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=feature_names)
    return df_sparse

def tfidf_similarities(df1, df2):
    '''
    cos_sims = []
    for i in range(100):
        cos_sim = cosine_similarity(np.vstack([tfidf_matrix_1_filtered.iloc[i], tfidf_matrix_2_filtered.iloc[i]]), dense_output=False)
        cos_sims.append(cos_sim[0][1])
    '''
    norms1 = np.linalg.norm(df1, axis=1)
    norms2 = np.linalg.norm(df2, axis=1)

    # Ensure no division by zero
    epsilon = 1e-10
    norms1 = np.maximum(norms1, epsilon)
    norms2 = np.maximum(norms2, epsilon)

    # Calculate the dot product of corresponding rows
    dot_products = np.einsum('ij,ij->i', df1, df2)

    # Calculate the cosine similarity
    cosine_similarity_vector = dot_products / (norms1 * norms2)
    return cosine_similarity_vector

# Functions for computing similarity between paragraphs
def compare_numerical_features(feature1, feature2):
    # Compute cosine similarity between feature vectors
    return 1 - cosine(feature1, feature2)

def compare_categorical_features(feature1, feature2):
    # Compute Jaccard similarity between sets
    if len(set(feature1.keys())) == 0 and len(set(feature2.keys())) == 0:
        return 0
    return 1 - jaccard_distance(set(feature1.keys()), set(feature2.keys()))  
    

def extract_similarity(para_1, para_2):
    # Word freq (jac_sim)
    word_freq_features1 = extract_word_freq_features(para_1)
    word_freq_features2 = extract_word_freq_features(para_2)
    
    jaccard_similarity_word_freqs = compare_categorical_features(word_freq_features1, word_freq_features2)

    # tfidf (cos_sim)
    tf_idf_similarity_score = extract_tfidf_features(para_1, para_2)

    # Sent structure:
    avg_sentence_length_1, punctuation_counts_1 = extract_sentence_structure_features(para_1)
    avg_sentence_length_2, punctuation_counts_2 = extract_sentence_structure_features(para_2)

    #sent len (proportion):
    sent_len_diff = (avg_sentence_length_2/avg_sentence_length_1 if avg_sentence_length_1 != 0 else 0)

    #punctuation difference (jac_sim):
    jaccard_similarity_punct = compare_categorical_features(punctuation_counts_1, punctuation_counts_2)

    #pos_tag (jac_sim):
    pos_tag_features_1 = extract_pos_tag_features(para_1)
    pos_tag_features_2 = extract_pos_tag_features(para_2)
    jaccard_similarity_punct = compare_categorical_features(pos_tag_features_1, pos_tag_features_2)


    return jaccard_similarity_word_freqs, tf_idf_similarity_score, sent_len_diff, jaccard_similarity_punct, jaccard_similarity_punct

def extract_ease_sim(para1, para2):
    ease_1 = extract_reading_ease(para1)
    ease_2 = extract_reading_ease(para2)

    return ease_2 - ease_1



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\milos\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Calculating features

In [25]:
def calculate_base_features(df):
    #df_100 = df.head(100)
    df['f1'] = pd.Series()
    df['f2'] = pd.Series()
    df['f3'] = pd.Series()
    df['f4'] = pd.Series()
    df['f5'] = pd.Series()
    df['f6'] = pd.Series()

    f1 = []
    f2 = []
    f3 = []
    f4 = []
    f5 = []
    f6 = []

    for i in range(len(df)):
        features_in_row = list(extract_similarity(df["paragraph1"][i], df["paragraph2"][i]))
        f1.append(features_in_row[0])
        f2.append(features_in_row[1])
        f3.append(features_in_row[2])
        f4.append(features_in_row[3])
        f5.append(features_in_row[4]) 
        f6_val  = extract_ease_sim(df["paragraph1"][i], df["paragraph2"][i])
        f6.append(f6_val)

    df['f1'] = f1
    df['f2'] = f2
    df['f3'] = f3
    df['f4'] = f4
    df['f5'] = f5
    df['f6'] = f6

    return df


### Tfidf on whole dataset

In [26]:
def calculate_tfidf_col(df):
    tfidf_matrix_1 = create_tfidf_matrix(df["paragraph1"])
    tfidf_matrix_2 = create_tfidf_matrix(df["paragraph2"])

    common_columns = tfidf_matrix_1.columns.intersection(tfidf_matrix_2.columns)


    #tfidf_matrix_1 = tfidf_matrix_1.head(10000)
    #tfidf_matrix_2 = tfidf_matrix_2.head(10000)

    tfidf_matrix_1_filtered = tfidf_matrix_1[common_columns].astype(np.float16)
    tfidf_matrix_2_filtered = tfidf_matrix_2[common_columns].astype(np.float16)


    cosine_similarities = tfidf_similarities(tfidf_matrix_1_filtered, tfidf_matrix_2_filtered)

    df['f7'] = cosine_similarities

    df['f7'] = df['f7'].fillna(0) 

    return df


In [27]:
def calculate_all_features(df):
    calculate_base_features(df)
    calculate_tfidf_col(df)
    return df

In [28]:
test_features_dataframe = calculate_all_features(df_test_medium)
test_features_dataframe.to_csv("test_features_dataframe_medium.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['f1'] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['f2'] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['f3'] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

KeyboardInterrupt: 

In [32]:
test_features_dataframe_hard = calculate_all_features(df_test_hard)
test_features_dataframe_hard.to_csv("test_features_dataframe_hard.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['f1'] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['f2'] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['f3'] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

In [None]:
train_features_dataframe = calculate_all_features(df_train)
train_features_dataframe.to_csv("train_features_dataframe.csv")

  cosine_similarity_vector = dot_products / (norms1 * norms2)


### Balancing the dataset

In [None]:
print(train_features_dataframe['label'].value_counts())

label
0    18327
1    18327
Name: count, dtype: int64


In [None]:
#!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE
'''
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the new class distribution
print(pd.Series(y_resampled).value_counts())
'''

label
0    18327
1    18327
Name: count, dtype: int64


In [41]:
from sklearn.preprocessing import StandardScaler

def prep_data_for_model(train_df, test_df):
    X_train = train_df[["f1", "f2", "f3", "f4", "f5", "f6", "f7"]]
    y_train = train_df["label"]

    X_test = test_df[["f1", "f2", "f3", "f4", "f5", "f6", "f7"]]
    y_test = test_df["label"]


    scaler = StandardScaler()

    # Fit the scaler on the training data and transform it
    X_train_scaled = scaler.fit_transform(X_train)

    # Transform the test data
    X_test_scaled = scaler.transform(X_test)

    #df.head(10)[["f1", "f2", "f3", "f4", "f5"]]
    return [X_train_scaled, X_test_scaled, y_train, y_test]


### First model - linear regression

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def logistic_regression(data_list):
    X_train, X_test, y_train, y_test = data_list[0], data_list[1], data_list[2], data_list[3]
    model = LogisticRegression()

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model
    f1_score_reg = f1_score(y_test, predictions)
    return f1_score_reg

### Second model - SVM

In [46]:
from sklearn.svm import SVC

def svm(data_list, kernels = ['rbf']):
    X_train, X_test, y_train, y_test = data_list[0], data_list[1], data_list[2], data_list[3]
    kernel_scores = {}
    for kernel in kernels:
        svm_model = SVC(kernel= kernel)  
        svm_model.fit(X_train, y_train)

        y_pred = svm_model.predict(X_test)

        f1_score_svm = f1_score(y_test, y_pred)
        kernel_scores[kernel] = f1_score_svm

    return kernel_scores


In [51]:
print(f"logistic regression f1 score, medium: {logistic_regression(prep_data_for_model(train_features_dataframe, test_features_dataframe))}")
print(f"logistic regression f1 score, hard: {logistic_regression(prep_data_for_model(train_features_dataframe, test_features_dataframe_hard))}")

print(f"svm f1 score, medium: {svm(prep_data_for_model(train_features_dataframe, test_features_dataframe))}")
print(f"svm f1 score, hard: {svm(prep_data_for_model(train_features_dataframe, test_features_dataframe_hard))}")



logistic regression f1 score, medium: 0.6965648854961832
logistic regression f1 score, hard: 0.545751633986928
svm f1 score, medium: {'rbf': 0.773851590106007}
svm f1 score, hard: {'rbf': 0.5743695316520844}
