# Importing packages 

In [None]:
# Data wrangling 
import pandas as pd 

# fastText embeddings
import fasttext

# Importing regex 
import re 

# Array math 
import numpy as np

# Reading the data 

In [None]:
# Loading the comments to be scored into a DataFrame; later cells read its
# 'text' and 'comment_id' columns
d = pd.read_csv("data/comments_to_score.csv")

In [None]:
# Reporting how many rows (comments) were loaded
print(f"Number of comments: {len(d)}")

In [None]:
# Displaying the column names of the loaded data
d.columns

# Loading the fasttext embeddings

In [None]:
# Loading the fastText model from its binary file; the file name suggests the
# English 300-dimensional vectors (a later cell relies on that dimension)
embeddings = fasttext.load_model('embeddings/cc.en.300.bin')

# Cleaning the text 

In [None]:
def clean_text(text: str) -> str:
    """
    Function to clean the text for embedding creation

    The text is lower-cased and every run of characters other than ASCII
    letters and digits (punctuation, newlines and other whitespace, accented
    or non-Latin characters) is collapsed into a single space. The result is
    not stripped, so a leading or trailing run of such characters is kept as
    one space.

    A missing value (None or a float NaN, which is how pandas represents an
    empty CSV cell) is treated as an empty comment and yields "".
    """
    # Missing cells arrive as None/NaN instead of str; NaN is the only float
    # that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Lowering
    text = text.lower()

    # A single pass is enough: newlines, punctuation, special characters and
    # repeated whitespace are all "not an ASCII letter or digit", so each run
    # of them becomes exactly one space
    return re.sub(r'[^A-Za-z0-9]+', ' ', text)

In [None]:
# Cleaning every raw comment and storing the result in a new column
d['clean_text'] = d['text'].map(clean_text)

# Creating the embedding representations of the sentences 

In [None]:
# Embedding every cleaned comment as one sentence vector
vectors = [embeddings.get_sentence_vector(text) for text in d['clean_text']]

# Stacking into a (number of comments, embedding dimension) matrix; the
# dimension is read from the loaded model instead of being hard-coded to 300,
# so swapping the embeddings file does not break the reshape
vectors = np.reshape(vectors, (len(d), embeddings.get_dimension()))

In [None]:
# Calculating the global mean of all the sentence vectors
global_mean = np.mean(vectors, axis=0)

# Euclidean distance between the global mean and every sentence vector,
# computed for all rows in one vectorized call and stored directly under its
# final name "score" (re-running the cell overwrites the column instead of
# adding a duplicate)
d['score'] = np.linalg.norm(vectors - global_mean, axis=1)

# Sorting ascending by score: comments closest to the mean come first
d.sort_values(by='score', inplace=True)

In [None]:
# Most "average" comments: the 20 rows with the smallest score, i.e. the
# sentence vectors closest to the global mean (d is sorted ascending by score)
d.head(20)

In [None]:
# Most unique comments: the 20 rows with the largest score, i.e. the
# sentence vectors farthest from the global mean
d.tail(20)

# Making the sample submission


In [None]:
# Writing only the comment id and its score to the submission file, without
# the DataFrame index
d[['comment_id', 'score']].to_csv("submission.csv", index=False)