In [2]:
### download necessary data
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec


# Fetch every NLTK resource this notebook relies on so that it runs on a
# fresh environment (Restart Kernel -> Run All).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# `nltk.word_tokenize` (used by the cleaning and training cells) needs the
# Punkt tokenizer data; without it a fresh install raises LookupError.
# Older NLTK releases look for 'punkt', newer ones for 'punkt_tab', so both
# are requested; a resource that is unavailable is merely reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')
nltkStopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arkeshkalathiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arkeshkalathiya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/arkeshkalathiya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
import pandas as pd
import requests
import re


In [5]:
## text-cleaning helpers, then load and clean the data


def removeNumber(comment):
    """Return `comment` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub("", comment)

def removeDoubleDots(comment):
    """Return `comment` with every run of two or more consecutive dots deleted.

    A single dot is left untouched.
    """
    return re.sub(pattern=r'\.{2,}', repl="", string=comment)

def removeDoubleSpace(comment):
    """Collapse every run of two or more space characters into one space.

    Only the literal space character is matched; tabs and newlines are kept.
    """
    space_run = re.compile(r'[ ]{2,}')
    return space_run.sub(" ", comment)

def removeTalk(comment):
    """Delete every literal, lowercase "(talk)" marker from `comment`."""
    talk_marker = r'\(talk\)'
    cleaned = re.sub(talk_marker, "", comment)
    return cleaned

def removePunctuations(comment):
    """Delete every character that is neither a word character nor whitespace.

    Word characters (`\\w`) include letters, digits and the underscore, so
    those are kept.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', comment)

def removeNewLine(comment):
    """Replace each line-feed character in `comment` with a single space."""
    return re.sub(pattern=r'\n', repl=" ", string=comment)

def removeURLs(comment):
    """Return `comment` with every URL removed.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.`` (case-insensitive). Only the URL
    token itself is deleted; the text around it, including the rest of the
    line, is preserved.

    The pattern is deliberately not anchored to the start of a line: an
    anchored, greedy pattern would miss URLs embedded mid-sentence and would
    also swallow the remainder of any line that begins with a URL.
    """
    return re.sub(r'(?:https?://|www\.)\S+', '', comment, flags=re.IGNORECASE)


def lemmatize(comment):
    """Tokenize `comment`, lemmatize each token, and re-join with spaces.

    Tokens come from `nltk.word_tokenize`; each one is passed to
    `WordNetLemmatizer.lemmatize` with its default arguments. The result is
    the lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(comment))
    return " ".join(lemmas)


def processComment(comment):
    """Normalise one raw comment string into lowercase, lemmatized text.

    The steps are ordered so that each one still sees the text it is meant
    to match:

    1. URLs are stripped first, while the "://" and the line breaks that
       `removeURLs` matches on are still present (punctuation and newline
       removal would otherwise make URLs unrecognisable).
    2. Newlines become spaces, then "(talk)" markers, digits and runs of
       dots are removed, followed by all remaining punctuation.
    3. Runs of spaces are collapsed only after the removals above, since
       those removals are what leave gaps behind.
    4. The text is lowercased before lemmatization so that capitalised
       tokens are lemmatized the same way as their lowercase forms.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned, lowercased, lemmatized comment with tokens separated by
        single spaces.
    """
    comment = removeURLs(comment)
    comment = removeNewLine(comment)
    comment = removeTalk(comment)
    comment = removeNumber(comment)
    comment = removeDoubleDots(comment)
    comment = removePunctuations(comment)
    comment = removeDoubleSpace(comment)
    # Lowercase first: the lemmatizer is applied to the already-lowercased text.
    return lemmatize(comment.lower())


# Read the training CSV, then replace the `comment_text` column with the
# output of `processComment` applied to every row.
data = pd.read_csv('./data/train.csv')
data = data.assign(comment_text=data['comment_text'].apply(processComment))





In [6]:
# Train Word2Vec on the cleaned comments; each comment is tokenized with
# `nltk.word_tokenize` to form one training sentence.
model = Word2Vec(
    sentences=data['comment_text'].apply(nltk.word_tokenize),
    vector_size=100,
    window=5,
    min_count=1,
    workers=10,
)


In [8]:
# Persist only the word vectors (`model.wv`), not the whole Word2Vec model object.
# NOTE(review): despite the `.model` suffix, this file holds whatever
# `model.wv.save` writes (presumably gensim KeyedVectors) — confirm that
# downstream code loads it with the matching loader.
model.wv.save('./custom_embeddings.model')