# Input Data for Training a Model

### Time to prepare the documents to train the model.
#### Every group of document are separate by language and label. I am going to unify the languages to make three different models, one per language. So, the input must be in the same language and with the specific label.

In [1]:
# Imports
import pandas as pd
import numpy as np
import re
import glob, os


import spacy
from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS_en = STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS
STOP_WORDS_fr = STOP_WORDS

# 0. Preparing Data
In this part we are going to read the documents in the folders and join then in the same dataframe with the different labels for the training model. Because we are going to create 3 different models -one for each language- we are going to separate the dataframes because of the language.

In [90]:
def union_df(path, label):
    df_all = pd.DataFrame()
    for f in glob.glob(path + '/*.txt'):
        doc = [open(f, encoding='utf-8').read()]
        df = pd.DataFrame(doc, columns=['text'])
        df['label'] = label
        #df['file_name'] = re.findall('.+\/(.+\.txt)', f)
        df = df[['label', 'text']]
        df_all = pd.concat([df_all,df], ignore_index = True)
    return df_all

In [91]:
label1 = '__label__APR'
label2 = '__label__Conference_papers'
label3 = '__label__PAN11'
label4 = '__label__Wikipedia'

# 1. English

In [92]:
path1 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/APR/en'

path2 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/Conference_papers/en'

path3 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/PAN11/en'

path4 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/Wikipedia/en'


In [93]:
df_en1 = union_df(path1,label1)


In [94]:
df_en2 = union_df(path2,label2)


In [95]:
df_en3 = union_df(path3,label3)


In [96]:
df_en4 = union_df(path4,label4)

In [97]:
df_en = pd.concat([df_en1,df_en2, df_en3, df_en4], ignore_index = True)
df_en

Unnamed: 0,label,text
0,__label__APR,"i read this book because in my town, everyone ..."
1,__label__APR,recipes appreciated by the family (small and l...
2,__label__APR,i say no to ease ..... and not to the author w...
3,__label__APR,milady has found a good vein: anita blake. bas...
4,__label__APR,"460 bc, somewhere in greece: ""gentlemen, i dec..."
...,...,...
9644,__label__Wikipedia,"Bupyeong-gu, Incheon. | location_country =..."
9645,__label__Wikipedia,Freedom Call is a German power metal band form...
9646,__label__Wikipedia,majesty|consortname = Paola Ruffo di Calabriat...
9647,__label__Wikipedia,Sertã (pron. ) is a municipality in Portugal ...


In [98]:
len(df_en)

9649

In [101]:
df_en.to_csv('/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/training_data/english.txt', sep=' ', header= False, index=False)

In [105]:
df2 = pd.read_csv('/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/training_data/english.txt', sep= ' ', header=None)
df2

Unnamed: 0,0,1
0,__label__APR,"i read this book because in my town, everyone ..."
1,__label__APR,recipes appreciated by the family (small and l...
2,__label__APR,i say no to ease ..... and not to the author w...
3,__label__APR,milady has found a good vein: anita blake. bas...
4,__label__APR,"460 bc, somewhere in greece: ""gentlemen, i dec..."
...,...,...
9644,__label__Wikipedia,"Bupyeong-gu, Incheon. | location_country =..."
9645,__label__Wikipedia,Freedom Call is a German power metal band form...
9646,__label__Wikipedia,majesty|consortname = Paola Ruffo di Calabriat...
9647,__label__Wikipedia,Sertã (pron. ) is a municipality in Portugal ...


# 2. French

In [69]:
path1 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/APR/fr'

path2 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/Conference_papers/fr'

path4 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/Wikipedia/fr'

In [70]:
df_fr1 = union_df(path1,label1)
df_fr2 = union_df(path2,label2)
df_fr4 = union_df(path4,label4)
df_fr = pd.concat([df_fr1, df_fr2, df_fr4], ignore_index = True)
df_fr

Unnamed: 0,text,label
0,"J'avais beaucoup aimé les premiers albums du ""...",APR
1,Je me joins aux commentaires peu satisfaits......,APR
2,"À sa parution en 1979, ce livre n'a pas rencon...",APR
3,Je découvre Douglas Kennedy et j'aimerais que ...,APR
4,J'ai acheté ce livre à la lecture des commenta...,APR
...,...,...
7710,", Nuremberg |années actives = depuis 1998 |gen...",Wikipedia
7711,Une cellule polyploïde (du grec : πολλαπλόν - ...,Wikipedia
7712,", George W. Bush et Albert II le .]]La reine ...",Wikipedia
7713,Sertã est une petite ville portugaise de 5 50...,Wikipedia


In [71]:
len(df_fr)

7715

In [72]:
df_fr.to_csv('/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/training_data/french.csv', header= False, index=False)

# Spanish

In [76]:
path3 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/PAN11/es'

path4 = '/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/documents_challenge/Wikipedia/es'

In [77]:
df_es3 = union_df(path3,label3)
df_es4 = union_df(path4,label4)
df_es = pd.concat([df_es3, df_es4], ignore_index = True)
df_es

Unnamed: 0,text,label
0,"El primero dia pafamos aquellas Lagunas, i pa...",PAN11
1,"A la puesta del Sol, por vnos llanos, i entre...",PAN11
2,"\n\nLa Tierra, por la maior parte, defde donde...",PAN11
3,\n\nCAP. XXXVI. De como hecimos hacer Iglesias...,PAN11
4,¡Asombra el imaginar lo que hubiera dado este...,PAN11
...,...,...
4992,| fecha_de_fallecimiento = 8 de enero de 1981|...,Wikipedia
4993,Red Hat es la compañía responsable de la creac...,Wikipedia
4994,Bashkortostán (en ruso: Республика Башкортоста...,Wikipedia
4995,|zona=Polinesia |hablantes=165.000 (censo 200...,Wikipedia


In [78]:
len(df_es)

4997

In [79]:
df_es.to_csv('/Users/Natalio/Desktop/nlp_associate_ds_test/NLP_Associate_DS_Test/data/training_data/spanish.csv', header= False, index=False)

In [242]:
nlp = spacy.load('en')
stopwords = list(STOP_WORDS)

In [334]:
def spacy_tokenizer(sentence):

    tokens = nlp(sentence)
    
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        
        if lemma not in STOP_WORDS and word.lemma_ != "-PRON-"and re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)

    return filtered_tokens


In [356]:
def spacy_similarity(doc1, doc2):
    sim_doc1 = spacy_tokenizer(doc1)
    sim_doc2 = spacy_tokenizer(doc2)
    result = nlp(' '.join(sim_doc1)).similarity(nlp(' '.join(sim_doc2)))
    return result


In [357]:
spacy_similarity(doc1,doc2)

  after removing the cwd from sys.path.


0.8956579465399698

In [340]:
doc = df_all['text'].values
len(doc)

3600

In [360]:
rows_length = len(df_all.index)
rows_length

3600

In [367]:
for i in range(len(df_all.index)):
    #doc1 = df_all['text'][i].values
    for j in range(len(df_all.index)):
        #doc2 = df_all['text'][i+j].values
        df_all['sim'+str(i)] = i+j
    #df_all['text'].apply(spacy_similarity()
    
    
    

In [368]:
df_all

Unnamed: 0,text,file_name,sim0,sim1,sim2,sim3,sim4,sim5,sim6,sim7,...,sim3590,sim3591,sim3592,sim3593,sim3594,sim3595,sim3596,sim3597,sim3598,sim3599
0,"i read this book because in my town, everyone ...",apr-book-0-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
1,recipes appreciated by the family (small and l...,apr-book-1-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
2,i say no to ease ..... and not to the author w...,apr-book-10-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
3,milady has found a good vein: anita blake. bas...,apr-book-100-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
4,"460 bc, somewhere in greece: ""gentlemen, i dec...",apr-book-1000-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,i really wonder what is yet to coldplay. i thi...,apr-music-991-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
3596,i'm pretty disappointed that it lacks the song...,apr-music-992-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
3597,"i bought the cd and dvd, it is complemented pe...",apr-music-995-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
3598,"good but after the disappointing ""follow the l...",apr-music-996-en.txt,3599,3600,3601,3602,3603,3604,3605,3606,...,7189,7190,7191,7192,7193,7194,7195,7196,7197,7198
