# TP6: Catégorisation de documents texte
###### _Nous nous sommes servis des fichiers présents sur le Moodle, ceci est notre code pour la génération de nos propres vecteurs de page_

In [230]:
import pandas as pd

### Importation du csv

In [231]:
df = pd.read_csv("simplewiki.csv", encoding='utf-8')

In [232]:
df["text"]

0         A '''demo''' is a small portion or sample of a...
1         {{Commonscat|Islands of North America}}\n\n[[C...
2         '''Mirrors and forks of Wikipedia''' are publi...
3         {{italictitle}}\n\n'''''Buck the World''''' is...
4         {{multiple issues|\n{{BLP unsourced|date=Augus...
                                ...                        
234020                       #REDIRECT [[March 24]]\n      
234021                            #REDIRECT [[NBC]]\n      
234022    [[Category:Arabic language|Words]]\n[[category...
234023    '''Slab Fork''' is an [[Unincorporated area|un...
234024    {{Infobox F1 team\n| Constructor_name   = {{fl...
Name: text, Length: 234025, dtype: object

#### Récupérer des textes avec une liste de titres donnée

In [237]:
getPages = lambda pages_to_search: [df['text'][df['title'] == page].item() for page in pages_to_search if len(df['text'][df['title'] == page]) > 0] 

In [238]:
pages_to_show = ['List of Nobel Prize winners in Chemistry',
                'List of Nobel Prize winners in Economics',
                'List of Nobel Prize winners in Literature',
                'List of Nobel Prize winners in Physics',
                'List of Nobel Prize winners in Physiology or Medicine']

data_text = getPages(pages_to_show)



  """Entry point for launching an IPython kernel.


In [239]:
len(data_text)

5

#### Récolte des différentes références dans chacune des listes de Prix Nobel

In [240]:
import re

all_valid_references = []

for text in data_text:
    all_valid_references.append(getPages([s.replace("[[","").replace("]]","") for s in re.findall(r'\[\[.*?\]\]', text)]))


  """Entry point for launching an IPython kernel.


91


In [248]:
len(all_valid_references)

5

#### Choix aléatoire de 25 des références par liste

In [249]:
import random

data_references = []

for list_references in all_valid_references:
    #Pour chaque liste, on prend 25 références aléatoirement
    data_references.extend(random.sample(list_references,25))
    

In [250]:
len(data_references)

125

##### Construction des vecteurs avec TF-IDF

In [251]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [252]:
tfidf = tfidf_vectorizer.fit_transform(data_references)

In [253]:
print(tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.04744128 0.01059616 0.00275113 ... 0.         0.00275113 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


#### On met la matrice de TF-IDF en DataFrame

In [254]:
df = pd.DataFrame(tfidf.toarray())

##### Fonction d'ajout de la colonne de classe

In [259]:
def add_class_column(df, nb_elts_per_class):
    classes = []
    
    for classe in ['chemistry', 'economics','literature','physics','medicine']:
        classes.extend([classe]*nb_elts_per_class)
    
    df["Classes"] = classes 
    
    df.columns = [f'f{title+1}' if type(title) == int else title for title in df.columns]

#### Ajout de la colonne pour la classe

In [264]:
add_class_column(df, 25)

In [265]:
df #Vérification

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f9712,f9713,f9714,f9715,f9716,f9717,f9718,f9719,f9720,Classes
0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,chemistry
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,chemistry
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,chemistry
3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,chemistry
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,chemistry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,medicine
121,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,medicine
122,0.047441,0.010596,0.002751,0.002751,0.0,0.0,0.0,0.048682,0.002751,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002751,0.0,medicine
123,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.013161,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,medicine


#### Exportation des données d'apprentissage

In [267]:
df.to_csv('train.csv', index=False, header=True)

#### 8. Mini jeu de données test non contenues dans les données d'apprentissage

In [270]:
unused_references = []

for i, list_references in enumerate(all_valid_references):
    #Pour chaque liste, on prend 5 références au hasard qui n'ont pas été tirées précédemment
    unused_references.extend(random.sample([ref for ref in list_references if ref not in data_references[25*i:25*(i+1)]], 5))

print(len(unused_references))

25


#### Construction des vecteurs des pages de test

In [271]:
tfidf = tfidf_vectorizer.fit_transform(unused_references)

In [272]:
print(tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.05880422 0.         ... 0.         0.         0.07533075]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.01225341 0.04161426 0.01776989 ... 0.         0.         0.        ]]


##### Ajout de la colonne des classes

In [273]:
df = pd.DataFrame(tfidf.toarray())
add_class_column(df, 5)

In [274]:
df #Vérification

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f1928,f1929,f1930,f1931,f1932,f1933,f1934,f1935,f1936,Classes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.041085,0.0,0.038517,0.0,...,0.036293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chemistry
1,0.0,0.0,0.0,0.0,0.130333,0.0,0.0,0.0,0.0,0.0,...,0.0,0.041658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chemistry
2,0.0,0.0,0.0,0.0,0.021318,0.0,0.0,0.0,0.0,0.0,...,0.0,0.040883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chemistry
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chemistry
4,0.034149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036242,0.0,...,0.034149,0.026539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,chemistry
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.023046,0.0,0.0,0.0,0.0,0.0,0.0,0.043005,economics
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011547,0.0,0.0,0.018064,0.0,0.0,0.0,0.0,economics
7,0.099882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.030358,0.0,0.0,0.0,0.0,0.0,0.0,economics
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.040151,0.0,0.0,economics
9,0.029233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011359,0.0,0.0,0.017769,0.021196,0.0,0.042393,0.0,economics


#### Exportation du jeu de données de test

In [275]:
df.to_csv('test.csv', index=False, header=True)

#### Lecture des données d'entraînement et de test

In [64]:
train, test = pd.read_csv('nobel.train.csv', encoding='utf-8'), pd.read_csv('nobel.test.csv', encoding='utf-8')

### Utilisation du classificateur J48

In [1]:
import weka.core.jvm as jvm
from weka.classifiers import Classifier


cls = Classifier(classname="weka.classifiers.trees.J48")


ModuleNotFoundError: No module named 'weka.core'