# Data Mining Project

### Imports

In [74]:
from numpy import genfromtxt
import pandas as pd
import numpy as np
import operator, re, string, codecs, nltk
from statistics import mean
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from xml.dom import minidom
from string import punctuation
from enum import Enum
try:
    maketrans = ''.maketrans
except AttributeError:
    from string import maketrans
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from collections import Counter

# Test

# from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn import decomposition, ensemble

# import xgboost, textblob
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Program base

In [84]:
# Divisor applied to the number of items handled by preprocess():
# 1 = full set; 10 = only the first 1/10 of the set.
reduce_item_number = 900

def most_common_terms(terms):
    """Rank the words contained in ``terms`` by number of occurrences.

    Every entry of ``terms`` is split on single spaces and the resulting
    pieces are counted together.

    Returns a list of ``(word, count)`` tuples, most frequent first.
    """
    frequencies = Counter(
        piece for entry in terms for piece in entry.split(" ")
    )
    return list(frequencies.most_common())

class Type(Enum):
    """Selects how ArticlePredictorBase obtains its train and test sets."""
    # Train and test sets are read from two separate CSV files.
    REAL_CSV_SETS = 1
    # A single CSV file is read and split into train and test subsets.
    TEST_ON_TRAINING_SET = 2

class ArticlePredictorBase:
    """Load article file names and targets, then preprocess the articles.

    Depending on ``test_type`` the train and test sets are either read from
    two separate CSV files, or a single CSV file is read and split with
    ``train_test_split``. In both cases the second CSV column is used as X
    (article file names, resolved under ``project/project/data/``) and the
    third column as y.

    If loading fails, an error message is printed and ``self.type`` stays
    ``None``; ``train()`` and ``test()`` then raise ``RuntimeError``.
    """

    def __init__(self,test_type,path_to_csv_train,path_to_csv_test):
        """Load the data sets selected by ``test_type``.

        Args:
            test_type: a ``Type`` member choosing the loading strategy.
            path_to_csv_train: path of the training CSV (also the only file
                read for ``Type.TEST_ON_TRAINING_SET``).
            path_to_csv_test: path of the testing CSV; only read for
                ``Type.REAL_CSV_SETS``.
        """
        # Stays None unless every required CSV file was loaded successfully.
        self.type = None

        if test_type == Type.REAL_CSV_SETS:
            # Import of the test set
            test_columns = self._load_xy(
                path_to_csv_test, "Error while importing testing set csv file")
            if test_columns is not None:
                self.test_x, self.test_y = test_columns

            # Import of the train set
            train_columns = self._load_xy(
                path_to_csv_train, "Error while importing training set csv file")
            if train_columns is not None:
                self.train_x, self.train_y = train_columns

            if test_columns is not None and train_columns is not None:
                self.type = test_type

        elif test_type == Type.TEST_ON_TRAINING_SET:
            # Import of the combined train/testing set
            columns = self._load_xy(
                path_to_csv_train, "Error while importing the csv file")
            if columns is None:
                return

            X, y = columns
            self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(X, y)
            self.type = test_type
        else:
            print('Error type')

    @staticmethod
    def _load_xy(path_to_csv, error_message):
        """Read a CSV file and return its (second, third) columns, or None.

        On a read/parse failure or when the columns are missing,
        ``error_message`` is printed and ``None`` is returned.
        """
        try:
            frame = pd.read_csv(path_to_csv,delimiter=',')
            return frame.iloc[:,1], frame.iloc[:,2]
        except (OSError, ValueError, IndexError):
            # OSError: missing/unreadable file; ValueError: invalid path or
            # unparsable content; IndexError: fewer than three columns.
            print(error_message)
            return None

    def preprocess(self, set_x, set_y):
        """Compute per-article features for the first part of ``set_x``.

        Only the first ``len(set_x) // reduce_item_number`` articles are
        processed. Each article is read from its XML file, lowercased,
        stripped of digits and punctuation, tokenized, and filtered of
        English stop words.

        Args:
            set_x: iterable of article file names.
            set_y: iterable of the matching targets.

        Returns:
            A tuple ``(the_set, targets)`` where ``the_set`` is a DataFrame
            with the columns ``file_name`` and ``deleted_stop_words`` (token
            count before stop-word removal divided by the count after it;
            ``None`` for articles that were skipped or not processed), and
            ``targets`` is ``list(set_y)``.
        """
        file_names = list(set_x)
        ratios = [None] * len(file_names)
        # Loop-invariant translation table used to delete punctuation.
        punctuation_table = str.maketrans('','',string.punctuation)

        for i in range(len(file_names)//reduce_item_number):
            # Path to the file of this element
            file_path = 'project/project/data/'+str(file_names[i])
            # BODY element(s) of the file
            itemlist = minidom.parse(file_path).getElementsByTagName('BODY')

            if len(itemlist) != 1:
                print('Error: XML file invalid')
                if not itemlist:
                    # No BODY at all: nothing to process for this article.
                    continue

            body_nodes = itemlist[0].childNodes
            if not body_nodes: # The text is empty in the article
                continue

            text = body_nodes[0].nodeValue

            # Preprocess part
            text = text.lower() # Lowercase
            text = re.sub(r'\d+', '', text) # Deleting numbers
            text = text.translate(punctuation_table) # Deleting punctuation

            # Tokenization
            tokens = word_tokenize(text)
            before_stop = len(tokens)
            tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]

            if not tokens:
                # Every token was a stop word (or there was no token): the
                # ratio is undefined, so the feature is left as None.
                continue

            # Feature adding
            ratios[i] = before_stop/len(tokens)

            # Example to retrieve the most frequent terms:
            # print(most_common_terms(tokens))

            #TODO preprocess (compute more features and add them as fields of the set, like deleted_stop_words)

        the_set = pd.DataFrame()
        the_set['file_name'] = file_names
        # dtype=object keeps None for unprocessed rows instead of NaN.
        the_set['deleted_stop_words'] = pd.Series(ratios, dtype=object)

        return the_set, list(set_y)  #TODO return X with all the new fields

    def test(self):
        """Preprocess the test set.

        Raises:
            RuntimeError: if the data sets were not loaded successfully.
        """
        print("(Testing Launched)")
        if self.type is None:
            print("Initialisation error ")
            raise RuntimeError("Initialisation error")

        test_x, test_y = self.preprocess(self.test_x, self.test_y)
        #TODO testing

    def train(self):
        """Preprocess the training set and print the result.

        Raises:
            RuntimeError: if the data sets were not loaded successfully.
        """
        print("(Training Launched)")
        if self.type is None:
            print("Initialisation error ")
            raise RuntimeError("Initialisation error")

        print(str(self.train_x.size)+" elements in the training set")
        train_x, train_y = self.preprocess(self.train_x, self.train_y)
        print(train_x)
        print(train_y)

        #TODO training
      
# Alternative setup: separate train and test CSV files.
#predictor = ArticlePredictorBase(Type.REAL_CSV_SETS,"project/project/train.csv","project/project/test.csv")
# Current setup: read train.csv only and split it into train/test subsets.
predictor = ArticlePredictorBase(Type.TEST_ON_TRAINING_SET,"project/project/train.csv",None)
# Preprocess the training subset and print the resulting set and targets.
predictor.train()


(Training Launched)
3600 elements in the training set
     file_name deleted_stop_words
0     1799.xml               None
1     1465.xml            1.78947
2     4835.xml            1.44615
3     1280.xml            1.46875
4     2592.xml               None
5      803.xml               None
6     1887.xml               None
7     3063.xml               None
8      462.xml               None
9     3421.xml               None
10    4241.xml               None
11    3401.xml               None
12    3215.xml               None
13    4296.xml               None
14    2288.xml               None
15    3368.xml               None
16    3983.xml               None
17    2316.xml               None
18    4631.xml               None
19    3622.xml               None
20    1870.xml               None
21     102.xml               None
22     783.xml               None
23    3212.xml               None
24     148.xml               None
25     938.xml               None
26     331.xml              

## Processing a single file

Here we test the preprocessing steps on a single data file, `1.xml`.

### Importing the file

In [19]:
import sys  # required by sys.exit below; not among the notebook's visible imports

# Parse the article and collect its BODY element(s)
xmldoc = minidom.parse('project/project/data/1.xml')
itemlist = xmldoc.getElementsByTagName('BODY')

# Verification: exactly one BODY element is expected
if len(itemlist) != 1:
    print('Error: XML file invalid')
    # Non-zero status so the failure is not reported as a success.
    sys.exit(1)

# An empty BODY has no child node; fall back to an empty text in that case.
body_nodes = itemlist[0].childNodes
text = body_nodes[0].nodeValue if body_nodes else ''

### Preprocessing

In [None]:
# Normalisation: lowercase the text, then drop digits and punctuation.
# More preprocessing ideas: https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908
text = re.sub(r'\d+', '', text.lower())
text = text.translate(str.maketrans('','',string.punctuation))

# Tokenization, reporting the token count before and after stop-word removal
tokens = word_tokenize(text)
print("{} tokens BEFORE stop words removing".format(len(tokens)))
tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
print("{} tokens AFTER stop words removing".format(len(tokens)))

#print(tokens)

# Optional stemming step, currently disabled:
#stemmer= PorterStemmer()
#tokens = set(map(stemmer.stem,tokens))
#print (tokens)

In [21]:
def most_common_terms(terms):
    """Print every word found in ``terms`` with its number of occurrences.

    Each entry of ``terms`` is split on single spaces; the resulting
    ``(word, count)`` tuples are printed one per line, most frequent first.
    """
    tally = Counter()
    for entry in terms:
        tally.update(entry.split(" "))
    for ranked_pair in tally.most_common():
        print(ranked_pair)
        
# Print the term frequencies of the tokens left after stop-word removal.
most_common_terms(tokens)

('dlrs', 14)
('new', 9)
('york', 8)
('sales', 7)
('times', 7)
('cocoa', 6)
('comissaria', 5)
('smith', 5)
('said', 5)
('bags', 5)
('mln', 5)
('crop', 5)
('bahia', 4)
('february', 3)
('total', 3)
('ports', 3)
('junejuly', 3)
('augsept', 3)
('marchapril', 3)
('octdec', 3)
('dec', 3)
('week', 2)
('temporao', 2)
('period', 2)
('year', 2)
('arrivals', 2)
('kilos', 2)
('consignment', 2)
('figures', 2)
('farmers', 2)
('shippers', 2)
('sold', 2)
('bean', 2)
('shipment', 2)
('limited', 2)
('tonne', 2)
('open', 2)
('july', 2)
('butter', 2)
('sept', 2)
('currency', 2)
('areas', 2)
('uruguay', 2)
('showers', 1)
('continued', 1)
('zone', 1)
('alleviating', 1)
('drought', 1)
('early', 1)
('january', 1)
('improving', 1)
('prospects', 1)
('coming', 1)
('normal', 1)
('humidity', 1)
('levels', 1)
('restored', 1)
('weekly', 1)
('review', 1)
('dry', 1)
('means', 1)
('late', 1)
('ended', 1)
('making', 1)
('cumulative', 1)
('season', 1)
('stage', 1)
('delivered', 1)
('earlier', 1)
('included', 1)
('doubt', 