# TP - Work with text data
---

## Imports

In [51]:
import pandas as pd

## Load data

In [52]:
data = pd.read_csv("authors_description.csv", header=None, names=['description'])

In [53]:
data.head()

Unnamed: 0,description
0,Marc Levy (born 16 October 1961) is a French n...
1,Emmanuel Carrère (born 9 December 1957) is a F...
2,Franck Thilliez (born 15 October 1973 in Annec...
3,"Kenneth Martin Follett, (born 5 June 1949) is..."
4,"Stephen Edwin King (born September 21, 1947) i..."


## Feature extraction

### Number of words

In [54]:
data['wordCount'] = data['description'].apply(lambda s: len(s.split(' '))) # TODO
data

Unnamed: 0,description,wordCount
0,Marc Levy (born 16 October 1961) is a French n...,10
1,Emmanuel Carrère (born 9 December 1957) is a F...,14
2,Franck Thilliez (born 15 October 1973 in Annec...,34
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86
4,"Stephen Edwin King (born September 21, 1947) i...",190
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110
6,Joël Dicker (born 1985) is a Swiss novelist.,8
7,"Baroness Fabienne-Claire Nothomb, better known...",127
8,"Harlan Coben (born January 4, 1962) is an Amer...",65
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425


In [55]:
# find information within the cell
data.iloc[0]['description']

'Marc Levy (born 16 October 1961) is a French novelist.'

### Number of  characters

In [56]:
data['characterCount'] = data['description'].str.len()
data

Unnamed: 0,description,wordCount,characterCount
0,Marc Levy (born 16 October 1961) is a French n...,10,54
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660


### Number of numerics

**Tips:** 
- Use `isdigit` function

In [57]:
#data['numericCount'] = data['description'].apply(lambda s: len([char in s if char.isdigit()]))

data['numericCount'] = data['description'].str.count(r'\d')
data

Unnamed: 0,description,wordCount,characterCount,numericCount
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77


### Number of upper case characters

In [58]:
#data['numericCount'] = data['description'].apply(lambda s: len([char for char in s if char.isupper()]))

data['isUppercase'] = data['description'].str.findall(r'[A-Z]').str.len()
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111


## Text pre-processing

### Lowercase

**Tips:** 
- Use `description` column
- Use Pandas Series functions

In [59]:
data['lowercase'] = data['description'].str.lower()
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is..."
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i..."
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known..."
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer..."
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo..."


### Replace Line Return (\n) by whitespace

**Tips :** Use `lowercase` column

In [60]:
data['withoutLineReturn'] = data['lowercase'].str.replace('\n', ' ')
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase,withoutLineReturn
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...,marc levy (born 16 october 1961) is a french n...
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère (born 9 december 1957) is a f...
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...,franck thilliez (born 15 october 1973 in annec...
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is...","kenneth martin follett, (born 5 june 1949) is..."
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i...","stephen edwin king (born september 21, 1947) i..."
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre (born 19 april 1951 in paris) ...
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.,joël dicker (born 1985) is a swiss novelist.
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known...","baroness fabienne-claire nothomb, better known..."
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer...","harlan coben (born january 4, 1962) is an amer..."
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo...","dame agatha mary clarissa christie, lady mallo..."


### Removing Punctuation & Special Characters

**Tips :** 
- Use withoutLineReturn column
- Use `regex` to keep words. If you need to test regex, use http://regex101.com

In [61]:
data['withoutPunctuaction'] = data['withoutLineReturn'].str.replace('\W',' ')
data

  data['withoutPunctuaction'] = data['withoutLineReturn'].str.replace('\W',' ')


Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase,withoutLineReturn,withoutPunctuaction
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...,marc levy (born 16 october 1961) is a french n...,marc levy born 16 october 1961 is a french n...
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère born 9 december 1957 is a f...
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...,franck thilliez (born 15 october 1973 in annec...,franck thilliez born 15 october 1973 in annec...
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is...","kenneth martin follett, (born 5 june 1949) is...",kenneth martin follett born 5 june 1949 is...
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i...","stephen edwin king (born september 21, 1947) i...",stephen edwin king born september 21 1947 i...
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre born 19 april 1951 in paris ...
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.,joël dicker (born 1985) is a swiss novelist.,joël dicker born 1985 is a swiss novelist
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known...","baroness fabienne-claire nothomb, better known...",baroness fabienne claire nothomb better known...
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer...","harlan coben (born january 4, 1962) is an amer...",harlan coben born january 4 1962 is an amer...
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo...","dame agatha mary clarissa christie, lady mallo...",dame agatha mary clarissa christie lady mallo...


In [62]:
data.iloc[0]['withoutLineReturn']

'marc levy (born 16 october 1961) is a french novelist.'

### Remove stopwords

In [63]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\52333\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [64]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [65]:
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

**Tips :** Use `withoutPunctuaction` column

In [66]:
data['withoutStopwords'] = data['withoutPunctuaction'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop]))
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase,withoutLineReturn,withoutPunctuaction,withoutStopwords
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...,marc levy (born 16 october 1961) is a french n...,marc levy born 16 october 1961 is a french n...,marc levy born 16 october 1961 french novelist
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère born 9 december 1957 is a f...,emmanuel carrère born 9 december 1957 french a...
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...,franck thilliez (born 15 october 1973 in annec...,franck thilliez born 15 october 1973 in annec...,franck thilliez born 15 october 1973 annecy fr...
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is...","kenneth martin follett, (born 5 june 1949) is...",kenneth martin follett born 5 june 1949 is...,kenneth martin follett born 5 june 1949 welsh ...
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i...","stephen edwin king (born september 21, 1947) i...",stephen edwin king born september 21 1947 i...,stephen edwin king born september 21 1947 amer...
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre born 19 april 1951 in paris ...,pierre lemaitre born 19 april 1951 paris prix ...
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.,joël dicker (born 1985) is a swiss novelist.,joël dicker born 1985 is a swiss novelist,joël dicker born 1985 swiss novelist
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known...","baroness fabienne-claire nothomb, better known...",baroness fabienne claire nothomb better known...,baroness fabienne claire nothomb better known ...
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer...","harlan coben (born january 4, 1962) is an amer...",harlan coben born january 4 1962 is an amer...,harlan coben born january 4 1962 american writ...
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo...","dame agatha mary clarissa christie, lady mallo...",dame agatha mary clarissa christie lady mallo...,dame agatha mary clarissa christie lady mallow...


### Tokenization

In [67]:
from nltk import tokenize
tokenizer = tokenize.SpaceTokenizer()

**Tips :** Use `withoutStopwords` column

In [68]:
data['tokenized'] = data['withoutStopwords'].apply(lambda x: tokenizer.tokenize(x))
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase,withoutLineReturn,withoutPunctuaction,withoutStopwords,tokenized
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...,marc levy (born 16 october 1961) is a french n...,marc levy born 16 october 1961 is a french n...,marc levy born 16 october 1961 french novelist,"[marc, levy, born, 16, october, 1961, french, ..."
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère born 9 december 1957 is a f...,emmanuel carrère born 9 december 1957 french a...,"[emmanuel, carrère, born, 9, december, 1957, f..."
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...,franck thilliez (born 15 october 1973 in annec...,franck thilliez born 15 october 1973 in annec...,franck thilliez born 15 october 1973 annecy fr...,"[franck, thilliez, born, 15, october, 1973, an..."
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is...","kenneth martin follett, (born 5 june 1949) is...",kenneth martin follett born 5 june 1949 is...,kenneth martin follett born 5 june 1949 welsh ...,"[kenneth, martin, follett, born, 5, june, 1949..."
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i...","stephen edwin king (born september 21, 1947) i...",stephen edwin king born september 21 1947 i...,stephen edwin king born september 21 1947 amer...,"[stephen, edwin, king, born, september, 21, 19..."
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre born 19 april 1951 in paris ...,pierre lemaitre born 19 april 1951 paris prix ...,"[pierre, lemaitre, born, 19, april, 1951, pari..."
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.,joël dicker (born 1985) is a swiss novelist.,joël dicker born 1985 is a swiss novelist,joël dicker born 1985 swiss novelist,"[joël, dicker, born, 1985, swiss, novelist]"
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known...","baroness fabienne-claire nothomb, better known...",baroness fabienne claire nothomb better known...,baroness fabienne claire nothomb better known ...,"[baroness, fabienne, claire, nothomb, better, ..."
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer...","harlan coben (born january 4, 1962) is an amer...",harlan coben born january 4 1962 is an amer...,harlan coben born january 4 1962 american writ...,"[harlan, coben, born, january, 4, 1962, americ..."
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo...","dame agatha mary clarissa christie, lady mallo...",dame agatha mary clarissa christie lady mallo...,dame agatha mary clarissa christie lady mallow...,"[dame, agatha, mary, clarissa, christie, lady,..."


### Stemming

In [69]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

**Tips :** 
- Use `withoutStopwords` column
- If you want to transform a list of string into a string, you can use `' '.join([])`

In [70]:
data['stemming'] = data['withoutStopwords'].apply(lambda x: [stemmer.stem(word) for word in x.split(' ')])
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase,withoutLineReturn,withoutPunctuaction,withoutStopwords,tokenized,stemming
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...,marc levy (born 16 october 1961) is a french n...,marc levy born 16 october 1961 is a french n...,marc levy born 16 october 1961 french novelist,"[marc, levy, born, 16, october, 1961, french, ...","[marc, levi, born, 16, octob, 1961, french, no..."
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère born 9 december 1957 is a f...,emmanuel carrère born 9 december 1957 french a...,"[emmanuel, carrère, born, 9, december, 1957, f...","[emmanuel, carrèr, born, 9, decemb, 1957, fren..."
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...,franck thilliez (born 15 october 1973 in annec...,franck thilliez born 15 october 1973 in annec...,franck thilliez born 15 october 1973 annecy fr...,"[franck, thilliez, born, 15, october, 1973, an...","[franck, thilliez, born, 15, octob, 1973, anne..."
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is...","kenneth martin follett, (born 5 june 1949) is...",kenneth martin follett born 5 june 1949 is...,kenneth martin follett born 5 june 1949 welsh ...,"[kenneth, martin, follett, born, 5, june, 1949...","[kenneth, martin, follett, born, 5, june, 1949..."
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i...","stephen edwin king (born september 21, 1947) i...",stephen edwin king born september 21 1947 i...,stephen edwin king born september 21 1947 amer...,"[stephen, edwin, king, born, september, 21, 19...","[stephen, edwin, king, born, septemb, 21, 1947..."
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre born 19 april 1951 in paris ...,pierre lemaitre born 19 april 1951 paris prix ...,"[pierre, lemaitre, born, 19, april, 1951, pari...","[pierr, lemaitr, born, 19, april, 1951, pari, ..."
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.,joël dicker (born 1985) is a swiss novelist.,joël dicker born 1985 is a swiss novelist,joël dicker born 1985 swiss novelist,"[joël, dicker, born, 1985, swiss, novelist]","[joël, dicker, born, 1985, swiss, novelist]"
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known...","baroness fabienne-claire nothomb, better known...",baroness fabienne claire nothomb better known...,baroness fabienne claire nothomb better known ...,"[baroness, fabienne, claire, nothomb, better, ...","[baro, fabienn, clair, nothomb, better, known,..."
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer...","harlan coben (born january 4, 1962) is an amer...",harlan coben born january 4 1962 is an amer...,harlan coben born january 4 1962 american writ...,"[harlan, coben, born, january, 4, 1962, americ...","[harlan, coben, born, januari, 4, 1962, americ..."
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo...","dame agatha mary clarissa christie, lady mallo...",dame agatha mary clarissa christie lady mallo...,dame agatha mary clarissa christie lady mallow...,"[dame, agatha, mary, clarissa, christie, lady,...","[dame, agatha, mari, clarissa, christi, ladi, ..."


In [71]:
print(data['stemming'].iloc[0])

['marc', 'levi', 'born', '16', 'octob', '1961', 'french', 'novelist']


## Advanced text processing

### N-grams

In [72]:
from nltk.util import ngrams

**Tips :** Use `tokenized` column

In [73]:
data['n-grams'] = data['tokenized'].apply(lambda x: list(ngrams(x,2)))
data

Unnamed: 0,description,wordCount,characterCount,numericCount,isUppercase,lowercase,withoutLineReturn,withoutPunctuaction,withoutStopwords,tokenized,stemming,n-grams
0,Marc Levy (born 16 October 1961) is a French n...,10,54,6,4,marc levy (born 16 october 1961) is a french n...,marc levy (born 16 october 1961) is a french n...,marc levy born 16 october 1961 is a french n...,marc levy born 16 october 1961 french novelist,"[marc, levy, born, 16, october, 1961, french, ...","[marc, levi, born, 16, octob, 1961, french, no...","[(marc, levy), (levy, born), (born, 16), (16, ..."
1,Emmanuel Carrère (born 9 December 1957) is a F...,14,91,5,4,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère (born 9 december 1957) is a f...,emmanuel carrère born 9 december 1957 is a f...,emmanuel carrère born 9 december 1957 french a...,"[emmanuel, carrère, born, 9, december, 1957, f...","[emmanuel, carrèr, born, 9, decemb, 1957, fren...","[(emmanuel, carrère), (carrère, born), (born, ..."
2,Franck Thilliez (born 15 October 1973 in Annec...,34,191,6,9,franck thilliez (born 15 october 1973 in annec...,franck thilliez (born 15 october 1973 in annec...,franck thilliez born 15 october 1973 in annec...,franck thilliez born 15 october 1973 annecy fr...,"[franck, thilliez, born, 15, october, 1973, an...","[franck, thilliez, born, 15, octob, 1973, anne...","[(franck, thilliez), (thilliez, born), (born, ..."
3,"Kenneth Martin Follett, (born 5 June 1949) is...",86,476,9,33,"kenneth martin follett, (born 5 june 1949) is...","kenneth martin follett, (born 5 june 1949) is...",kenneth martin follett born 5 june 1949 is...,kenneth martin follett born 5 june 1949 welsh ...,"[kenneth, martin, follett, born, 5, june, 1949...","[kenneth, martin, follett, born, 5, june, 1949...","[(kenneth, martin), (martin, follett), (follet..."
4,"Stephen Edwin King (born September 21, 1947) i...",190,1193,30,54,"stephen edwin king (born september 21, 1947) i...","stephen edwin king (born september 21, 1947) i...",stephen edwin king born september 21 1947 i...,stephen edwin king born september 21 1947 amer...,"[stephen, edwin, king, born, september, 21, 19...","[stephen, edwin, king, born, septemb, 21, 1947...","[(stephen, edwin), (edwin, king), (king, born)..."
5,Pierre Lemaitre (born 19 April 1951 in Paris) ...,110,688,22,42,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre (born 19 april 1951 in paris) ...,pierre lemaitre born 19 april 1951 in paris ...,pierre lemaitre born 19 april 1951 paris prix ...,"[pierre, lemaitre, born, 19, april, 1951, pari...","[pierr, lemaitr, born, 19, april, 1951, pari, ...","[(pierre, lemaitre), (lemaitre, born), (born, ..."
6,Joël Dicker (born 1985) is a Swiss novelist.,8,44,4,3,joël dicker (born 1985) is a swiss novelist.,joël dicker (born 1985) is a swiss novelist.,joël dicker born 1985 is a swiss novelist,joël dicker born 1985 swiss novelist,"[joël, dicker, born, 1985, swiss, novelist]","[joël, dicker, born, 1985, swiss, novelist]","[(joël, dicker), (dicker, born), (born, 1985),..."
7,"Baroness Fabienne-Claire Nothomb, better known...",127,741,12,35,"baroness fabienne-claire nothomb, better known...","baroness fabienne-claire nothomb, better known...",baroness fabienne claire nothomb better known...,baroness fabienne claire nothomb better known ...,"[baroness, fabienne, claire, nothomb, better, ...","[baro, fabienn, clair, nothomb, better, known,...","[(baroness, fabienne), (fabienne, claire), (cl..."
8,"Harlan Coben (born January 4, 1962) is an Amer...",65,394,5,10,"harlan coben (born january 4, 1962) is an amer...","harlan coben (born january 4, 1962) is an amer...",harlan coben born january 4 1962 is an amer...,harlan coben born january 4 1962 american writ...,"[harlan, coben, born, january, 4, 1962, americ...","[harlan, coben, born, januari, 4, 1962, americ...","[(harlan, coben), (coben, born), (born, januar..."
9,"Dame Agatha Mary Clarissa Christie, Lady Mallo...",425,2660,77,111,"dame agatha mary clarissa christie, lady mallo...","dame agatha mary clarissa christie, lady mallo...",dame agatha mary clarissa christie lady mallo...,dame agatha mary clarissa christie lady mallow...,"[dame, agatha, mary, clarissa, christie, lady,...","[dame, agatha, mari, clarissa, christi, ladi, ...","[(dame, agatha), (agatha, mary), (mary, claris..."


### Term Frequency (TF)

In [74]:
# Let's take Stephen King's description as an example
tokenized_description = data.loc[4, 'tokenized']

In [75]:
tokenized_description

['stephen',
 'edwin',
 'king',
 'born',
 'september',
 '21',
 '1947',
 'american',
 'author',
 'horror',
 'supernatural',
 'fiction',
 'suspense',
 'crime',
 'science',
 'fiction',
 'fantasy',
 'novels',
 'books',
 'sold',
 '350',
 'million',
 'copies',
 'many',
 'adapted',
 'films',
 'television',
 'series',
 'miniseries',
 'comic',
 'books',
 'king',
 'published',
 '61',
 'novels',
 'including',
 'seven',
 'pen',
 'name',
 'richard',
 'bachman',
 'five',
 'non',
 'fiction',
 'books',
 'also',
 'written',
 'approximately',
 '200',
 'short',
 'stories',
 'published',
 'book',
 'collections',
 'king',
 'received',
 'bram',
 'stoker',
 'awards',
 'world',
 'fantasy',
 'awards',
 'british',
 'fantasy',
 'society',
 'awards',
 '2003',
 'national',
 'book',
 'foundation',
 'awarded',
 'medal',
 'distinguished',
 'contribution',
 'american',
 'letters',
 'also',
 'received',
 'awards',
 'contribution',
 'literature',
 'entire',
 'bibliography',
 '2004',
 'world',
 'fantasy',
 'award',
 'life

**Tips :** 
- Build a dictionnay `{word:count, word1:count1, ...}` 

- **/!\\** take care to not override already existing keys

In [76]:
words = {}
for word in tokenized_description:
    if word not in words.keys():
        words[word] = 1
    else:
        words[word] += 1

In [77]:
document_frequency_df = pd.DataFrame.from_dict(
    words, orient='index', columns=['wordCount']).reset_index()
document_frequency_df.columns = ['word', 'wordCount']
document_frequency_df

Unnamed: 0,word,wordCount
0,stephen,1
1,edwin,1
2,king,4
3,born,1
4,september,1
...,...,...
83,reference,1
84,high,1
85,standing,1
86,pop,1


**Tips :** use `document_frequency_df`

**Reminder :** Term Frequency is `wordCount` divided by total number of words

In [78]:
document_frequency_df['df'] = document_frequency_df['wordCount'] / document_frequency_df['wordCount'].sum()
document_frequency_df

Unnamed: 0,word,wordCount,df
0,stephen,1,0.008547
1,edwin,1,0.008547
2,king,4,0.034188
3,born,1,0.008547
4,september,1,0.008547
...,...,...,...
83,reference,1,0.008547
84,high,1,0.008547
85,standing,1,0.008547
86,pop,1,0.008547


### Inverse Document Frequency (IDF)

**Reminder :** IDF (**I**nverse **D**ocument **F**requency) is the logarithm of 1 divided by number of row in `data` that contains the word you want to calculate IDF score.

In [83]:
data['withoutStopwords']

0        marc levy born 16 october 1961 french novelist
1     emmanuel carrère born 9 december 1957 french a...
2     franck thilliez born 15 october 1973 annecy fr...
3     kenneth martin follett born 5 june 1949 welsh ...
4     stephen edwin king born september 21 1947 amer...
5     pierre lemaitre born 19 april 1951 paris prix ...
6                  joël dicker born 1985 swiss novelist
7     baroness fabienne claire nothomb better known ...
8     harlan coben born january 4 1962 american writ...
9     dame agatha mary clarissa christie lady mallow...
10    david foenkinos born 28 october 1974 french au...
11    hervé le tellier born 21 april 1957 french wri...
12    arch colson chipp whitehead born november 6 19...
13    michel bussi born 1965 04 29 29 april 1965 lou...
14    guillaume musso french pronunciation ɡijom mys...
15    brit bennett american writer based los angeles...
16    bernard werber born 1961 toulouse french scien...
17     laurent mauvignier born 1967 tours french

In [92]:
len(list(filter(lambda x: x==True, data['withoutStopwords'].str.contains('king').tolist())))

3

In [94]:
import math
document_frequency_df['idf'] = document_frequency_df['word'].apply(lambda x: math.log(1 / len(list(filter(lambda x: x==True, data['withoutStopwords'].str.contains(x).tolist())))))
document_frequency_df
                                                                   

Unnamed: 0,word,wordCount,df,idf
0,stephen,1,0.008547,0.000000
1,edwin,1,0.008547,0.000000
2,king,4,0.034188,-1.098612
3,born,1,0.008547,-3.044522
4,september,1,0.008547,-1.098612
...,...,...,...,...
83,reference,1,0.008547,0.000000
84,high,1,0.008547,-1.386294
85,standing,1,0.008547,0.000000
86,pop,1,0.008547,-0.693147


### TF-IDF

**Reminder :** TF-IDF formula is ```tf * idf```

In [96]:
document_frequency_df['TF-IDF']=document_frequency_df['idf']*document_frequency_df['df']
document_frequency_df

Unnamed: 0,word,wordCount,df,idf,TF-IDF
0,stephen,1,0.008547,0.000000,0.000000
1,edwin,1,0.008547,0.000000,0.000000
2,king,4,0.034188,-1.098612,-0.037559
3,born,1,0.008547,-3.044522,-0.026022
4,september,1,0.008547,-1.098612,-0.009390
...,...,...,...,...,...
83,reference,1,0.008547,0.000000,0.000000
84,high,1,0.008547,-1.386294,-0.011849
85,standing,1,0.008547,0.000000,0.000000
86,pop,1,0.008547,-0.693147,-0.005924


### Bag of words

In [97]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [98]:
descriptions = data['withoutStopwords'].values.tolist()

**Tips :** Use cv method to fit and transform cv on description to get countVectorizer result

In [101]:
#CountVectorizer(descriptions)
X = cv.fit_transform(descriptions)

In [102]:
X

<26x854 sparse matrix of type '<class 'numpy.int64'>'
	with 1224 stored elements in Compressed Sparse Row format>